In [None]:
# google.colab is only importable inside a Colab runtime; this cell is used to
# bring the dataset file into the runtime before it is read with pandas.
from google.colab import files
# Keep whatever files.upload() returns for the uploaded file(s).
uploaded = files.upload()


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Define function to perform clustering and plot results
def perform_clustering(csv_file_path, num_clusters=3):
    """Cluster the rows of a CSV file with KMeans and plot the first two features.

    Only numeric (and boolean) columns are imputed, scaled and clustered, so a
    text column no longer makes the mean-imputation or the scaler fail. Text
    columns are kept untouched in the returned frame.

    Args:
        csv_file_path: Path of the CSV file, passed to ``pd.read_csv``.
        num_clusters: Number of KMeans clusters to fit.

    Returns:
        The loaded DataFrame (all original columns) with an extra ``'Cluster'``
        column holding the KMeans label of each row.

    Raises:
        ValueError: If the file contains no numeric column to cluster on.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)

    # Display the first few rows of the dataframe
    print("First 5 rows of the dataset:\n", df.head())

    # Means and standardization are only defined for numeric data.
    feature_cols = df.select_dtypes(include=['number', 'bool']).columns
    if len(feature_cols) == 0:
        raise ValueError("The dataset has no numeric columns to cluster on.")

    # Fill missing numeric values with the column mean (non-inplace, so no
    # chained-assignment surprises).
    if df[feature_cols].isnull().sum().sum() > 0:
        print("The dataset contains missing values. Filling missing values with the mean.")
        df[feature_cols] = df[feature_cols].fillna(df[feature_cols].mean())

    # Standardize the data (since clustering can be sensitive to different feature scales)
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df[feature_cols])

    # n_init is pinned because its default differs between scikit-learn
    # releases; together with random_state this keeps the labels reproducible.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    kmeans.fit(scaled_data)

    # Add cluster labels to the original dataset
    df['Cluster'] = kmeans.labels_

    # Plot the first two standardized features; with a single feature the
    # points are drawn on a flat line instead of raising an IndexError.
    x_values = scaled_data[:, 0]
    if scaled_data.shape[1] > 1:
        y_values = scaled_data[:, 1]
        y_label = 'Feature 2 (Standardized)'
    else:
        y_values = [0.0] * len(x_values)
        y_label = '(no second feature)'

    points = plt.scatter(x_values, y_values, c=kmeans.labels_, cmap='viridis')
    plt.title('Clustering results')
    plt.xlabel('Feature 1 (Standardized)')
    plt.ylabel(y_label)
    plt.colorbar(points, label='Cluster')
    plt.show()

    # Return the dataframe with the cluster labels
    return df

In [None]:
def perform_clustering(csv_file_path, num_clusters=3):
    """Cluster the numeric columns of a CSV file with KMeans and plot them.

    Args:
        csv_file_path: Path of the CSV file, passed to ``pd.read_csv``.
        num_clusters: Number of KMeans clusters to fit.

    Returns:
        A DataFrame holding only the numeric columns of the file (missing
        values replaced by the column mean) plus a ``'Cluster'`` column with
        the KMeans label of each row.

    Raises:
        ValueError: If the file contains no numeric column to cluster on.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)

    # Display the first few rows of the dataframe
    print("First 5 rows of the dataset:\n", df.head())

    # Work on an explicit copy of the numeric columns so the writes below
    # (imputation, 'Cluster' column) never target a slice of `df`.
    df_numeric = df.select_dtypes(include='number').copy()
    if df_numeric.shape[1] == 0:
        raise ValueError("The dataset has no numeric columns to cluster on.")

    # Check if there are any missing values in numeric data
    if df_numeric.isnull().sum().sum() > 0:
        print("The dataset contains missing values. Filling missing values with the mean.")
        df_numeric = df_numeric.fillna(df_numeric.mean())

    # Standardize the data (since clustering can be sensitive to different feature scales)
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df_numeric)

    # n_init is pinned because its default differs between scikit-learn
    # releases; together with random_state this keeps the labels reproducible.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    kmeans.fit(scaled_data)

    # Add cluster labels to the numeric frame
    df_numeric['Cluster'] = kmeans.labels_

    # Plot the first two standardized features; with a single feature the
    # points are drawn on a flat line instead of raising an IndexError.
    x_values = scaled_data[:, 0]
    if scaled_data.shape[1] > 1:
        y_values = scaled_data[:, 1]
        y_label = 'Feature 2 (Standardized)'
    else:
        y_values = [0.0] * len(x_values)
        y_label = '(no second feature)'

    points = plt.scatter(x_values, y_values, c=kmeans.labels_, cmap='viridis')
    plt.title('Clustering results')
    plt.xlabel('Feature 1 (Standardized)')
    plt.ylabel(y_label)
    plt.colorbar(points, label='Cluster')
    plt.show()

    # Return the dataframe with the cluster labels
    return df_numeric


In [None]:
pip install pandas scikit-learn matplotlib


In [None]:
# Cluster the dataset into 4 groups and keep the result for the export cell.
# NOTE(review): perform_clustering is defined several times in this notebook;
# the definition executed most recently is the one that runs here.
df_with_clusters = perform_clustering('clearnose6.csv', num_clusters=4)


# Exporting results, cluster evaluation, and price regression

In [None]:
# Save the DataFrame with cluster labels to a new CSV file.
# NOTE(review): this expects df_with_clusters to be a DataFrame, i.e. the
# result of a perform_clustering version that returns a single frame.
output_csv_file = 'clustered_output.csv'
# index=False keeps the row index out of the written file.
df_with_clusters.to_csv(output_csv_file, index=False)
print(f"The clustered data has been saved to {output_csv_file}.")


In [None]:
def perform_clustering(csv_file_path, num_clusters=3):
    """Clean the ``Price`` column, cluster the numeric columns and plot them.

    Args:
        csv_file_path: Path of the CSV file, passed to ``pd.read_csv``.
        num_clusters: Number of KMeans clusters to fit.

    Returns:
        A tuple ``(df_numeric, scaled_data)``: the numeric columns of the file
        (missing values replaced by the column mean) with an added
        ``'Cluster'`` column, and the standardized feature array that was
        passed to KMeans.

    Raises:
        ValueError: If the file contains no numeric column to cluster on.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)

    # Display the first few rows of the dataframe
    print("First 5 rows of the dataset:\n", df.head())

    # Clean the Price column. A column that is already numeric is left alone
    # (the .str accessor would fail on it); text prices are reduced to their
    # first number, keeping decimals.
    if 'Price' in df.columns and not pd.api.types.is_numeric_dtype(df['Price']):
        # assumes ',' is a thousands separator ("1,299" -> 1299) - TODO confirm
        price_text = df['Price'].astype(str).str.replace(',', '', regex=False)
        df['Price'] = price_text.str.extract(r'(\d+(?:\.\d+)?)', expand=False).astype(float)

    # Work on an explicit copy of the numeric columns so the writes below
    # never target a slice of `df`.
    df_numeric = df.select_dtypes(include='number').copy()
    if df_numeric.shape[1] == 0:
        raise ValueError("The dataset has no numeric columns to cluster on.")

    # Check if there are any missing values in numeric data
    if df_numeric.isnull().sum().sum() > 0:
        print("The dataset contains missing values. Filling missing values with the mean.")
        df_numeric = df_numeric.fillna(df_numeric.mean())

    # Standardize the data
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df_numeric)

    # n_init is pinned because its default differs between scikit-learn
    # releases; together with random_state this keeps the labels reproducible.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    kmeans.fit(scaled_data)

    # Add cluster labels to the numeric frame
    df_numeric['Cluster'] = kmeans.labels_

    # Plot the first two standardized features; with a single feature the
    # points are drawn on a flat line instead of raising an IndexError.
    x_values = scaled_data[:, 0]
    if scaled_data.shape[1] > 1:
        y_values = scaled_data[:, 1]
        y_label = 'Feature 2 (Standardized)'
    else:
        y_values = [0.0] * len(x_values)
        y_label = '(no second feature)'

    points = plt.scatter(x_values, y_values, c=kmeans.labels_, cmap='viridis')
    plt.title('Clustering results')
    plt.xlabel('Feature 1 (Standardized)')
    plt.ylabel(y_label)
    plt.colorbar(points, label='Cluster')
    plt.show()

    # Return the dataframe with the cluster labels and the scaled data
    return df_numeric, scaled_data


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # For 3D plotting

def perform_clustering(csv_file_path, num_clusters=3):
    """Clean ``Price``, cluster the numeric columns and show a 3D scatter plot.

    The scatter is drawn exactly once and its handle is reused for the
    colorbar. (Previously a second scatter was drawn just to feed the
    colorbar; it always used the second feature as z and therefore overlaid
    wrong points whenever three or more features were available.)

    Args:
        csv_file_path: Path of the CSV file, passed to ``pd.read_csv``.
        num_clusters: Number of KMeans clusters to fit.

    Returns:
        A tuple ``(df_numeric, scaled_data)``: the numeric columns of the file
        (missing values replaced by the column mean) with an added
        ``'Cluster'`` column, and the standardized feature array that was
        passed to KMeans.

    Raises:
        ValueError: If the file contains no numeric column to cluster on.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)

    # Display the first few rows of the dataset
    print("First 5 rows of the dataset:\n", df.head())

    # Clean the Price column. A column that is already numeric is left alone
    # (round-tripping it through text and an integer-only pattern would drop
    # its decimals); text prices are reduced to their first number.
    if 'Price' in df.columns and not pd.api.types.is_numeric_dtype(df['Price']):
        # assumes ',' is a thousands separator ("1,299" -> 1299) - TODO confirm
        price_text = df['Price'].astype(str).str.replace(',', '', regex=False)
        df['Price'] = price_text.str.extract(r'(\d+(?:\.\d+)?)', expand=False).astype(float)

    # Work on an explicit copy of the numeric columns so the writes below
    # never target a slice of `df`.
    df_numeric = df.select_dtypes(include='number').copy()
    if df_numeric.shape[1] == 0:
        raise ValueError("The dataset has no numeric columns to cluster on.")

    # Check if there are any missing values in numeric data
    if df_numeric.isnull().sum().sum() > 0:
        print("The dataset contains missing values. Filling missing values with the mean.")
        df_numeric = df_numeric.fillna(df_numeric.mean())

    # Standardize the data
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df_numeric)

    # n_init is pinned because its default differs between scikit-learn
    # releases; together with random_state this keeps the labels reproducible.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    kmeans.fit(scaled_data)

    # Add cluster labels to the numeric frame
    df_numeric['Cluster'] = kmeans.labels_

    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')

    # Axis i shows feature i when it exists; otherwise the last available
    # feature is repeated so a 3D plot can still be drawn.
    n_features = scaled_data.shape[1]
    x_col, y_col, z_col = (min(axis, n_features - 1) for axis in range(3))

    points = ax.scatter(scaled_data[:, x_col], scaled_data[:, y_col], scaled_data[:, z_col],
                        c=kmeans.labels_, cmap='viridis', s=50)

    ax.set_title('3D Clustering results')
    ax.set_xlabel('Feature 1 (Standardized)')
    if n_features >= 2:
        ax.set_ylabel('Feature 2 (Standardized)')
    else:
        ax.set_ylabel('Feature 1 (Duplicated)')
    if n_features >= 3:
        ax.set_zlabel('Feature 3 (Standardized)')
    else:
        ax.set_zlabel(f'Feature {n_features} (Duplicated)')

    # Reuse the scatter handle instead of plotting a second time.
    plt.colorbar(points)
    plt.show()

    # Return the dataframe with the cluster labels and the scaled data
    return df_numeric, scaled_data

In [None]:
# Re-run the clustering with 6 groups; this call unpacks two return values, so
# it needs a perform_clustering version that returns (frame, scaled array).
df_with_clusters, scaled_data = perform_clustering('clearnose6.csv', num_clusters=6)

In [None]:
from sklearn.metrics import silhouette_score

# Derive the cluster count from the labels that were actually assigned, so the
# printed message always matches the clustering being scored (a hard-coded
# value goes stale as soon as the clustering is re-run with another k).
num_clusters = df_with_clusters['Cluster'].nunique()

# Calculate silhouette score on the same standardized data KMeans was fitted on
sil_score = silhouette_score(scaled_data, df_with_clusters['Cluster'])
print(f'Silhouette Score for {num_clusters} clusters: {sil_score:.2f}')


In [None]:
import pandas as pd

# Load the data from the CSV file
df = pd.read_csv('clearnose6.csv')


In [None]:
import pandas as pd

In [None]:
# Read the raw dataset again; the cells below build the regression data from `df`.
df = pd.read_csv('clearnose6.csv')
# Bare last expression: the notebook renders the frame as the cell output.
df

In [None]:
import matplotlib

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
import pandas as pd
from sklearn.model_selection import train_test_split

# NOTE: nothing is loaded in this cell; `df` must already exist from an earlier cell.


# Import label encoder
from sklearn import preprocessing

# LabelEncoder turns each distinct label value into an integer code.
label_encoder = preprocessing.LabelEncoder()

# Encode the 'Product_names' column; the original strings in `df` are
# overwritten by their integer codes.
df['Product_names']= label_encoder.fit_transform(df['Product_names'])


In [None]:
# NOTE(review): dropna() returns a new frame; here the result is only displayed
# and `df` itself is left unchanged. Assign the result if rows with missing
# values are really meant to be removed.
df.dropna()

In [None]:
# Rows at positions 12 and 13 of df (the end bound of iloc is exclusive).
df2 = df.iloc[12:14]
df2

In [None]:
# Rows at positions 0 through 10 of df.
df1=df.iloc[0:11]

In [None]:
# Stack the two slices into the modelling frame: positions 0-10 and 12-13 of
# df, i.e. position 11 (and anything from position 14 on) is left out.
df3 = pd.concat([df1,df2])

In [None]:
# Features: every column that remains once the target and the product name are removed.
X = df3.drop(['Price', 'Product_names'], axis=1)
# Target: the Price column.
Y = df3['Price']

In [None]:
# Display the feature matrix to check which columns go into the model.
X

In [None]:


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42, test_size=0.2)

# Fit a linear regression on the training portion
# (fit() returns the estimator itself, so it can be chained).
regr = linear_model.LinearRegression().fit(X_train, y_train)

# plt.plot(X_test, regr.predict(X_test), color='red',linewidth=3)
# plt.show()

In [None]:
# Display the held-out feature rows used for evaluation.
X_test

In [None]:
from sklearn import metrics

# Predict the target for the held-out rows.
y_pred = regr.predict(X_test)

# Coefficient of determination of the predictions against the true values.
r2_score = metrics.r2_score(y_true=y_test, y_pred=y_pred)
print(f'R-squared score: {r2_score}')

# Average squared difference between the true and the predicted values.
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')


In [None]:
import pickle

# Destination file for the fitted regression model (`regr`).
filename = 'model.pkl'

# Serialize the model and write the bytes to disk (pickle needs binary mode).
# Only load this file back from a trusted source: unpickling can run arbitrary code.
with open(filename, mode='wb') as file:
    file.write(pickle.dumps(regr))