In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the phase-1 results. A forward-slash relative path is used because it
# resolves on Windows as well as on POSIX systems, unlike a backslash separator.
data = pd.read_csv("data/phase1_results.csv")

In [3]:
# Keep only the columns used as clustering features. The .copy() gives
# train_data its own storage, independent of `data`.
train_data = data.loc[:, [
    'Category',
    'Price_Nos',
    'Ratings_Nos',
    'Upd_Brand_Names',
    'Glb_Ratings_Rev',
]].copy()

In [None]:
# Build a single preprocessing step that handles every feature column:
#   * text columns    -> one-hot encoded
#   * numeric columns -> median-imputed, then standardised
categorical_features = ['Category', 'Upd_Brand_Names']
numeric_features = ['Price_Nos', 'Ratings_Nos', 'Glb_Ratings_Rev']

# drop='first' removes one indicator column per categorical feature;
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
# NOTE(review): this combination is only accepted by scikit-learn >= 1.0 —
# confirm the environment's version.
onehot_encoder = OneHotEncoder(handle_unknown='ignore', drop='first')

# Missing numeric values are filled with the column median before scaling.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Route each group of columns to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', onehot_encoder, categorical_features),
        ('num', numeric_transformer, numeric_features)])

In [None]:
# Chain the shared preprocessing step with a 3-cluster KMeans model.
# random_state is fixed so the clustering is reproducible between runs.
kmeans_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('kmeans', KMeans(random_state=42, n_clusters=3)),  # Adjust n_clusters as needed
])

In [None]:
# Fit the whole pipeline on the feature frame: the preprocessing step is fitted
# and applied first, then KMeans is fitted on the transformed features.
kmeans_pipeline.fit(train_data)

In [None]:
# Cluster index assigned to each row of train_data by the fitted KMeans step
cluster_labels = kmeans_pipeline['kmeans'].labels_

In [None]:
# Evaluate the clustering with the mean silhouette score. The features come
# from the pipeline's own fitted preprocessing step, so they are the same
# representation KMeans was trained on.
silhouette_features = kmeans_pipeline.named_steps['preprocessor'].transform(train_data)
silhouette_avg = silhouette_score(silhouette_features, cluster_labels)
print(f"Silhouette Score: {silhouette_avg}")

In [None]:
# I would like to create a function that carries out the above operations and run it multiple times with different values of n and create a plot of silhouette scores against n values
def evaluate_kmeans_clustering(data, n_clusters_list):
    """Fit KMeans for several cluster counts and score each with the silhouette coefficient.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns expected by the module-level ``preprocessor``.
    n_clusters_list : iterable of int
        Cluster counts to evaluate.

    Returns
    -------
    dict
        Maps each value from ``n_clusters_list`` to the mean silhouette score
        of the corresponding clustering.
    """
    # The preprocessing does not depend on the number of clusters, so it is
    # fitted once, on a private clone. This avoids refitting the shared
    # `preprocessor` object as a side effect and avoids transforming the data
    # twice on every iteration.
    features = clone(preprocessor).fit_transform(data)

    silhouette_scores = {}
    for n_clusters in n_clusters_list:
        # fit_predict returns the labels_ of the freshly fitted model.
        labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(features)
        silhouette_scores[n_clusters] = silhouette_score(features, labels)
    return silhouette_scores

In [None]:
# Candidate cluster counts to evaluate: 3 through 10 inclusive
n_values = [*range(3, 11)]

In [None]:
# Run the evaluation for every candidate cluster count; the result is a dict
# mapping each value in n_values to its silhouette score.
silhouette_results = evaluate_kmeans_clustering(train_data, n_values)

In [None]:
# Reshape the {n_clusters: score} mapping into a two-column dataframe for easy plotting
silhouette_df = pd.DataFrame({
    'n_clusters': list(silhouette_results.keys()),
    'silhouette_score': list(silhouette_results.values()),
})

In [None]:
# Final model: the same preprocessing step followed by KMeans with 4 clusters
fnl_kmeans_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('kmeans', KMeans(random_state=42, n_clusters=4)),  # Adjust n_clusters as needed
])
# Fit the pipeline and collect the cluster label of each data point in one call
fnl_cluster_labels = fnl_kmeans_pipeline.fit_predict(train_data)

In [None]:
# Attach the final cluster labels to the original dataframe as a new column.
# The labels are assigned by position: in the cells shown, train_data is a
# column subset of data with no rows dropped or reordered, so label i belongs
# to row i of data.
data['KMeans_Cluster_Labels'] = fnl_cluster_labels