In [None]:
import nltk
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans, MiniBatchKMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [None]:
# Environment switch read by the data-loading cell: False selects the local
# CSV path, True selects the Kaggle input path.
kaggle = False

In [None]:
# Resolve the dataset location for the current environment, then load and show it.
if kaggle:
    filepath = './kaggle/input/consumer-review-of-clothing-product/Consumer Review of Clothing Product/data_amazon.xlsx - Sheet1.csv'
else:
    filepath = 'data/reviews.csv'

df = pd.read_csv(filepath)

display(df)

In [None]:
# Missing-value count per column, displayed as a single wide row.
nulls = pd.DataFrame(
    df.isnull().sum(),
    columns = ['Nulls'],
)

display(nulls.T)

In [None]:
# Keep only rows that have both a title and a review text, and drop the
# per-aspect rating columns that the text clustering does not use.
clean_df = (
    df
    .dropna(subset = ['Title', 'Review'])
    .drop(columns = ['Materials', 'Construction', 'Color', 'Finishing', 'Durability'])
)

display(clean_df)

In [None]:
# Number of rows per clothing class in the raw frame, drawn as horizontal bars.
classes = df['Cloth_class'].value_counts()

sns.barplot(
    x = classes.values,
    y = classes.index,
    orient = 'h',
    palette = 'viridis',
)

plt.show()

In [None]:
def text_feature_extract(text_feature, model):
    """Fit ``model`` on ``text_feature`` and return the transformed features.

    Parameters
    ----------
    text_feature
        The documents handed to ``model.fit_transform`` (the notebook passes
        pandas Series of strings).
    model
        Any object exposing ``fit_transform``; it is (re)fitted in place, so
        whatever it learned from a previous call is replaced.

    Returns
    -------
    Whatever ``model.fit_transform(text_feature)`` returns.
    """
    return model.fit_transform(text_feature)

In [None]:
# English stop words from NLTK. On a fresh environment the corpus is not
# installed and the lookup raises LookupError, so fetch it once and retry
# instead of failing the whole Restart-and-Run-All.
try:
    stopwords = nltk.corpus.stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
    stopwords = nltk.corpus.stopwords.words("english")

In [None]:
# Bag-of-words vectorizer that skips the NLTK English stop words.
count_vect0 = CountVectorizer(stop_words = stopwords)

In [None]:
# Fit the count vectorizer on the review titles and report how many distinct
# terms it kept.
text_title = text_feature_extract(clean_df['Title'], count_vect0)

print(f"Number of words = {len(count_vect0.get_feature_names_out())}")

In [None]:
# Same vectorizer, now refit on the review bodies: text_feature_extract calls
# fit_transform, so the vocabulary size printed below is the one learned from
# the reviews, not from the titles.
text_review = text_feature_extract(clean_df['Review'], count_vect0)

print(f"Number of words = {len(count_vect0.get_feature_names_out())}")

In [None]:
# TF-IDF vectorizer with the same stop words. min_df / max_df are integers,
# which scikit-learn reads as absolute document counts: keep terms found in
# at least 4 and at most 5000 documents.
tfidf_vect0 = TfidfVectorizer(stop_words = stopwords, min_df = 4, max_df = 5000)

In [None]:
# TF-IDF features of the titles; this rebinds text_title, replacing the
# count-vectorizer matrix assigned to the same name earlier.
text_title = text_feature_extract(clean_df['Title'], tfidf_vect0)

print(f"Number of words = {len(tfidf_vect0.get_feature_names_out())}")

In [None]:
# TF-IDF features of the review bodies; rebinds text_review and refits
# tfidf_vect0, so the printed vocabulary size is the review vocabulary.
text_review = text_feature_extract(clean_df['Review'], tfidf_vect0)

print(f"Number of words = {len(tfidf_vect0.get_feature_names_out())}")

In [None]:
# TF-IDF features of title and review joined with a space. text_all is the
# matrix every clustering cell below is fitted on, and after this refit
# tfidf_vect0 holds the vocabulary of the combined text.
text_all = text_feature_extract(clean_df['Title'] + ' ' + clean_df['Review'], tfidf_vect0)

print(f"Number of words = {len(tfidf_vect0.get_feature_names_out())}")

In [None]:
# Elbow search: fit K-Means for every cluster count in `ranges`, recording the
# inertia of each fit and keeping the fitted model keyed by its cluster count.
inertias = list()
ranges = list(range(7, 13))

kmeans_models = dict()

for n in ranges:
    # Fixed seed (the same one the final kmeans12 model uses) so the elbow
    # curve is identical on every Restart-and-Run-All.
    kmeans = KMeans(n_clusters = n, n_init = 124, random_state = 1)

    kmeans.fit(text_all)

    inertias.append(kmeans.inertia_)

    kmeans_models[n] = kmeans

In [None]:
# Elbow plot: inertia (y) against the number of clusters tried (x).
sns.lineplot(
    x = ranges,
    y = inertias,
    color = 'Green',
    marker = 'x',
)

plt.show()

In [None]:
# Final K-Means model: 12 clusters (the upper end of the range tried in the
# elbow search), fitted from scratch with more initialisations and a fixed
# seed rather than reusing kmeans_models[12].
kmeans12 = KMeans(n_clusters = 12, n_init = 304, random_state = 1)

kmeans12.fit(text_all)

In [None]:
# Attach each review's K-Means label to a copy of the cleaned frame, so
# clean_df itself stays unchanged for the other clustering experiments.
cluster_df = clean_df.copy()

cluster_df['cluster'] = kmeans12.labels_

display(cluster_df)

In [None]:
# Number of reviews assigned to each K-Means cluster, as horizontal bars.
total_cluster = cluster_df['cluster'].value_counts()

sns.barplot(
    x = total_cluster.values,
    y = total_cluster.index,
    orient = 'h',
    palette = 'viridis',
)

plt.show()

In [None]:
# Count reviews per (cluster, Cloth_class) pair; reset_index() turns the pair
# back into ordinary columns and the rows are then ordered by Cloth_class.
# NOTE(review): the plotting cells read the counts from a column named
# 'count', which relies on how the installed pandas names the value_counts
# result - confirm when changing pandas versions.
cluster_cloth = pd.DataFrame(cluster_df.groupby(['cluster', 'Cloth_class'])[['cluster', 'Cloth_class']].value_counts()).reset_index().sort_values(by = 'Cloth_class')

display(cluster_cloth)

In [None]:
# One panel per K-Means cluster on a 3 x 4 grid: how many reviews of each
# clothing class ended up in that cluster.
fig, axes = plt.subplots(nrows = 3, ncols = 4, figsize = (32, 30))

for cluster in range(12):
    df_cluster_n = cluster_cloth.loc[cluster_cloth['cluster'] == cluster]

    sns.barplot(
        x = df_cluster_n['count'],
        y = df_cluster_n['Cloth_class'],
        ax = axes[cluster // 4, cluster % 4],
        orient = 'h',
        palette = 'viridis',
    ).set_title(f"Cluster {cluster}")

plt.show()

In [None]:
# One panel per clothing class on an 8 x 3 grid: how that class's reviews are
# spread over the K-Means clusters.
fig, axes = plt.subplots(nrows = 8, ncols = 3, figsize = (16, 40))

for idx, cloth in enumerate(cluster_cloth['Cloth_class'].unique()):
    df_cluster_n = cluster_cloth.loc[cluster_cloth['Cloth_class'] == cloth]

    sns.barplot(
        x = df_cluster_n['count'],
        y = df_cluster_n['cluster'],
        ax = axes[idx // 3, idx % 3],
        orient = 'h',
        palette = 'viridis',
    ).set_title(f"{cloth}")

plt.show()

In [None]:
# Hierarchical clustering into 12 groups. The TF-IDF matrix is converted with
# toarray() before fitting, so memory use grows with documents x vocabulary.
agg_cluster = AgglomerativeClustering(n_clusters = 12)

agg_cluster.fit(text_all.toarray())

In [None]:
# Copy of the cleaned frame with the agglomerative cluster label per review.
agg_cluster_df = clean_df.copy()

agg_cluster_df['cluster'] = agg_cluster.labels_

display(agg_cluster_df)

In [None]:
# Number of reviews in each agglomerative cluster, as horizontal bars.
total_agg_cluster = agg_cluster_df['cluster'].value_counts()

sns.barplot(
    x = total_agg_cluster.values,
    y = total_agg_cluster.index,
    orient = 'h',
    palette = 'viridis',
)

plt.show()

In [None]:
# Count reviews per (cluster, Cloth_class) pair for the agglomerative labels,
# flattened to ordinary columns and ordered by Cloth_class.
# NOTE(review): the plotting cells expect the counts in a column named
# 'count'; that name depends on the installed pandas - confirm.
agg_cluster_cloth = pd.DataFrame(agg_cluster_df.groupby(['cluster', 'Cloth_class'])[['cluster', 'Cloth_class']].value_counts()).reset_index().sort_values(by = 'Cloth_class')

display(agg_cluster_cloth)

In [None]:
# One panel per agglomerative cluster on a 3 x 4 grid: clothing-class counts
# inside that cluster.
fig, axes = plt.subplots(nrows = 3, ncols = 4, figsize = (32, 30))

for cluster in range(12):
    df_cluster_n = agg_cluster_cloth.loc[agg_cluster_cloth['cluster'] == cluster]

    sns.barplot(
        x = df_cluster_n['count'],
        y = df_cluster_n['Cloth_class'],
        ax = axes[cluster // 4, cluster % 4],
        orient = 'h',
        palette = 'viridis',
    ).set_title(f"Cluster {cluster}")

plt.show()

In [None]:
# One panel per clothing class on an 8 x 3 grid: how that class's reviews are
# spread over the agglomerative clusters.
fig, axes = plt.subplots(nrows = 8, ncols = 3, figsize = (16, 40))

for idx, cloth in enumerate(agg_cluster_cloth['Cloth_class'].unique()):
    df_cluster_n = agg_cluster_cloth.loc[agg_cluster_cloth['Cloth_class'] == cloth]

    sns.barplot(
        x = df_cluster_n['count'],
        y = df_cluster_n['cluster'],
        ax = axes[idx // 3, idx % 3],
        orient = 'h',
        palette = 'viridis',
    ).set_title(f"{cloth}")

plt.show()

In [None]:
# Density-based clustering on the TF-IDF matrix. Points that DBSCAN cannot
# attach to any dense region get the label -1 (noise) in labels_.
dbscan = DBSCAN(min_samples = 100, n_jobs = 4, eps = 1, leaf_size = 400)

dbscan.fit(text_all)

In [None]:
# Copy of the cleaned frame with the DBSCAN label per review (-1 marks noise).
dbscan_df = clean_df.copy()

dbscan_df['cluster'] = dbscan.labels_

display(dbscan_df)

In [None]:
# Count reviews per (cluster, Cloth_class) pair for the DBSCAN labels,
# flattened to ordinary columns and ordered by Cloth_class.
# NOTE(review): the plotting cell expects the counts in a column named
# 'count'; that name depends on the installed pandas - confirm.
dbscan_cloth = pd.DataFrame(dbscan_df.groupby(['cluster', 'Cloth_class'])[['cluster', 'Cloth_class']].value_counts()).reset_index().sort_values(by = 'Cloth_class')

display(dbscan_cloth)

In [None]:
# DBSCAN uses the label -1 for noise points; that is not a cluster, so it is
# left out of the reported cluster count.
dbscan_cluster_labels = [label for label in dbscan_cloth['cluster'].unique() if label != -1]

print(f"Number of clusters in dbscan = {len(dbscan_cluster_labels)}")

In [None]:
# One panel per clothing class on an 8 x 3 grid: how that class's reviews are
# spread over the DBSCAN labels.
fig, axes = plt.subplots(nrows = 8, ncols = 3, figsize = (16, 40))

for idx, cloth in enumerate(dbscan_cloth['Cloth_class'].unique()):
    df_cluster_n = dbscan_cloth.loc[dbscan_cloth['Cloth_class'] == cloth]

    sns.barplot(
        x = df_cluster_n['count'],
        y = df_cluster_n['cluster'],
        ax = axes[idx // 3, idx % 3],
        orient = 'h',
        palette = 'viridis',
    ).set_title(f"{cloth}")

plt.show()

In [None]:
# Mini-batch variant of K-Means, using the same cluster count (12) and seed
# as the kmeans12 model.
mini_batch_kmeans = MiniBatchKMeans(n_clusters = 12, n_init = 504, random_state = 1)

mini_batch_kmeans.fit(text_all)

In [None]:
# Copy of the cleaned frame with the mini-batch K-Means label per review.
mini_batch_df = clean_df.copy()

mini_batch_df['cluster'] = mini_batch_kmeans.labels_

display(mini_batch_df)

In [None]:
# Number of reviews in each mini-batch K-Means cluster, as horizontal bars.
total_mini_batch = mini_batch_df['cluster'].value_counts()

sns.barplot(
    x = total_mini_batch.values,
    y = total_mini_batch.index,
    orient = 'h',
    palette = 'viridis',
)

plt.show()

In [None]:
# Gaussian mixture with 12 components on the densified TF-IDF matrix.
# random_state pins the initialisation so the fit is reproducible, matching
# the seeded K-Means models in this notebook.
gaussian_mix = GaussianMixture(n_components = 12, n_init = 14, random_state = 1)

gaussian_mix.fit(text_all.toarray())

In [None]:
def cluster_results(cluster_model, text, df):
    """Fit a clustering model and summarise its labels against ``Cloth_class``.

    Parameters
    ----------
    cluster_model
        Unfitted estimator with a ``fit`` method. After fitting it must either
        expose ``labels_`` (KMeans, DBSCAN, AgglomerativeClustering, ...) or
        provide ``predict`` (e.g. GaussianMixture, which has no ``labels_``).
    text
        Feature matrix passed to ``fit`` (and to ``predict`` when needed).
    df : pandas.DataFrame
        Frame with one row per sample of ``text`` and a ``Cloth_class`` column.
        It is not modified; the labels go on a copy.

    Returns
    -------
    model_df : pandas.DataFrame
        Copy of ``df`` with an added ``cluster`` column.
    total_cluster : pandas.Series
        Number of rows per cluster label.
    model_cloth : pandas.DataFrame
        Row count per (cluster, Cloth_class) pair, ordered by ``Cloth_class``.
    """
    cluster_model.fit(text)

    # Mixture models do not set labels_ on fit; ask them for hard assignments
    # instead of failing with AttributeError.
    if hasattr(cluster_model, 'labels_'):
        labels = cluster_model.labels_
    else:
        labels = cluster_model.predict(text)

    model_df = df.copy()

    model_df['cluster'] = labels

    total_cluster = model_df['cluster'].value_counts()

    model_cloth = pd.DataFrame(model_df.groupby(['cluster', 'Cloth_class'])[['cluster', 'Cloth_class']].value_counts()).reset_index().sort_values(by = 'Cloth_class')

    return model_df, total_cluster, model_cloth

def plot_cloths_cluster(model_cloth):
    """Draw one horizontal bar chart per clothing class showing its cluster counts.

    Parameters
    ----------
    model_cloth : pandas.DataFrame
        Frame with ``cluster``, ``Cloth_class`` and ``count`` columns, e.g. the
        third value returned by ``cluster_results``.

    The panels are laid out three per row. The grid has at least 8 rows (the
    layout this notebook has always used) and grows when there are more than
    24 distinct classes, instead of running off the end of the axes array.
    """
    cloth_classes = model_cloth['Cloth_class'].unique()

    n_cols = 3
    # Ceiling division without importing math; never fewer than the original 8 rows.
    n_rows = max(8, -(-len(cloth_classes) // n_cols))

    # 5 inches of height per row reproduces the original (16, 40) figure at 8 rows.
    fig, axes = plt.subplots(n_rows, n_cols, figsize = (16, 5 * n_rows))

    for idx, cloth in enumerate(cloth_classes):
        df_cluster_n = model_cloth[model_cloth['Cloth_class'] == cloth]

        sns.barplot(
                    y = df_cluster_n['cluster'],
                    x = df_cluster_n['count'],
                    orient = 'h',
                    ax = axes[idx//n_cols][idx%n_cols],
                    palette = 'viridis'
                   ).set_title(f"{cloth}")

    plt.show()

In [None]:
def redefine_class(x):
    """Map a clothing-class label onto its merged category.

    Labels listed in the table below are replaced by their target category;
    any other value is returned unchanged.
    """
    merged_label = {
        'Legwear': 'Pants',
        'Casual Bottoms': 'Pants',
        't': 'Pants',
        'Layering': '',
        '': '',
        'Suits': 'Blazer',
        'Intimates': 'Sleep',
        'Skirts': 'Dresses',
        'Dress': 'Dresses',
        'Fine gauge': 'Knits',
    }

    return merged_label.get(x, x)