In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from sklearn.cluster import KMeans, DBSCAN , AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import pandas as pd
import numpy as np

2023-12-01 14:50:50.142149: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-01 14:50:50.142181: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-01 14:50:50.143725: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-01 14:50:50.150946: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
kaggle = False

In [3]:
filepath = 'data/reviews.csv' if not kaggle else './kaggle/input/consumer-review-of-clothing-product/Consumer Review of Clothing Product/data_amazon.xlsx - Sheet1.csv'

df = pd.read_csv(filepath)

display(df)

Unnamed: 0,Title,Review,Cons_rating,Cloth_class,Materials,Construction,Color,Finishing,Durability
0,,Absolutely wonderful - silky and sexy and comf...,4.0,Intimates,0.0,0.0,0.0,1.0,0.0
1,,Love this dress! it's sooo pretty. i happene...,5.0,Dresses,0.0,1.0,0.0,0.0,0.0
2,Some major design flaws,I had such high hopes for this dress and reall...,3.0,Dresses,0.0,0.0,0.0,1.0,0.0
3,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5.0,Pants,0.0,0.0,0.0,0.0,0.0
4,Flattering shirt,This shirt is very flattering to all due to th...,5.0,Blouses,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
49333,Dress felt and fit great. I got lots of compl...,Loved the color!!! Dress fit great and I got ...,5.0,Dresses,0.0,0.0,1.0,0.0,0.0
49334,Loved the dress but poor quality,This dress looked great and I loved the materi...,2.0,Dresses,1.0,0.0,0.0,0.0,1.0
49335,"Cute dress, didn't fit",Wanted this dress to work it didn't. It is ver...,1.0,Dresses,0.0,1.0,0.0,0.0,0.0
49336,Very cute!,No complaints othe than the zipper gets stuck ...,4.0,Dresses,0.0,0.0,0.0,0.0,1.0


In [4]:
clean_df = df.dropna(subset = ['Title', 'Review']).drop(columns = ['Materials', 'Construction', 'Color', 'Finishing', 'Durability'])
clean_df.head()

Unnamed: 0,Title,Review,Cons_rating,Cloth_class
2,Some major design flaws,I had such high hopes for this dress and reall...,3.0,Dresses
3,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5.0,Pants
4,Flattering shirt,This shirt is very flattering to all due to th...,5.0,Blouses
5,Not for the very petite,"I love tracy reese dresses, but this one is no...",2.0,Dresses
6,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5.0,Knits


In [5]:
text_features = clean_df[['Review', 'Cloth_class','Title']].copy()
text_features

Unnamed: 0,Review,Cloth_class,Title
2,I had such high hopes for this dress and reall...,Dresses,Some major design flaws
3,"I love, love, love this jumpsuit. it's fun, fl...",Pants,My favorite buy!
4,This shirt is very flattering to all due to th...,Blouses,Flattering shirt
5,"I love tracy reese dresses, but this one is no...",Dresses,Not for the very petite
6,I aded this in my basket at hte last mintue to...,Knits,Cagrcoal shimmer fun
...,...,...,...
49333,Loved the color!!! Dress fit great and I got ...,Dresses,Dress felt and fit great. I got lots of compl...
49334,This dress looked great and I loved the materi...,Dresses,Loved the dress but poor quality
49335,Wanted this dress to work it didn't. It is ver...,Dresses,"Cute dress, didn't fit"
49336,No complaints othe than the zipper gets stuck ...,Dresses,Very cute!


In [6]:
def get_optimal_number_of_clusters(data):
    silhouette_scores = []

    for k in range(2, 16):
        print(f"Trying {k} clusters")
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(data)
        labels = kmeans.predict(data)
        silhouette_score_current = silhouette_score(data, labels)
        silhouette_scores.append(silhouette_score_current)

    optimal_number_of_clusters = silhouette_scores.index(max(silhouette_scores)) + 2

    return [optimal_number_of_clusters]



def clusters(X):
    
    print("Running DBSCAN...")
    dbscan = DBSCAN(eps=0.3, min_samples=10).fit(X)
    clusters = pd.DataFrame({'dbscan': dbscan.labels_})
    n_clusters = get_optimal_number_of_clusters(X)
    for cluster in n_clusters:
        print(f"Running Kmeans for {cluster} clusters...")
        kmeans = KMeans(n_clusters=cluster, init="k-means++", random_state=0).fit(X)
        clusters_df = pd.DataFrame({f'kmeans_c{cluster}': kmeans.labels_})
        
    clusters = pd.concat([clusters, clusters_df ], axis=1)
    
    return clusters

def tfidf(text):
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(text)
    return X

def create_clusters_for_data(text):
    vectorizes = [tfidf]
    clustered_data_frames = pd.DataFrame()
    for vec in vectorizes:
        clustered_data_frames = pd.concat([clustered_data_frames,clusters(vec(text))], axis=1)
        
    
    return clustered_data_frames


In [7]:
clusters_tfidf_reviews = create_clusters_for_data(text_features['Review'])
clusters_tfidf_reviews

Running DBSCAN...
Trying 2 clusters


  super()._check_params_vs_input(X, default_n_init=10)


Trying 3 clusters


  super()._check_params_vs_input(X, default_n_init=10)


Trying 4 clusters


  super()._check_params_vs_input(X, default_n_init=10)


Trying 5 clusters


  super()._check_params_vs_input(X, default_n_init=10)


Trying 6 clusters


  super()._check_params_vs_input(X, default_n_init=10)


Trying 7 clusters


  super()._check_params_vs_input(X, default_n_init=10)


Trying 8 clusters


  super()._check_params_vs_input(X, default_n_init=10)


Trying 9 clusters


  super()._check_params_vs_input(X, default_n_init=10)


Trying 10 clusters


  super()._check_params_vs_input(X, default_n_init=10)


Trying 11 clusters


  super()._check_params_vs_input(X, default_n_init=10)


Trying 12 clusters


  super()._check_params_vs_input(X, default_n_init=10)


Trying 13 clusters


  super()._check_params_vs_input(X, default_n_init=10)


Trying 14 clusters


  super()._check_params_vs_input(X, default_n_init=10)


Trying 15 clusters


  super()._check_params_vs_input(X, default_n_init=10)


Running Kmeans for 15 clusters...


  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,dbscan,kmeans_c15
0,-1,5
1,-1,8
2,-1,3
3,-1,1
4,-1,10
...,...,...
45316,-1,1
45317,-1,1
45318,-1,1
45319,-1,10


In [9]:
clusters_tfidf_reviews.to_csv('clusters_tfidf_reviews.csv')