In [None]:
!pip install -q openai

In [None]:
from collections import Counter

import nltk
import pandas as pd
import seaborn as sns
import spacy
from matplotlib import pyplot as plt
from nltk.stem import SnowballStemmer
from openai import OpenAI
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans, MiniBatchKMeans
from sklearn.decomposition import LatentDirichletAllocation, PCA, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from wordcloud import WordCloud

In [None]:
# The OpenAI key is read from an attached Kaggle input file rather than hard-coded.
# strip() drops the trailing newline/whitespace that text files usually end with;
# passed through verbatim it would become part of the key sent to the API.
with open('/kaggle/input/apikey/apikey.txt', 'r') as key_file:
    api_key = key_file.read().strip()

In [None]:
# Location of the clothing-review CSV inside the Kaggle input directory.
filepath = "/kaggle/input/consumer-review-of-clothing-product/Consumer Review of Clothing Product/data_amazon.xlsx - Sheet1.csv"

df = pd.read_csv(filepath)

# Visual check of the raw table.
display(df)

In [None]:
# Missing values per column, collected in a one-column frame named 'Nulls'.
null_counts = df.isnull().sum()
nulls = null_counts.to_frame(name = 'Nulls')

# Transposed so the columns of `df` read left to right in a single row.
display(nulls.T)

In [None]:
# Number of rows per clothing class (value_counts orders them most frequent first).
classes = df['Cloth_class'].value_counts()

# Horizontal bars: class names on the y axis, row counts on the x axis.
sns.barplot(y = classes.index, x = classes.values, palette = 'viridis', orient = 'h')

plt.show()

In [None]:
# Number of rows per consumer rating value.
ratings = df['Cons_rating'].value_counts()

# Vertical bars: rating value on the x axis, row count on the y axis.
sns.barplot(x = ratings.index, y = ratings.values, palette = 'viridis')

plt.show()

In [None]:
# Columns that must be present for a row to be usable in the text analysis.
required_columns = ['Title', 'Review', 'Cloth_class']
# Columns that the rest of the notebook never reads.
unused_columns = ['Materials', 'Construction', 'Color', 'Finishing', 'Durability']

complete_rows = df.dropna(subset = required_columns)
clean_df = complete_rows.drop(columns = unused_columns)

display(clean_df)

In [None]:
fig, ax = plt.subplots(figsize = (10, 10))

# Whitespace-delimited token count of every review body.
words_per_review = clean_df['Review'].apply(lambda review: len(review.split()))

# Draw on the axes created above (the same axes seaborn would pick implicitly).
review_hist = sns.histplot(words_per_review, kde = True, bins = 75, color = 'Green', ax = ax)
review_hist.set_title('Histogram of amount of words per review')

plt.show()

In [None]:
fig, ax = plt.subplots(figsize = (10, 10))

# Whitespace-delimited token count of every review title.
words_per_title = clean_df['Title'].apply(lambda title: len(title.split()))

# Draw on the axes created above (the same axes seaborn would pick implicitly).
title_hist = sns.histplot(words_per_title, bins = 25, color = 'DarkGreen', ax = ax)
title_hist.set_title('Histogram of amount of words per title')

plt.show()

In [None]:
# One text document per row: the review body followed by its title, separated by a single space.
review_and_title = clean_df['Review'] + ' ' + clean_df['Title']

In [None]:
def text_feature_extract(text_feature, model):
    """Fit ``model`` on ``text_feature`` and return the transformed result.

    Thin wrapper around ``model.fit_transform``. The model object is fitted
    in place, so the caller can keep querying it after the call.
    """
    return model.fit_transform(text_feature)

In [None]:
def cluster_results(cluster_model, text, df):
    """Fit a clustering model and summarise its labels against ``Cloth_class``.

    Parameters
    ----------
    cluster_model : object exposing ``fit`` and, once fitted, ``labels_``.
    text : feature matrix passed to ``cluster_model.fit``.
    df : DataFrame with a ``Cloth_class`` column and one row per row of ``text``.

    Returns
    -------
    tuple
        ``(model_df, total_cluster, model_cloth)``: a copy of ``df`` with an
        added ``cluster`` column, the number of rows per cluster, and the
        number of rows per (cluster, Cloth_class) pair sorted by ``Cloth_class``.
    """
    cluster_model.fit(text)

    # Work on a copy so the caller's frame does not gain a 'cluster' column.
    labelled = df.copy()
    labelled['cluster'] = cluster_model.labels_

    cluster_sizes = labelled['cluster'].value_counts()

    group_keys = ['cluster', 'Cloth_class']
    pair_counts = labelled.groupby(group_keys)[group_keys].value_counts()
    per_class = pd.DataFrame(pair_counts).reset_index().sort_values(by = 'Cloth_class')

    return labelled, cluster_sizes, per_class

def plot_total_cluster(total_cluster):
    """Show a bar chart of cluster sizes.

    ``total_cluster`` is expected to be indexed by cluster label with the
    number of rows in that cluster as values (the second item returned by
    ``cluster_results``).
    """
    cluster_ids = total_cluster.index
    cluster_sizes = total_cluster.values

    sns.barplot(x = cluster_ids, y = cluster_sizes, palette = 'viridis')

    plt.show()

def plot_cloth_cluster(model_cloth):
    """Show, for every clothing class, how its rows are spread over the clusters.

    Parameters
    ----------
    model_cloth : DataFrame with ``cluster``, ``Cloth_class`` and ``count``
        columns (the third item returned by ``cluster_results``).

    One horizontal bar chart is drawn per distinct ``Cloth_class``, laid out
    four per row. The number of rows follows the number of classes instead of
    being fixed at six, so more than 24 classes no longer raise an IndexError
    and fewer classes no longer leave rows of empty axes.
    """
    n_cols = 4
    cloth_classes = model_cloth['Cloth_class'].unique()
    # Ceiling division; keep at least one row so plt.subplots stays valid on empty input.
    n_rows = max(1, -(-len(cloth_classes) // n_cols))

    # 5 inches per row reproduces the original (32, 30) canvas for a 6-row grid.
    # squeeze = False keeps `axes` two-dimensional even when there is a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize = (32, 5 * n_rows), squeeze = False)

    for idx, cloth in enumerate(cloth_classes):
        df_cluster_n = model_cloth[model_cloth['Cloth_class'] == cloth]

        sns.barplot(
                    y = df_cluster_n['cluster'],
                    x = df_cluster_n['count'],
                    orient = 'h',
                    ax = axes[idx // n_cols][idx % n_cols],
                    palette = 'viridis'
                   ).set_title(f"{cloth}")

    # Hide the grid cells left over when the class count is not a multiple of n_cols.
    for unused_ax in axes.ravel()[len(cloth_classes):]:
        unused_ax.set_visible(False)

    plt.show()

In [None]:
# Flatten every document into one corpus string. The separator has to be a space:
# joining on "" glues the last word of each document to the first word of the next
# one, which corrupts every word-level statistic derived from `words`.
words = " ".join(review_and_title.values)

In [None]:
fig, ax = plt.subplots(figsize = (15, 10))

# Number of tokens for each character length, ordered by length.
# value_counts().sort_index() replaces the group-by-on-its-own-column construction.
len_of_words = pd.Series([len(token) for token in words.split()], name = 'Len').value_counts().sort_index()

# Title typo fixed ("Lenght" -> "Length").
sns.barplot(x = len_of_words.index, y = len_of_words.values, color = 'DarkCyan', ax = ax).set_title("Length of words")

plt.show()

In [None]:
# Every distinct token next to its character length.
tokens = words.split()

len_of_words_df = pd.DataFrame({'Word': tokens, 'Len': [len(token) for token in tokens]}).drop_duplicates()

# Longest tokens first.
display(len_of_words_df.sort_values(by = 'Len', ascending = False))

In [None]:
fig, ax = plt.subplots(figsize = (15, 15))

# Word cloud built from the whole corpus string.
word_cloud = WordCloud(background_color = 'black').generate(words)

# Render on the axes created above and hide ticks and frame.
ax.imshow(word_cloud)
ax.axis("off")
plt.show()

In [None]:
# Token frequencies over the whole corpus. Counter replaces the hand-written tally
# (it is a dict subclass, so `words_qtd` can still be used as a mapping).
words_qtd = Counter(words.split())

# (word, count) pairs by descending count; ties stay in first-seen order, which is
# the same ordering the manual stable sort with reverse = True produced.
most_frequent = words_qtd.most_common()

n_freq = 25

fig, ax = plt.subplots(figsize = (10, 10))

top_words = most_frequent[:n_freq]

sns.barplot(x = [count for word, count in top_words],
            y = [word for word, count in top_words],
            palette = 'viridis',
            orient = 'h'
           )

plt.show()

In [None]:
languages = {'english', 'spanish', 'portuguese'}

# Concatenate the NLTK stop-word lists of every language into one flat list.
stopwords = [
    stop_word
    for lang in languages
    for stop_word in nltk.corpus.stopwords.words(lang)
]

In [None]:
# Set membership is O(1); testing against the stop-word list rescanned the whole
# list for every distinct word in the corpus.
stopword_lookup = set(stopwords)

# NOTE(review): tokens are compared exactly as they appear in the text, so a
# capitalised form of a stop word presumably survives this filter - confirm
# whether the corpus should be lower-cased first.
most_freq_non_sw = [pair for pair in most_frequent if pair[0] not in stopword_lookup]

n_freq = 30

fig, ax = plt.subplots(figsize = (10, 10))

top_non_sw = most_freq_non_sw[:n_freq]

sns.barplot(x = [count for word, count in top_non_sw],
            y = [word for word, count in top_non_sw],
            palette = 'viridis',
            orient = 'h'
           )

plt.show()

In [None]:
# Stop-word list handed to the vectorizer; `sw` is an alias of `stopwords`, not a copy.
# NOTE(review): a variant that also appended the one-character tokens from
# len_of_words_df was commented out here - confirm whether it should come back.
sw = stopwords

In [None]:
stemmer = SnowballStemmer('english')

# Split each document on whitespace, stem every token, and glue the stems back
# together with single spaces so the result is again one string per document.
stemmed = review_and_title.str.split().apply(
    lambda tokens: ' '.join(stemmer.stem(token) for token in tokens)
)

In [None]:
# Bag-of-words over unigrams, bigrams and trigrams with the combined stop-word list.
# Integer min_df / max_df are absolute document counts: a term is kept only if it
# occurs in at least 4 and at most 8000 documents.
count_vect0 = CountVectorizer(stop_words = sw, min_df = 4, max_df = 8000, ngram_range = (1, 3))

In [None]:
# Document-term count matrix of the stemmed corpus.
text_all = text_feature_extract(stemmed, count_vect0)

# Size of the learned vocabulary (n-grams included, given the vectorizer's ngram_range).
vocabulary = count_vect0.get_feature_names_out()

print(f"Number of words = {len(vocabulary)}")

In [None]:
# 12-topic LDA fitted on the term counts; random_state pins the result across re-runs.
lda = LatentDirichletAllocation(n_components = 12, n_jobs = 4, random_state = 1)

# Fits the model and returns the per-document topic representation.
topics = lda.fit_transform(text_all)

In [None]:
client = OpenAI(
    api_key = api_key
)

# Prompt header; the per-topic word lists are appended to this string later.
# - The topic count comes from the fitted LDA model instead of a hard-coded 12.
# - The header ends with a newline so the first topic does not run into it.
# - The reply is later split on newlines and indexed by topic position, so the
#   prompt asks for exactly that layout.
content = (
    f'I want you to give a 1 to 5 word title to each of these {lda.n_components} topics '
    'based on the group of words. Reply with exactly one title per line, in topic order, '
    'and nothing else.\n'
)

In [None]:
n_top_words = 40

topic_dict = dict()

# Loop-invariant: fetch the vocabulary array once instead of once per topic.
feature_names = count_vect0.get_feature_names_out()

for idx, topic in enumerate(lda.components_):
    # argsort is ascending, so the last n_top_words positions are the heaviest
    # terms of the topic, with the heaviest one last.
    top_features_idx = topic.argsort()[-n_top_words:]
    top_features = feature_names[top_features_idx]
    weights = topic[top_features_idx]

    topic_dict[idx] = {'top_features': top_features, 'weights': weights}

    # NOTE: this appends to the prompt, so re-running this cell without first
    # re-running the cell that initialises `content` duplicates the topic lists.
    content += f'Topic {idx+1}: {top_features}\n'

In [None]:
completion = client.chat.completions.create(
    model = 'gpt-3.5-turbo',
    messages = [
        {'role': 'user',
         'content': content}
    ],
    temperature = 0.1
)

# Titles are later looked up by topic position, so blank lines in the reply
# would shift every following title; drop them and trim surrounding whitespace.
# The message content can be None, hence the empty-string fallback.
raw_titles = completion.choices[0].message.content or ''

titles = [line.strip() for line in raw_titles.splitlines() if line.strip()]

In [None]:
# Two panels per row; the row count follows the number of topics
# (12 topics give the original 6 x 2 grid on an 18 x 24 canvas).
n_topics = len(topic_dict)
n_rows = max(1, (n_topics + 1) // 2)

fig, axes = plt.subplots(n_rows, 2, figsize = (18, 4 * n_rows), squeeze = False)

for idx in topic_dict.keys():
    # Stored in ascending weight order: reverse and keep the 15 heaviest terms.
    top_features = topic_dict[idx]['top_features'][::-1][:15]
    weights = topic_dict[idx]['weights'][::-1][:15]
    # The model may answer with fewer lines than topics; fall back to a generic
    # label instead of failing with an IndexError.
    title = titles[idx] if idx < len(titles) else f'Topic {idx + 1}'

    sns.barplot(x = weights,
                y = top_features,
                palette = 'viridis',
                orient = 'h',
                ax = axes[idx // 2][idx % 2]
               ).set_title(title)

# Hide the spare panel when the number of topics is odd.
for unused_ax in axes.ravel()[n_topics:]:
    unused_ax.set_visible(False)

plt.show()

In [None]:
n_top_words = 15

# Loop-invariant: fetch the vocabulary array once instead of once per topic.
feature_names = count_vect0.get_feature_names_out()

# Two panels per row; the row count follows the number of topics
# (12 topics give the original 6 x 2 grid on an 18 x 24 canvas).
n_topics = lda.components_.shape[0]
n_rows = max(1, (n_topics + 1) // 2)

fig, axes = plt.subplots(n_rows, 2, figsize = (18, 4 * n_rows), squeeze = False)

for idx, topic in enumerate(lda.components_):
    # argsort is ascending: the slice holds the n_top_words heaviest terms, heaviest last.
    top_features_idx = topic.argsort()[-n_top_words:]
    top_features = feature_names[top_features_idx]
    weights = topic[top_features_idx]

    # Reversed so the heaviest term is drawn at the top of the panel.
    sns.barplot(x = weights[::-1],
                y = top_features[::-1],
                palette = 'viridis',
                orient = 'h',
                ax = axes[idx // 2][idx % 2]
               ).set_title(f'Topic {idx}')

# Hide the spare panel when the number of topics is odd.
for unused_ax in axes.ravel()[n_topics:]:
    unused_ax.set_visible(False)

plt.show()

In [None]:
# Two-component projection of the count matrix, used for plotting and clustering.
# The default solver is randomized, so the seed is pinned (same value the other
# estimators in this notebook use) to make the projection reproducible.
tsvd = TruncatedSVD(n_components = 2, random_state = 1)

reduction = tsvd.fit_transform(text_all)

In [None]:
all_classes = list(clean_df['Cloth_class'].unique())

# Integer colour code per class, in order of first appearance. A dict lookup
# replaces list.index(), which rescanned the class list for every row.
class_codes = clean_df['Cloth_class'].map({name: code for code, name in enumerate(all_classes)})

fig, ax = plt.subplots(figsize = (12, 8))

points = ax.scatter(x = reduction[:, 0], y = reduction[:, 1], c = class_codes)

# A bare plt.legend() finds no labelled artists and draws an empty legend.
# legend_elements(num = None) returns one handle per distinct colour code in
# ascending order, which lines up with the order of `all_classes`.
handles = points.legend_elements(num = None)[0]

ax.legend(handles, all_classes, title = 'Cloth_class', bbox_to_anchor = (1.02, 1), loc = 'upper left')

plt.show()

In [None]:
inertias = list()
ranges = list(range(7, 16))

# Fitted model per cluster count, kept for later inspection.
kmeans_models = dict()

for n in ranges:
    # Fixed seed so the inertia curve is identical on every run; the single
    # K-Means fits later in the notebook already pin random_state = 1.
    kmeans_n = KMeans(n_clusters = n, n_init = 124, random_state = 1)

    kmeans_n.fit(text_all)

    inertias.append(kmeans_n.inertia_)

    kmeans_models[n] = kmeans_n

In [None]:
# Elbow plot: K-Means inertia for each cluster count in `ranges`.
sns.lineplot(x = ranges, y = inertias, marker = 'x', color = 'Green')

plt.show()

In [None]:
# K-Means with 3 clusters fitted on the full document-term matrix.
kmeans = KMeans(n_clusters = 3, n_init = 514, random_state = 1, max_iter = 354)

# Labelled copy of clean_df, rows per cluster, and rows per (cluster, Cloth_class).
kmeans_df, total_kmeans, kmeans_cloth = cluster_results(kmeans, text_all, clean_df)

display(kmeans_df)

In [None]:
# Number of rows assigned to each K-Means cluster.
plot_total_cluster(total_kmeans)

In [None]:
# One panel per clothing class showing how its rows are spread across the K-Means clusters.
plot_cloth_cluster(kmeans_cloth)

In [None]:
# K-Means with 8 clusters, this time fitted on the 2-component SVD projection (`reduction`).
# NOTE: this rebinds kmeans, kmeans_df, total_kmeans and kmeans_cloth from the 3-cluster run.
kmeans = KMeans(n_clusters = 8, n_init = 514, random_state = 1, max_iter = 354)

kmeans_df, total_kmeans, kmeans_cloth = cluster_results(kmeans, reduction, clean_df)

display(kmeans_df)

In [None]:
# Number of rows assigned to each cluster of the most recent K-Means fit.
plot_total_cluster(total_kmeans)

In [None]:
# Per clothing class: distribution of its rows over the clusters of the most recent K-Means fit.
plot_cloth_cluster(kmeans_cloth)

In [None]:
# Agglomerative clustering into 4 clusters.
agg_cluster = AgglomerativeClustering(n_clusters = 4)

# toarray() converts the sparse count matrix to a dense array before fitting -
# presumably because this estimator needs dense input; assumes the dense matrix fits in memory.
agg_cluster_df, total_agg_cluster, agg_cluster_cloth = cluster_results(agg_cluster, text_all.toarray(), clean_df)

display(agg_cluster_df)

In [None]:
# Number of rows assigned to each agglomerative cluster.
plot_total_cluster(total_agg_cluster)

In [None]:
# Per clothing class: distribution of its rows over the agglomerative clusters.
plot_cloth_cluster(agg_cluster_cloth)

In [None]:
# Density-based clustering on cosine distance with a neighbourhood radius of 0.8.
# DBSCAN picks the number of clusters itself; label -1 marks points it treats as noise.
dbscan = DBSCAN(metric = 'cosine', n_jobs = 4, eps = 0.8)

dbscan_df, total_dbscan, dbscan_cloth = cluster_results(dbscan, text_all, clean_df)

display(dbscan_df)

In [None]:
# Number of rows per DBSCAN label.
plot_total_cluster(total_dbscan)

In [None]:
# Per clothing class: distribution of its rows over the DBSCAN labels.
plot_cloth_cluster(dbscan_cloth)

In [None]:
# Mini-batch variant of K-Means with 8 clusters on the full document-term matrix.
mini_batch_kmeans = MiniBatchKMeans(n_clusters = 8, n_init = 504, random_state = 1)

mini_batch_df, total_mini_batch, mini_batch_cloth = cluster_results(mini_batch_kmeans, text_all, clean_df)

display(mini_batch_df)

In [None]:
# Number of rows assigned to each mini-batch K-Means cluster.
plot_total_cluster(total_mini_batch)

In [None]:
# Per clothing class: distribution of its rows over the mini-batch K-Means clusters.
plot_cloth_cluster(mini_batch_cloth)

In [None]:
def redefine_class(x):
    """Map a raw clothing class label onto its merged category.

    Labels that have no entry in the table are returned unchanged.
    """
    # NOTE(review): the 't' key and the '' key/value look like placeholders - confirm.
    merged_class = {
        'Legwear': 'Pants',
        'Casual Bottoms': 'Pants',
        't': 'Pants',
        'Layering': '',
        '': '',
        'Suits': 'Blazer',
        'Intimates': 'Sleep',
        'Skirts': 'Dresses',
        'Dress': 'Dresses',
        'Fine gauge': 'Knits',
    }

    return merged_class.get(x, x)