Reference: [Word2Vec for text classification (Spot Intelligence)](https://spotintelligence.com/2023/02/15/word2vec-for-text-classification/#Text_classification_using_Word2Vec_Python)

## Install required packages

In [None]:
# !pip install psycopg2-binary
# referencing code samples in https://towardsdatascience.com/elbow-method-is-not-sufficient-to-find-best-k-in-k-means-clustering-fc820da0631d#:~:text=The%20elbow%20method%20is%20a,cluster%20and%20the%20cluster%20centroid.
!pip --version

!pip install -r requirements.txt

## Do general imports

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import math as math
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.stats import f_oneway
from scipy.stats import kruskal
from gensim.models import Word2Vec
from sklearn.decomposition import PCA


# Lift pandas' column display limit (None = no limit) for DataFrame output below.
pd.set_option('display.max_columns', None)

In [None]:
# Scratch check of the reduction `vectorize` relies on: mean(axis=0) averages
# down the rows, giving one value per column (here [1., 2., 2.]).
a = np.array([[1,1,1],[1,2,2],[1,3,3]])
a.mean(axis=0)

In [None]:
# Notebook configuration, read by the cells below.
# Upper end of the cluster-count sweep passed to the elbow visualizer.
max_clusters = 40
# Dimensionality of the Word2Vec embeddings (also names the output directory).
vector_size = 300
# Value of the `author_role` column to keep (also names the output directory).
author_role = 'others'

def vectorize(sentence, w2v_model):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    sentence : str
        Whitespace-delimited text; it is tokenised with ``str.split()``.
    w2v_model : gensim.models.Word2Vec
        Trained model. Only ``w2v_model.wv`` is used (membership test, item
        lookup and ``vector_size``).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``w2v_model.wv.vector_size``: the element-wise
        mean of the vectors of the words found in the vocabulary, or all
        zeros when none of the sentence's words are in the vocabulary
        (including the empty sentence).
    """
    keyed_vectors = w2v_model.wv
    words_vecs = [keyed_vectors[word] for word in sentence.split() if word in keyed_vectors]
    if not words_vecs:
        # Size the fallback from the model itself rather than a notebook-level
        # constant, so it always matches the width of the real embeddings.
        return np.zeros(keyed_vectors.vector_size)
    return np.array(words_vecs).mean(axis=0)

## Load Datasets

In [None]:
# Load the pre-processed utterances, keep only rows for the configured role,
# then drop the rows that satisfy all three conditions at once: authored by
# 'admin'/'u003', comment_seq below 3, and author_role equal to 'others'.
utterances_df = (
    pd.read_csv('./temp_data/pp_utterances.csv')
    .loc[lambda frame: frame['author_role'] == author_role]
    .loc[lambda frame: ~(
        frame['author'].isin(['admin', 'u003'])
        & (frame['comment_seq'] < 3)
        & (frame['author_role'] == 'others')
    )]
)
print(len(utterances_df))
utterances_df.head(2)

In [None]:
# `astype(str)` alone would turn missing bodies (NaN) into the literal word
# 'nan', which would then be trained on and embedded like real text. Blank
# those rows instead so they contribute no tokens.
raw_bodies = utterances_df['pp_actionbody']
text = raw_bodies.astype(str).mask(raw_bodies.isna(), '')

# Word2Vec consumes an iterable of token lists; tokenise on whitespace.
sentences = [sentence.split() for sentence in text]
print('train model')
# A fixed seed with a single worker removes thread-scheduling jitter from training.
# NOTE(review): gensim's docs say reproducibility across interpreter launches
# additionally needs PYTHONHASHSEED to be set — confirm if that matters here.
w2v_model = Word2Vec(sentences, vector_size=vector_size, window=5, min_count=5, workers=1, seed=42)
print('word2vec model built')

In [None]:
# One column per embedding dimension (f0 .. f{vector_size-1}) and one row per
# utterance, in the same order as `text`.
f_names = [f'f{i}' for i in range(vector_size)]
rows = [vectorize(t, w2v_model) for t in text]

df = pd.DataFrame(data=rows, columns=f_names)
print(len(df))

In [None]:
# Preview the first embedded utterance (one value per f-column).
df.head(1)

In [None]:
# Elbow search: let the visualizer refit this KMeans estimator over a range of
# cluster counts and plot the resulting scores.
km = KMeans(n_init="auto",random_state=42)

# NOTE(review): k=(2, max_clusters+1) presumably sweeps k = 2..max_clusters
# (upper bound exclusive), and timings=False presumably hides the fit-time
# curve — confirm both against the yellowbrick documentation.
visualizer = KElbowVisualizer(km, k=(2,max_clusters+1),timings=False)
# Label both axes with a larger font before fitting.
visualizer.ax.set_xlabel('k',fontsize=14) 
visualizer.ax.set_ylabel('distortion score',fontsize=14) 

visualizer.fit(df)        # Fit the visualizer on the utterance embeddings
visualizer.show()        # Finalize and render the figure

In [None]:
# Refit KMeans with the cluster count chosen by the elbow visualizer.
best_k = visualizer.elbow_value_
print(f'best k is {best_k}')
if best_k is None:
    # The visualizer reports None when it cannot locate an elbow; passing that
    # to KMeans would only fail later with an opaque parameter-validation error.
    raise ValueError(
        'KElbowVisualizer found no elbow in the searched range; '
        'adjust max_clusters or choose n_clusters manually.'
    )
km = KMeans(n_clusters=best_k, n_init='auto', random_state=42)
km.fit(df)

# pca = PCA(n_components=2)
# pca = pca.fit_transform(df.to_numpy())
# pca_df = pd.DataFrame(columns=['pc1','pc2'],data=pca)

# pca_df['label'] = km.labels_
# # pca_df.head(3)

In [None]:
# pca_df['label'].value_counts()

In [None]:
# fig = plt.figure(figsize=(8,8))
# ax = fig.add_subplot(111)
# for c in pca_df['label'].drop_duplicates():
#     pca_l = pca_df[pca_df['label'] == c]
#     ax.scatter(pca_l['pc1'],pca_l['pc2'],label=f'{c}-{len(pca_l)}')
# ax.legend()

In [None]:
from pathlib import Path

# Attach each utterance's cluster id. `km.labels_` is aligned by position with
# the rows that were embedded, so a plain array assignment is what we want.
# `assign` builds a new frame rather than writing into the filtered slice,
# which avoids pandas' SettingWithCopyWarning.
utterances_df = utterances_df.assign(label=km.labels_)

# The target directory does not depend on the cluster, so create it once.
output_dir = Path(f'./temp_data/{author_role}/{vector_size}')
output_dir.mkdir(exist_ok=True, parents=True)

# Write one CSV per cluster; groupby yields each label with its rows in their
# original order, so the files match a per-label boolean filter.
# NOTE: files left by an earlier run with more clusters are not removed.
for c, c_df in utterances_df.groupby('label'):
    c_df.to_csv(output_dir / f'pp_utterances_{c}.csv')
print('saved')

In [None]:
# for i in range(2,max_clusters + 1):
#     km = KMeans(n_clusters=i, n_init='auto', random_state=42)
#     q, mod = divmod(i, 3)
#     fig = plt.figure(figsize=(6,2))
#     ax = fig.add_subplot(1,1,1)
#     ax.set_ylabel('Size')
#     ax.set_xlabel(f'Score')
#     ax.set_title(f'{i} clusters')
#     visualizer = SilhouetteVisualizer(km,colors=sns.color_palette("tab10"), ax=ax)
#     visualizer.fit(df)

In [None]:
# km = KMeans(n_clusters=4, n_init='auto', random_state=42)
# km.fit(df)

# df_pp = issues_df[columns].copy()
# df_pp.loc[:,'category'] = km.labels_

