In [None]:
#import libraries
import pandas as pd
import numpy as np
import csv
import ast
import re
from prettytable import PrettyTable

import time
import random

#data visualization libraries
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import plotly.io as pio
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

#NLP & ML libraries
from gensim import corpora
from gensim.models import Word2Vec
from textblob import TextBlob
from nltk import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance

from scipy.sparse import save_npz, load_npz


In [None]:
# Seed both the stdlib and the NumPy global random generators with the same
# value so that anything drawing from them is repeatable between runs.
SEED = 20
random.seed(SEED)
np.random.seed(SEED)

In [None]:
#import cleaned data

def list_converter(text):
    """Parse a stringified Python literal from a CSV cell back into an object.

    Intended as a ``pd.read_csv`` converter that reverts the list -> str
    conversion applied when list-valued columns were written to CSV.

    Parameters
    ----------
    text : str
        Raw cell text, e.g. ``"['foo', 'bar']"``.

    Returns
    -------
    list
        The literal parsed from ``text`` (expected to be a list), or an empty
        list when the cell is blank.

    Raises
    ------
    ValueError, SyntaxError
        If ``text`` is non-blank but is not a valid Python literal.
    """
    # A blank cell reaches a converter as '' and ast.literal_eval('') raises
    # SyntaxError; treat "nothing stored" as an empty token list instead of
    # aborting the whole load.
    if isinstance(text, str) and not text.strip():
        return []
    return ast.literal_eval(text)


# Load the cleaned corpus; both token columns are run through list_converter
# so they come back as Python lists rather than their string form.
data = pd.read_csv(
    'Data/filtered_corpus.csv',
    converters=dict.fromkeys(['tokens', 'updated_tokens'], list_converter),
)

In [None]:
# Flag documents whose 'updated_tokens' list has fewer than 3 entries. The
# boolean mask is built once and reused for both the count and the row labels.
too_short = data['updated_tokens'].apply(len) < 3
indx = data.index[too_short.to_numpy()]
count = int(too_short.sum())
print(f"Number of rows with less than 3 tokens: {count}")
print (indx[:5])

In [None]:
# Discard the 'index' column, then report the frame's dimensions and preview
# its first rows.
data = data.drop('index', axis=1)
print(data.shape)
data.head()

In [None]:
# Remove the short documents flagged in `indx`, then renumber the rows from 0.
# reset_index() is called without drop=True, so the previous row labels are
# kept as a new 'index' column.
data = data.drop(index=indx).reset_index()

## **TFIDF MODEL**

In [None]:
# Re-join each document's 'updated_tokens' into a single space-separated
# string, giving one plain-text document per row for the vectorizer.
corpus = [' '.join(doc_tokens) for doc_tokens in data['updated_tokens']]

corpus[:5]

#### **INSTANTIATE AND FIT TFIDF MODEL**

In [None]:
# Wall-clock start, used below to report how long vectorization took.
start_time = time.time()

# TF-IDF vectorizer over unigrams and bigrams. Terms found in more than 70% of
# the documents, or in fewer than 15 documents, are ignored; the vocabulary is
# then capped at 5000 features.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=15,
    max_df=0.70,
    max_features=5000,
)

# Learn the vocabulary and build the document-term matrix in a single call.
X_tfidf = tfidf.fit_transform(corpus)
print (f"The program took {time.time() - start_time:.2f} seconds to complete. The tfidf representation had {X_tfidf.shape[1]} features.")

In [None]:
# Feature names of the fitted vectorizer; used later as the column labels of
# the TF-IDF sample table and to map centroid dimensions back to terms.
tfidf_features = tfidf.get_feature_names_out()

In [None]:
#print(sorted(tfidf_features))

In [None]:
# Tabular view of a few documents next to their non-zero TF-IDF weights.
# One positional slice selects the rows from both the DataFrame and the
# matrix, so each text always lines up with its own vector regardless of the
# DataFrame's index labels (a label-based .loc slice only matches the matrix
# rows while the index happens to be a clean 0..n-1 range).
sample_rows = slice(8000, 8005)
text_df = data['clean_text'].iloc[sample_rows].copy()
tfidf_df = pd.DataFrame(X_tfidf[sample_rows].toarray(), columns = tfidf_features)

# Keep only the terms with a positive weight in at least one sampled document.
text_tfidf = tfidf_df.loc[:, (tfidf_df > 0.0).any()].copy()

text_tfidf.insert(0, 'clean_text', text_df.values) #include column of pre-processed text

text_tfidf

In [None]:
# Word cloud in which each term is sized by its total TF-IDF weight.

# Column sums of the TF-IDF matrix, flattened to 1-D and paired with their
# terms. These are summed weights, not raw occurrence counts.
word_scores = zip(tfidf_features, np.asarray(X_tfidf.sum(axis=0)).ravel())
word_scores_dict = dict(word_scores)

# Build the cloud directly from the term -> weight mapping.
wordcloud = WordCloud(background_color="white").generate_from_frequencies(word_scores_dict)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
plt.show()

In [None]:
# Elbow search for the number of clusters using yellowbrick's KElbowVisualizer.

# The base KMeans estimator is created without n_clusters; the visualizer is
# given the candidate range through k = (1,21).
# NOTE(review): presumably the upper bound is exclusive (k = 1..20) — confirm
# against the yellowbrick documentation.
kmeans = KMeans(init = 'k-means++', n_init = 10, random_state = 0)
visualizer = KElbowVisualizer(kmeans, k = (1,21))

# The sparse TF-IDF matrix is converted to a dense array before fitting.
visualizer.fit(X_tfidf.toarray())
visualizer.show()


In [None]:
# Silhouette plot for a 10-cluster KMeans fitted on the sparse TF-IDF matrix.
# The cluster count is fixed at 10 rather than taken from
# visualizer.elbow_value_.
sil_kmeans = KMeans(
    n_clusters=10,
    init="k-means++",
    n_init=10,
    max_iter=300,
    random_state=12,
)
sil_visualizer = SilhouetteVisualizer(sil_kmeans, colors='yellowbrick')

sil_visualizer.fit(X_tfidf)
sil_visualizer.show()

In [None]:
# Final clustering with a hard-coded cluster count (not
# visualizer.elbow_value_), fitted on the dense TF-IDF array.
true_k = 10

model = KMeans(
    n_clusters=true_k,
    init="k-means++",
    n_init=10,
    max_iter=100,
    random_state=10,
)
model.fit(X_tfidf.toarray())

# For every centroid, the feature indices sorted by descending weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_features

# Print the 10 highest-weighted terms of each cluster, one per line, with a
# blank line between clusters.
for cluster_id, ranked_features in enumerate(order_centroids):
    print (f'cluster {cluster_id}')
    for feature_idx in ranked_features[:10]:
        print (terms[feature_idx])
    print ("")

In [None]:
# Reduce the dimensionality of the document vectors to 2D
# NOTE(review): plotly.express is already imported as px in the first cell of
# the notebook; this repeated import is redundant.
import plotly.express as px

pca = PCA(n_components=2)
# NOTE(review): normalized_fc_doc_vectors is not assigned in any cell above in
# this view of the notebook — confirm it is defined before this cell runs on a
# fresh kernel (Restart & Run All).
doc_vectors_2d = pca.fit_transform(normalized_fc_doc_vectors)
# Independent copy of the token lists (its use is not visible in this view).
docs = data['updated_tokens'].copy()