# Clustering

### Importing needed libraries and functions

In [None]:
import os
import re
import string
import pandas as pd
import numpy as np
import time

from imblearn.under_sampling import RandomUnderSampler

import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import WordPunctTokenizer

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# English stop-word list from the NLTK corpus (the 'stopwords' resource must
# have been downloaded once beforehand).
stop_words = nltk.corpus.stopwords.words('english')
# Single WordNet lemmatizer instance shared by the preprocessing function.
lemmatizer = WordNetLemmatizer() 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn import metrics

from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics import rand_score
%matplotlib qt
#%matplotlib inline

### defining preprocessing function

In [None]:
#Stop Words Download, run just once
#nltk.download('stopwords')

def preprocText(text):
    """Turn a raw text into a list of cleaned, lemmatized tokens.

    Steps, in order: remove digits, empty the content of every bracketed
    group (round or square) that opens and closes on the same line, strip
    ASCII punctuation, lowercase, tokenize, drop English stop words,
    lemmatize, and keep only lemmas longer than two characters.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The surviving lemmas, in their original order.
    """
    text = re.sub(r'\d', '', text)
    # Raw strings: "\(" and "\g" are not valid Python string escapes, so the
    # non-raw form triggers a SyntaxWarning on recent interpreters. The
    # pattern and replacement text are unchanged.
    text = re.sub(r"([\(\[]).*?([\)\]])", r"\g<1>\g<2>", text)
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    tokens = WordPunctTokenizer().tokenize(text)

    # The module-level stop-word collection is a list; a set gives constant
    # time membership tests for the (potentially very long) token stream.
    stop_word_set = set(stop_words)

    cleaned_tokens = []
    for token in tokens:
        if token in stop_word_set:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Length is checked after lemmatization, as lemmas can be shorter.
        if len(lemma) > 2:
            cleaned_tokens.append(lemma)

    return cleaned_tokens

### loading dataset and filtering genres

In [None]:
# Genres retained for the clustering experiment; every other row is dropped.
genresToKeep = ['Drama', 'Comedy', 'Action', 'Horror', 'Documentary']

df = pd.read_csv('final.csv')
df = df.loc[df['Genres'].isin(genresToKeep)]

# Last expression of the cell: number of rows per retained genre.
df['Genres'].value_counts()

### Undersampling

In [None]:
# Split features from the target.
# NOTE(review): this assumes the genre label is the last column of df -- confirm.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Random under-sampling without replacement; the seed is fixed so the
# selected rows are the same on every run.
rus = RandomUnderSampler(replacement=False, random_state=1234)
# fit_resample is the supported method name; the older fit_sample alias was
# deprecated and later removed from imbalanced-learn.
X, y = rus.fit_resample(X, y)
# Last expression of the cell: class counts after resampling.
y.value_counts()

### Applying preprocessing to extract final corpus

In [None]:
# Number of documents kept after under-sampling.
n = len(X)
# One token list per film script, produced by the preprocessing function.
corpus = list(map(preprocText, X['filmScript']))

## FEATURE EXTRACTION

### Doc2Vec

In [None]:
%%time
import codecs

# Path of the pre-trained Doc2Vec model. It is built with os.path.join
# because a backslash-separated literal only resolves on Windows and contains
# an invalid string escape.
model = os.path.join("doc2vec", "doc2vec.bin")
# Inference hyper-parameters
start_alpha = 0.01
infer_epoch = 5
# Load model
m = Doc2Vec.load(model)

# Infer one vector per preprocessed document (first n entries of the corpus).
# `epochs` is the current keyword of infer_vector; the older `steps` keyword
# is no longer accepted by recent gensim releases.
Doc2Vect = np.array([
    m.infer_vector(tokens, alpha=start_alpha, epochs=infer_epoch)
    for tokens in corpus[:n]
])
print("Doc2Vec Matrix shape: ", Doc2Vect.shape)

### TF and TFIDF

In [None]:
%%time
# Runs in roughly 50 seconds (author's note).
from sklearn.feature_extraction.text import CountVectorizer

# Term counts. The corpus is already tokenized, so the analyzer hands each
# token list back unchanged.
tf_vectorizer = CountVectorizer(analyzer=lambda tokens: tokens)
dtm_tf = tf_vectorizer.fit_transform(corpus)

# TF-IDF weights, configured with the same parameters as the count vectorizer.
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(corpus)

# Shapes of the two document-term matrices, one per line.
print(dtm_tf.shape, dtm_tfidf.shape, sep="\n")

### SVD

In [None]:
# Reduce each document-term matrix to 300 components with truncated SVD.
# NOTE: despite its name, `pca` is a TruncatedSVD estimator. The same object
# is fitted twice, so after this cell it holds the fit on dtm_tfidf only.
pca = TruncatedSVD(n_components=300)
TF_SVD = pca.fit_transform(dtm_tf)
TFIDF_SVD = pca.fit_transform(dtm_tfidf)

### GloVe Vectors

In [None]:
# Build each document's GloVe vector from the vectors of its individual words.
# If a word is not in the vocabulary, an array of zeros is used instead.
from gensim.models import KeyedVectors
# GloVe vectors stored in word2vec text format (hence binary=False).
filename = 'glove.6B.300d.txt.word2vec'
# NOTE: `model` is also assigned in the Doc2Vec cell (as the model path);
# this statement rebinds it to the loaded keyed vectors.
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [None]:
# Represent every document as the mean of its word vectors.
# The dimensionality is read from the loaded vectors instead of being
# hard-coded, so other GloVe sizes work without editing this cell.
glove_dim = model.vector_size

GloVe = []
for document in corpus:
    documentRepr = []
    for word in document:
        try:
            documentRepr.append(model.get_vector(word))
        except KeyError:
            # Out-of-vocabulary word: contribute a zero vector. Only the
            # missing-key error is caught so unrelated failures surface.
            documentRepr.append(np.zeros(glove_dim))
    if documentRepr:
        GloVe.append(np.mean(np.array(documentRepr), axis=0))
    else:
        # A document with no tokens would otherwise average an empty array
        # (NaN scalar) and make the final matrix ragged.
        GloVe.append(np.zeros(glove_dim))
GloVe = np.array(GloVe)
# Last expression of the cell: (number of documents, vector size).
GloVe.shape

### Clustering and Evaluation

In [None]:
# Set origin to 'svdtf', 'svdtfidf', 'doc2vec' or 'glove' to choose which
# document representation is clustered (and visualized afterwards).
origin = 'doc2vec'
clust_n = 5

# Each branch only touches the matrix it needs, so representations whose
# cells were not run do not have to exist. Features are rescaled to [0, 1].
# SVD of TF
if origin == 'svdtf':
    xx = MinMaxScaler().fit_transform(TF_SVD)
# SVD of TF-IDF
elif origin == 'svdtfidf':
    xx = MinMaxScaler().fit_transform(TFIDF_SVD)
# Doc2Vec
elif origin == 'doc2vec':
    xx = MinMaxScaler().fit_transform(Doc2Vect)
# GloVe
elif origin == 'glove':
    xx = MinMaxScaler().fit_transform(GloVe)
else:
    # Without this guard a typo would silently reuse a stale `xx` left over
    # from a previous run of the cell (or fail later with a NameError).
    raise ValueError(
        "Unknown origin %r: expected 'svdtf', 'svdtfidf', 'doc2vec' or 'glove'"
        % origin
    )


# KMEANS (alternative algorithms are kept below for quick switching)
clustering = KMeans(n_clusters=clust_n).fit(xx)
#clustering = AgglomerativeClustering(n_clusters=clust_n).fit(xx)
#clustering = SpectralClustering(n_clusters=clust_n, affinity='nearest_neighbors',assign_labels='discretize',n_neighbors = 100).fit(xx)

# External scores compare cluster assignments with the true genres; the
# silhouette score only uses the feature matrix and the assignments.
print("Rand Index: %0.3f"
      % metrics.rand_score(y, clustering.labels_))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(y, clustering.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(xx, clustering.labels_))
print()

### Visualization

In [None]:
def plotPCA(x,y,label,clust_n,title):
    """Draw a 16x9 scatter plot of 2-D points coloured by a label and show it.

    Parameters
    ----------
    x, y : vector-like or str
        Horizontal and vertical coordinates of the points. Strings are
        treated as column names of the global ``tsneDF`` frame.
    label : vector-like or str
        Values used to colour the points (passed to seaborn as ``hue``).
    clust_n : int
        Number of colours taken from the "hls" palette.
    title : str
        Title of the figure.
    """
    # The global frame is only needed to resolve column names. When vectors
    # are passed the function works on its own and does not require tsneDF
    # to exist.
    uses_column_names = any(isinstance(arg, str) for arg in (x, y, label))
    frame = tsneDF if uses_column_names else None

    fig, ax = plt.subplots(figsize=(16, 9))
    sns.scatterplot(
        x=x,
        y=y,
        hue=label,
        palette=sns.color_palette("hls", clust_n),
        data=frame,
        legend="full",
        alpha=0.75,
        s=40,
        ax=ax,  # draw explicitly on the axes created above
    )
    ax.set_title(title)
    plt.show()

In [None]:
%matplotlib qt

# Pick the (unscaled) representation selected through `origin`.
if origin == 'svdtf':
    principalComponents = TF_SVD

elif origin == 'svdtfidf':
    principalComponents = TFIDF_SVD

elif origin == 'doc2vec':
    principalComponents = Doc2Vect

elif origin == 'glove':
    principalComponents = GloVe

else:
    # Fail loudly instead of projecting whatever a previous run left behind.
    raise ValueError(
        "Unknown origin %r: expected 'svdtf', 'svdtfidf', 'doc2vec' or 'glove'"
        % origin
    )

# t-SNE projection to two dimensions. The 'PC1'/'PC2' column names are kept
# because the cells below refer to them.
# NOTE(review): newer scikit-learn releases appear to rename n_iter to
# max_iter -- confirm against the installed version.
tsne = TSNE(n_components=2, verbose=2, perplexity=100, n_iter=500)
tsne_results = tsne.fit_transform(principalComponents)
tsneDF = pd.DataFrame(data=tsne_results, columns=['PC1', 'PC2'])
tsneDF['Label'] = clustering.labels_
# Assign by position: a pandas Series would be aligned on its index, which
# yields NaN labels whenever y is not indexed 0..n-1.
tsneDF['TrueLabel'] = np.asarray(y)

# Plot the t-SNE embedding coloured by cluster label and by true label
plotPCA(tsneDF['PC1'], tsneDF['PC2'], tsneDF['Label'], clust_n, origin + ': Clustering Labels: ')
plotPCA(tsneDF['PC1'], tsneDF['PC2'], tsneDF['TrueLabel'], clust_n, origin + ': True Labels: ')

#### Extracting Purity values for each cluster

In [None]:
# Number of documents for every (cluster, true genre) pair.
ClusterGroups = (
    tsneDF
    .groupby(['Label', 'TrueLabel'])
    .PC1
    .count()
    .reset_index()
)
# Cluster sizes as a plain list; cluster labels are used as list positions.
sums = ClusterGroups.groupby('Label').sum().PC1.tolist()

# Share of each cluster taken by each genre, then rounded to two decimals.
PercInClust = [
    record['PC1'] / sums[record['Label']]
    for _, record in ClusterGroups.iterrows()
]
ClusterGroups['Purity'] = [round(share, 2) for share in PercInClust]

In [None]:
# For every cluster keep the row of the genre with the highest purity and
# give the columns report-friendly names.
mainGenre = (
    ClusterGroups
    .loc[ClusterGroups.groupby(['Label'])['Purity'].idxmax()]
    .rename(columns={'Label': 'Cluster', 'TrueLabel': 'MainGenre', 'PC1': 'Count'})
)
# Total number of documents in each cluster.
mainGenre['total'] = sums
mainGenre

#### Extracting full confusion matrix

In [None]:
# Cluster-by-genre table of document counts (rows: clusters, columns: genres).
Results = ClusterGroups.pivot_table(
    index='Label',
    columns='TrueLabel',
    values='PC1',
)
Results