<a href="https://colab.research.google.com/github/m-and-ms/Question-Clustering-/blob/master/glove_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [0]:
# Parse the WebQSP training file (expected in the working directory) into a DataFrame.
Q_df = pd.read_json('WebQSP.train.json')


In [0]:
# Preview only the first rows; rendering the whole frame bloats the notebook output.
Q_df.head()

In [0]:
# List the column labels of the loaded frame.
Q_df.columns

In [0]:
# Peek at the processed text of the question stored under row label 1.
Q_df.loc[1, 'Questions']['ProcessedQuestion']

In [0]:
# Number of rows in the frame, i.e. number of entries in the 'Questions' column.
num_questions = Q_df.shape[0]

In [0]:
# Collect the 'ProcessedQuestion' string of every question, in row order.
question_column = Q_df['Questions']
data_sentences = []
for idx in range(num_questions):
    data_sentences.append(question_column[idx]['ProcessedQuestion'])

In [0]:
# Sanity check: how many question strings were collected.
len(data_sentences)

In [0]:

# Explicit imports replace `from nltk import *`: later cells use the names
# `nltk` and `re`, and a star import hides where they come from while also
# pulling many unrelated names into the notebook namespace.
import re

import nltk
from nltk.corpus import stopwords

# Data packages needed further down: the stop-word list, and the punkt
# tokenizer models used by nltk.word_tokenize. Newer NLTK releases look for
# 'punkt_tab' instead of 'punkt'; requesting a package the installed version
# does not need is harmless (nltk.download reports it and returns False).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Porter stemmer instance; currently unused because the stemming line in
# tokenize_and_stem is commented out.
stemmer = nltk.stem.porter.PorterStemmer()

In [0]:
# English stop words as a set, for fast membership tests during token filtering.
stop_words = set(stopwords.words('english')) 


In [0]:
def tokenize_and_stem(text):
    """Tokenize ``text`` and return its lower-cased content words.

    Despite the name, no stemming is applied (the stemming step is disabled);
    the name is kept so existing callers continue to work.

    A token is kept when it
      * contains at least one ASCII letter (drops numbers / bare punctuation),
      * is at least 3 characters long, and
      * is not an English stop word (compared case-insensitively).

    Args:
        text: The sentence to tokenize.

    Returns:
        list[str]: The surviving tokens, lower-cased, in their original order.
        May be empty when every token is filtered out.
    """
    # Imported locally so the function does not depend on a star import
    # elsewhere in the notebook to provide `re`.
    import re

    words = []
    for token in nltk.word_tokenize(text):
        if len(token) < 3 or not re.search('[a-zA-Z]', token):
            continue
        # Lower-case *before* the stop-word test: the stop-word list is lower
        # case, so capitalised stop words ("What", "The") would otherwise pass.
        word = token.lower()
        if word not in stop_words:
            words.append(word)
    return words

In [0]:
# Tokenize every question; one token list per sentence, in input order.
data_tokens = [tokenize_and_stem(sent) for sent in data_sentences]

In [0]:
# Show only the first few token lists; printing all of them floods the output.
data_tokens[:5]

In [0]:
def read_glove(file_name):
    """Load word vectors from a whitespace-separated GloVe-style text file.

    Each non-empty line is expected to be ``<word> <v1> <v2> ... <vn>``.

    Args:
        file_name: Path of the text file to read.

    Returns:
        tuple: ``(word_vocab, word2vector)`` where ``word_vocab`` is the set
        of words found and ``word2vector`` maps each word to a 1-D float
        ``numpy`` array holding its vector.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    word2vector = {}
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding (the vocabulary contains non-ASCII words).
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank (e.g. trailing) line: nothing to parse.
                continue
            word2vector[fields[0]] = np.array(fields[1:], dtype=float)
    word_vocab = set(word2vector)
    print("Total Words in DataSet:", len(word_vocab))
    return word_vocab, word2vector

In [0]:
# Load the embedding file; `vocab` is the set of words, `w2v` maps word -> vector.
vocab, w2v = read_glove("glove.6B.300d.txt")


In [0]:
# Sanity check: shape of one loaded vector.
print (w2v['animal'].shape)

In [0]:
def word_average(sentence, embed, vocab, dim=None):
    """Represent a tokenized sentence as the mean of its word vectors.

    Words found in ``vocab`` contribute ``embed[word]``; every other word
    contributes a fresh uniform-random vector from ``np.random.rand``.

    Args:
        sentence: Iterable of word tokens.
        embed: Mapping from word to its 1-D embedding vector.
        vocab: Container of the words that have an entry in ``embed``.
        dim: Embedding dimensionality. When ``None`` it is taken from the
            first vector in ``embed`` (300 if ``embed`` is empty).

    Returns:
        numpy.ndarray: Vector of shape ``(dim,)``. An empty sentence yields
        the zero vector, so every sentence maps to a vector of the same
        length and the results can be stacked into one feature matrix.
    """
    if dim is None:
        first_vector = next(iter(embed.values()), None)
        dim = 300 if first_vector is None else len(first_vector)

    # Accumulate into a fresh array so the stored embeddings are never mutated.
    total = np.zeros(dim)
    nwords = 0
    for word in sentence:
        if word in vocab:
            total += embed[word]
        else:
            total += np.random.rand(dim)
        nwords += 1

    if nwords == 0:
        # Nothing to average (all tokens were filtered out upstream); avoid
        # dividing by zero and returning a zero-length array.
        return total
    return total / nwords

In [0]:
# One averaged embedding per tokenized question, in the same order as data_tokens.
sent_represent = [word_average(sents, w2v, vocab) for sents in data_tokens]


In [0]:
from sklearn.cluster import AgglomerativeClustering


In [0]:
# Two-cluster Ward agglomerative model. The `affinity` keyword was dropped:
# it was deprecated and later removed from scikit-learn (renamed `metric`),
# and Ward linkage uses Euclidean distance anyway, so omitting it keeps the
# same behaviour on both old and new releases.
cluster = AgglomerativeClustering(n_clusters=2, linkage='ward')


In [0]:
# Fit the agglomerative model on the sentence vectors.
cluster.fit(sent_represent)

In [0]:
# Cluster assignment of each sentence from the fitted agglomerative model.
labels = cluster.labels_



In [0]:
# Inspect the agglomerative cluster assignments.
print(labels)

In [0]:
# Calls fit_predict on the same data again; the returned value is only
# displayed as the cell output and is not stored anywhere.
cluster.fit_predict(sent_represent)

In [0]:
import matplotlib.pyplot as plt
 
from sklearn.manifold import TSNE

In [0]:
# 2-D t-SNE projection with a fixed seed. scikit-learn renamed the iteration
# budget from `n_iter` to `max_iter`; try the new keyword first and fall back
# to the old one so the cell runs on both old and new releases.
_tsne_options = dict(n_components=2, random_state=1, metric='euclidean')
try:
    model = TSNE(max_iter=15000, **_tsne_options)
except TypeError:
    # Older scikit-learn: `max_iter` is not a known argument.
    model = TSNE(n_iter=15000, **_tsne_options)


In [0]:
# Project the sentence vectors with the t-SNE model configured above
# (n_components=2, so two coordinates per sentence).
Y=model.fit_transform(sent_represent)


In [0]:
# Scatter of the t-SNE coordinates, coloured by agglomerative cluster label.
plt.scatter(Y[:, 0], Y[:, 1], c=labels)

In [0]:
import scipy.cluster.hierarchy as shc

# Draw the Ward-linkage hierarchy over all sentence vectors as a dendrogram.
plt.figure(figsize=(10, 7))
plt.title("Data Dendograms")
sentence_matrix = np.asarray(sent_represent)
ward_links = shc.linkage(sentence_matrix, method='ward')
dend = shc.dendrogram(ward_links)

In [0]:
from sklearn.cluster import KMeans

In [0]:
# Two-cluster k-means with a fixed seed. `n_init` is pinned to 10 (the
# long-standing default) because newer scikit-learn releases changed the
# default, which would otherwise change results between versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
# fit_transform fits the model and returns one column per cluster.
km_tf = kmeans.fit_transform(sent_represent)
kmeans.labels_


In [0]:
# Shape of the fitted k-means centroid array.
kmeans.cluster_centers_.shape


In [0]:
# Number of sentences assigned k-means label 1.
type1_occurrences = np.count_nonzero(kmeans.labels_ == 1)

In [0]:
# Number of sentences assigned k-means label 0.
type2_occurences=np.count_nonzero(kmeans.labels_ == 0)

In [0]:
# Total number of labelled sentences (denominator for the percentages below).
total_occurances=len(kmeans.labels_)

In [0]:
# Share of each k-means cluster as a percentage of all sentences.
print("Type1 percentage",(type1_occurrences/total_occurances)*100 ,"Type2 percentage ",(type2_occurences/total_occurances)*100 )

In [0]:
# Absolute size of each k-means cluster.
print("Type1 count",(type1_occurrences) ,"Type2 count ",(type2_occurences))

In [0]:
# Shape of the array returned by kmeans.fit_transform.
km_tf.shape

In [0]:
# Plot the two columns of the k-means fit_transform output against each other,
# coloured by k-means label.
# NOTE(review): per scikit-learn these columns are distances to each centroid,
# not spatial coordinates - confirm this is the intended view.
plt.scatter(km_tf[:,0],km_tf[:,1],c=kmeans.labels_,marker='X')

In [0]:
from sklearn.cluster import KMeans 
from sklearn import metrics 
from scipy.spatial.distance import cdist 

In [0]:
# Elbow-method sweep: for each candidate k record
#   * distortion - mean Euclidean distance from each sentence vector to its
#     nearest centroid, and
#   * inertia    - the fitted model's inertia_ attribute.
distortions = []
inertias = []
mapping1 = {}   # k -> distortion
mapping2 = {}   # k -> inertia
K = range(1, 10)

# Convert once instead of on every iteration.
sent_matrix = np.asarray(sent_represent)

for k in K:
    # Fit once per k (the model was previously fitted twice in a row, doubling
    # the run time for no benefit). Seed and n_init are fixed so the curve is
    # reproducible across runs and scikit-learn versions.
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=0).fit(sent_matrix)

    # Distance from every point to its closest centroid, computed a single
    # time and reused for both the list and the mapping.
    nearest = np.min(cdist(sent_matrix, kmeanModel.cluster_centers_, 'euclidean'), axis=1)
    distortion = sum(nearest) / sent_matrix.shape[0]

    distortions.append(distortion)
    inertias.append(kmeanModel.inertia_)
    mapping1[k] = distortion
    mapping2[k] = kmeanModel.inertia_

In [0]:
# Elbow curve: distortion as a function of the number of clusters.
fig, ax = plt.subplots()
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method using Distortion')
plt.show()

In [0]:
from sklearn import decomposition
import matplotlib

# Project the sentence vectors onto their first two principal components.
pca = decomposition.PCA(n_components=2).fit(sent_represent)
colors = ['blue','purple']            # centroid colours

colors_cluster=['red','green','yellow']  # point colours, indexed by k-means label
label_colors=[colors_cluster[i] for i in kmeans.labels_]
coords=pca.transform(sent_represent)

plt.scatter(coords[:,0],coords[:,1],c=label_colors)

# Overlay the k-means centroids, projected with the same PCA.
centroids=kmeans.cluster_centers_
centroidcoords=pca.transform(centroids)
# A colormap only takes effect when `c` supplies values to map; without `c`
# matplotlib ignores `cmap` and draws every centroid in the default colour.
# Passing the centroid index as `c` gives each centroid its colour.
plt.scatter(centroidcoords[:,0],centroidcoords[:,1],marker='X',
            c=np.arange(len(centroids)),
            cmap=matplotlib.colors.ListedColormap(colors))
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.title('K-means clusters in PCA space')
plt.show()