In [1]:
%matplotlib inline 
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report


#K-Means import
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

from time import time
import numpy as np
import matplotlib.pyplot as plt


from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances


In [2]:
def bench_hierarchical(estimator, name, data, labels=None):
    """Fit a clustering estimator and print one tab-separated row of scores.

    The row layout is: name, fit time in seconds, homogeneity, completeness,
    V-measure, adjusted Rand index, adjusted mutual information, silhouette.

    Parameters
    ----------
    estimator : object
        Clustering estimator exposing ``fit(data)`` and, once fitted, a
        ``labels_`` attribute with the predicted cluster id of every sample.
    name : str
        Text printed in the first column (left-justified to 9 characters).
    data : array-like
        Samples handed to ``estimator.fit`` and to the silhouette score.
    labels : array-like, optional
        Ground-truth labels used by the five label-based scores. When omitted
        those five columns are printed as ``n/a`` instead of failing, and only
        the silhouette score (which needs no ground truth) is computed.

    Returns
    -------
    object
        The same ``estimator`` instance, now fitted.
    """
    t0 = time()
    estimator.fit(data)
    # Capture the elapsed time right after fit so metric computation is excluded.
    elapsed = time() - t0
    predicted = estimator.labels_

    if labels is None:
        # No ground truth available: the label-based scores are undefined.
        supervised_cells = ['n/a'] * 5
    else:
        supervised_scores = (
            metrics.homogeneity_score,
            metrics.completeness_score,
            metrics.v_measure_score,
            metrics.adjusted_rand_score,
            metrics.adjusted_mutual_info_score,
        )
        supervised_cells = ['%.3f' % score(labels, predicted)
                            for score in supervised_scores]

    # The silhouette is always measured with the euclidean metric, whatever
    # distance the estimator itself clustered with.
    silhouette = metrics.silhouette_score(data, predicted,
                                          metric='euclidean',
                                          sample_size=None)

    print('%-9s\t%.2fs\t%s\t%.3f'
          % (name, elapsed, '\t'.join(supervised_cells), silhouette))
    return estimator

# Agglomerative Fold 1

In [13]:
from sklearn.dummy import DummyClassifier  # not referenced in this cell

# Fold 1 splits: tab-separated files read through their `text` and `label` columns.
testdata = pd.read_csv("./test1.tsv", delimiter="\t")
X_test = testdata.text.values
Y_test = np.array(testdata.label.values).astype(np.int32)

traindata = pd.read_csv("./train1.tsv", delimiter="\t")
X_train = traindata.text.values
Y_train = np.array(traindata.label.values).astype(np.int32)

# Unigram + bigram counts re-weighted by TF-IDF; the vocabulary is fitted on
# the training texts only and then applied to both splits.
clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
])
clf.fit(X_train)

X_train = clf.transform(X_train).toarray()
X_test = clf.transform(X_test).toarray()

# Clustering is run on the held-out split; labels are shifted down by one
# (presumably the files use 1-based class ids -- confirm).
data = X_test
labels = np.array(Y_test) - 1

# Agglomerative clustering (complete linkage, 4 clusters), one run per distance.
# `rstate` is only echoed in the printed header; nothing in this cell is seeded with it.
for rstate in [16]:
    print("Random State", rstate)

    print(82 * '_')
    print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

    # (affinity passed to the estimator, row name printed by bench_hierarchical)
    affinity_runs = (
        ("cosine", "Cosine"),
        ("euclidean", "Euclidean"),
        ("cityblock", "city-block"),
    )
    # NOTE(review): newer scikit-learn releases replace `affinity=` with
    # `metric=` -- confirm against the installed version.
    # The km1_* names do not describe their contents: they hold the cosine,
    # euclidean and city-block models respectively.
    km1_km, km1_random, km1_pca = [
        bench_hierarchical(
            AgglomerativeClustering(n_clusters=4, linkage="complete",
                                    affinity=affinity),
            name=row_name, data=data, labels=labels)
        for affinity, row_name in affinity_runs
    ]

    print(82 * '_')

    # NOTE(review): the reports compare raw cluster ids with class ids and no
    # alignment step is applied here -- confirm this is intended.
    for heading, fitted in (("Cosine Affinity", km1_km),
                            ("Euclidean Affinity", km1_random),
                            ("City Block Affinity", km1_pca)):
        print(heading)
        ypred = fitted.labels_
        print(classification_report(np.array(Y_test) - 1, ypred, digits=5))

Random State 16
__________________________________________________________________________________
init		time	homo	compl	v-meas	ARI	AMI	silhouette
Cosine   	0.20s	0.041	0.052	0.046	0.093	0.020	-0.000
Euclidean	0.20s	0.034	0.033	0.033	0.013	0.011	0.002
city-block	0.20s	0.021	0.158	0.036	0.003	0.004	-0.023
__________________________________________________________________________________
Cosine Affinity
              precision    recall  f1-score   support

           0    0.26613   0.56897   0.36264        58
           1    0.28205   0.12222   0.17054        90
           2    0.12500   0.08333   0.10000        12
           3    0.23077   0.12500   0.16216        24

    accuracy                        0.26087       184
   macro avg    0.22599   0.22488   0.19884       184
weighted avg    0.26010   0.26087   0.22540       184

Euclidean Affinity
              precision    recall  f1-score   support

           0    0.31111   0.48276   0.37838        58
           1    0.37255   0.2111

# Agglomerative Fold 2

In [14]:
from sklearn.dummy import DummyClassifier  # not referenced in this cell

# Fold 2 splits: tab-separated files read through their `text` and `label` columns.
testdata = pd.read_csv("./test2.tsv", delimiter="\t")
X_test = testdata.text.values
Y_test = np.array(testdata.label.values).astype(np.int32)

traindata = pd.read_csv("./train2.tsv", delimiter="\t")
X_train = traindata.text.values
Y_train = np.array(traindata.label.values).astype(np.int32)

# Unigram + bigram counts re-weighted by TF-IDF; the vocabulary is fitted on
# the training texts only and then applied to both splits.
clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
])
clf.fit(X_train)

X_train = clf.transform(X_train).toarray()
X_test = clf.transform(X_test).toarray()

# Clustering is run on the held-out split; labels are shifted down by one
# (presumably the files use 1-based class ids -- confirm).
data = X_test
labels = np.array(Y_test) - 1

# Agglomerative clustering (complete linkage, 4 clusters), one run per distance.
# `rstate` is only echoed in the printed header; nothing in this cell is seeded with it.
for rstate in [16]:
    print("Random State", rstate)

    print(82 * '_')
    print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

    # (affinity passed to the estimator, row name printed by bench_hierarchical)
    affinity_runs = (
        ("cosine", "Cosine"),
        ("euclidean", "Euclidean"),
        ("cityblock", "city-block"),
    )
    # NOTE(review): newer scikit-learn releases replace `affinity=` with
    # `metric=` -- confirm against the installed version.
    # The km1_* names do not describe their contents: they hold the cosine,
    # euclidean and city-block models respectively.
    km1_km, km1_random, km1_pca = [
        bench_hierarchical(
            AgglomerativeClustering(n_clusters=4, linkage="complete",
                                    affinity=affinity),
            name=row_name, data=data, labels=labels)
        for affinity, row_name in affinity_runs
    ]

    print(82 * '_')

    # NOTE(review): the reports compare raw cluster ids with class ids and no
    # alignment step is applied here -- confirm this is intended.
    for heading, fitted in (("Cosine Affinity", km1_km),
                            ("Euclidean Affinity", km1_random),
                            ("City Block Affinity", km1_pca)):
        print(heading)
        ypred = fitted.labels_
        print(classification_report(np.array(Y_test) - 1, ypred, digits=5))

Random State 16
__________________________________________________________________________________
init		time	homo	compl	v-meas	ARI	AMI	silhouette
Cosine   	0.19s	0.019	0.025	0.021	0.008	-0.006	0.002
Euclidean	0.19s	0.035	0.037	0.036	0.015	0.013	0.004
city-block	0.19s	0.026	0.131	0.043	0.021	0.011	-0.023
__________________________________________________________________________________
Cosine Affinity
              precision    recall  f1-score   support

           0    0.29851   0.71429   0.42105        56
           1    0.30000   0.03333   0.06000        90
           2    0.00000   0.00000   0.00000        15
           3    0.13043   0.13043   0.13043        23

    accuracy                        0.25000       184
   macro avg    0.18224   0.21951   0.15287       184
weighted avg    0.25389   0.25000   0.17380       184

Euclidean Affinity
              precision    recall  f1-score   support

           0    0.25843   0.41071   0.31724        56
           1    0.44444   0.3111

# Agglomerative with Learnt BERT Representations

In [15]:
import pickle

# Precomputed fold-1 features and labels.
# Each file is opened in a `with` block so its handle is closed after loading.
# SECURITY: pickle.load can execute arbitrary code; only load files from a
# trusted source.
with open('./bert_features/fold1/xtrain.pkl', 'rb') as fh:
    X_train = pickle.load(fh)
with open('./bert_features/fold1/ytrain.pkl', 'rb') as fh:
    Y_train = pickle.load(fh)
with open('./bert_features/fold1/xtest.pkl', 'rb') as fh:
    X_test = pickle.load(fh)
with open('./bert_features/fold1/ytest.pkl', 'rb') as fh:
    Y_test = pickle.load(fh)

# Clustering is run on the held-out split; labels are used as stored (no shift).
data = X_test
labels = np.array(Y_test)

# Agglomerative clustering (complete linkage, 4 clusters), one run per distance.
# `rstate` is only echoed in the printed header; nothing in this cell is seeded with it.
for rstate in [16]:

    print("Random State", rstate)

    print(82 * '_')
    print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    # NOTE(review): newer scikit-learn releases replace `affinity=` with
    # `metric=` -- confirm against the installed version.
    # The km1_* names do not describe their contents: they hold the cosine,
    # euclidean and city-block models respectively.
    km1_km = bench_hierarchical(
        AgglomerativeClustering(n_clusters=4, linkage="complete", affinity="cosine"),
        name="Cosine", data=data, labels=labels)

    km1_random = bench_hierarchical(
        AgglomerativeClustering(n_clusters=4, linkage="complete", affinity="euclidean"),
        name="Euclidean", data=data, labels=labels)

    km1_pca = bench_hierarchical(
        AgglomerativeClustering(n_clusters=4, linkage="complete", affinity="cityblock"),
        name="city-block", data=data, labels=labels)

    print(82 * '_')

    # NOTE(review): the reports compare raw cluster ids with class ids and no
    # alignment step is applied here -- confirm this is intended.
    print("Cosine Affinity")
    ypred = km1_km.labels_
    print(classification_report(np.array(Y_test), ypred, digits=5))

    print("Euclidean Affinity")
    ypred = km1_random.labels_
    print(classification_report(np.array(Y_test), ypred, digits=5))

    print("City Block Affinity")
    ypred = km1_pca.labels_
    print(classification_report(np.array(Y_test), ypred, digits=5))

Random State 16
__________________________________________________________________________________
init		time	homo	compl	v-meas	ARI	AMI	silhouette
Cosine   	0.04s	0.014	0.021	0.017	0.034	-0.008	0.222
Euclidean	0.02s	0.019	0.032	0.024	0.013	-0.002	0.169
city-block	0.03s	0.033	0.038	0.035	0.032	0.014	0.182
__________________________________________________________________________________
Cosine Affinity
              precision    recall  f1-score   support

           0    0.28788   0.65517   0.40000        58
           1    0.33333   0.07778   0.12613        90
           2    0.10000   0.25000   0.14286        12
           3    0.00000   0.00000   0.00000        24

    accuracy                        0.26087       184
   macro avg    0.18030   0.24574   0.16725       184
weighted avg    0.26031   0.26087   0.19710       184

Euclidean Affinity
              precision    recall  f1-score   support

           0    0.31429   0.75862   0.44444        58
           1    0.51515   0.1888

In [59]:
import pickle

# Precomputed fold-2 features and labels.
# Each file is opened in a `with` block so its handle is closed after loading.
# SECURITY: pickle.load can execute arbitrary code; only load files from a
# trusted source.
with open('./bert_features/fold2/xtrain.pkl', 'rb') as fh:
    X_train = pickle.load(fh)
with open('./bert_features/fold2/ytrain.pkl', 'rb') as fh:
    Y_train = pickle.load(fh)
with open('./bert_features/fold2/xtest.pkl', 'rb') as fh:
    X_test = pickle.load(fh)
with open('./bert_features/fold2/ytest.pkl', 'rb') as fh:
    Y_test = pickle.load(fh)

# Clustering is run on the held-out split; labels are used as stored (no shift).
# The earlier assignment of the training split to `data`/`labels` was
# overwritten before any use, so it has been removed.
data = X_test
labels = np.array(Y_test)

# Agglomerative clustering (complete linkage, 4 clusters), one run per distance.
# `rstate` is only echoed in the printed header; nothing in this cell is seeded with it.
# NOTE(review): the output saved under this cell lists k-means rows
# (k-means++ / random / PCA-based, with an inertia column), which this code
# cannot print -- re-run the cell to refresh it.
for rstate in [16]:

    print("Random State", rstate)

    print(82 * '_')
    print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    # NOTE(review): newer scikit-learn releases replace `affinity=` with
    # `metric=` -- confirm against the installed version.
    # The km1_* names do not describe their contents: they hold the cosine,
    # euclidean and city-block models respectively.
    km1_km = bench_hierarchical(
        AgglomerativeClustering(n_clusters=4, linkage="complete", affinity="cosine"),
        name="Cosine", data=data, labels=labels)

    km1_random = bench_hierarchical(
        AgglomerativeClustering(n_clusters=4, linkage="complete", affinity="euclidean"),
        name="Euclidean", data=data, labels=labels)

    km1_pca = bench_hierarchical(
        AgglomerativeClustering(n_clusters=4, linkage="complete", affinity="cityblock"),
        name="city-block", data=data, labels=labels)

    print(82 * '_')

    # NOTE(review): the reports compare raw cluster ids with class ids and no
    # alignment step is applied here -- confirm this is intended.
    print("Cosine Affinity")
    ypred = km1_km.labels_
    print(classification_report(np.array(Y_test), ypred, digits=5))

    print("Euclidean Affinity")
    ypred = km1_random.labels_
    print(classification_report(np.array(Y_test), ypred, digits=5))

    print("City Block Affinity")
    ypred = km1_pca.labels_
    print(classification_report(np.array(Y_test), ypred, digits=5))

Random State 1
__________________________________________________________________________________
init		time	inertia	homo	compl	v-meas	ARI	AMI	silhouette
k-means++	0.35s	3281	0.023	0.024	0.023	0.030	0.008	0.247
random   	0.37s	3293	0.034	0.035	0.035	0.035	0.020	0.232
PCA-based	0.05s	3293	0.022	0.022	0.022	0.023	0.008	0.241
__________________________________________________________________________________
kmeans++ Initialization
              precision    recall  f1-score   support

           0    0.29630   0.42857   0.35036        56
           1    0.50000   0.38889   0.43750        90
           2    0.16667   0.33333   0.22222        15
           3    0.33333   0.04348   0.07692        23

    accuracy                        0.35326       184
   macro avg    0.32407   0.29857   0.27175       184
weighted avg    0.39000   0.35326   0.34836       184

Random Initialization
              precision    recall  f1-score   support

           0    0.35897   0.25000   0.29474        56
  