In [1]:
%matplotlib inline 
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report


#K-Means import
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

from time import time
import numpy as np
import matplotlib.pyplot as plt


from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances


In [2]:
def bench_hierarchical(estimator, name, data,labels=None):
    t0 = time()
    estimator.fit(data)
    print('%-9s\t%.2fs\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, (time() - t0), 
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=None)))
    return estimator

# Aggolomerative Fold 1

In [3]:
from sklearn.dummy import DummyClassifier
traindata=pd.read_csv("../data/tsv/test1.tsv",delimiter="\t")
X_test=traindata.text.values
Y_test=np.array(traindata.label.values).astype(np.int32)

traindata=pd.read_csv("../data/tsv/train1.tsv",delimiter="\t")
X_train=traindata.text.values
Y_train=np.array(traindata.label.values).astype(np.int32)
clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('tfidf', TfidfTransformer(use_idf=True))])
clf.fit(X_train)


X_train=(clf.transform(X_train)).toarray()
X_test=clf.transform(X_test).toarray()

data=X_test
labels=np.array(Y_test)-1
# print(labels)

# Do K-Means
for rstate in [16]:
    
    print("Random State", rstate)
    
    print(82 * '_')
    print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    km1_km=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="cosine"), name="Cosine", data=data,labels=labels)

    km1_random=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="euclidean"), name="Euclidean", data=data,labels=labels)
    
    km1_pca=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="cityblock"), name="city-block", data=data,labels=labels)


    
    print(82 * '_')


    print("Cosine Affinity")
    ypred=km1_km.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))

    print("Euclidean Affinity")
    ypred=km1_random.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))


    print("City Block Affinity")
    ypred=km1_pca.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))

Random State 16
__________________________________________________________________________________
init		time	homo	compl	v-meas	ARI	AMI	silhouette
Cosine   	0.21s	0.041	0.052	0.046	0.093	0.020	-0.000
Euclidean	0.19s	0.034	0.033	0.033	0.013	0.011	0.002
city-block	0.19s	0.021	0.158	0.036	0.003	0.004	-0.023
__________________________________________________________________________________
Cosine Affinity
              precision    recall  f1-score   support

           0    0.26613   0.56897   0.36264        58
           1    0.28205   0.12222   0.17054        90
           2    0.12500   0.08333   0.10000        12
           3    0.23077   0.12500   0.16216        24

    accuracy                        0.26087       184
   macro avg    0.22599   0.22488   0.19884       184
weighted avg    0.26010   0.26087   0.22540       184

Euclidean Affinity
              precision    recall  f1-score   support

           0    0.31111   0.48276   0.37838        58
           1    0.37255   0.2111

# Aggolomerative Fold 2

In [4]:
from sklearn.dummy import DummyClassifier
traindata=pd.read_csv("../data/tsv/test2.tsv",delimiter="\t")
X_test=traindata.text.values
Y_test=np.array(traindata.label.values).astype(np.int32)

traindata=pd.read_csv("../data/tsv/test2.tsv",delimiter="\t")
X_train=traindata.text.values
Y_train=np.array(traindata.label.values).astype(np.int32)
clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('tfidf', TfidfTransformer(use_idf=True))])
clf.fit(X_train)


X_train=(clf.transform(X_train)).toarray()
X_test=clf.transform(X_test).toarray()

data=X_test
labels=np.array(Y_test)-1
# print(labels)

# Do K-Means
for rstate in [16]:
    
    print("Random State", rstate)
    
    print(82 * '_')
    print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    km1_km=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="cosine"), name="Cosine", data=data,labels=labels)

    km1_random=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="euclidean"), name="Euclidean", data=data,labels=labels)
    
    km1_pca=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="cityblock"), name="city-block", data=data,labels=labels)


    
    print(82 * '_')


    print("Cosine Affinity")
    ypred=km1_km.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))

    print("Euclidean Affinity")
    ypred=km1_random.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))


    print("City Block Affinity")
    ypred=km1_pca.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))

Random State 16
__________________________________________________________________________________
init		time	homo	compl	v-meas	ARI	AMI	silhouette
Cosine   	0.15s	0.049	0.059	0.053	0.046	0.029	0.000
Euclidean	0.15s	0.032	0.028	0.030	-0.000	0.009	0.002
city-block	0.14s	0.013	0.122	0.024	-0.006	-0.008	-0.012
__________________________________________________________________________________
Cosine Affinity
              precision    recall  f1-score   support

           0    0.33333   0.71429   0.45455        56
           1    0.54286   0.21111   0.30400        90
           2    0.50000   0.20000   0.28571        15
           3    0.08696   0.08696   0.08696        23

    accuracy                        0.34783       184
   macro avg    0.36579   0.30309   0.28280       184
weighted avg    0.41861   0.34783   0.32120       184

Euclidean Affinity
              precision    recall  f1-score   support

           0    0.24324   0.32143   0.27692        56
           1    0.46667   0.23

# Aggolomerative with Learnt BERT Representations

In [5]:
import pickle

modelfolder="../../RQ2.2/data/bert/fold1/"

X_train= pickle.load(open(modelfolder+'xtrain.pkl', 'rb'))
Y_train= pickle.load(open(modelfolder+'ytrain.pkl', 'rb'))
X_test= pickle.load(open(modelfolder+'xtest.pkl', 'rb'))
Y_test= pickle.load(open(modelfolder+'ytest.pkl', 'rb'))






data=X_test
labels=np.array(Y_test)
# print(labels)

# Do K-Means
for rstate in [16]:
    
    print("Random State", rstate)
    
    print(82 * '_')
    print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    km1_km=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="cosine"), name="Cosine", data=data,labels=labels)

    km1_random=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="euclidean"), name="Euclidean", data=data,labels=labels)
    
    km1_pca=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="cityblock"), name="city-block", data=data,labels=labels)


    
    print(82 * '_')


    print("Cosine Affinity")
    ypred=km1_km.labels_
    print(classification_report(np.array(Y_test),ypred,digits=5))

    print("Euclidean Affinity")
    ypred=km1_random.labels_
    print(classification_report(np.array(Y_test),ypred,digits=5))


    print("City Block Affinity")
    ypred=km1_pca.labels_
    print(classification_report(np.array(Y_test),ypred,digits=5))

Random State 16
__________________________________________________________________________________
init		time	homo	compl	v-meas	ARI	AMI	silhouette
Cosine   	0.04s	0.014	0.021	0.017	0.034	-0.008	0.222
Euclidean	0.02s	0.019	0.032	0.024	0.013	-0.002	0.169
city-block	0.03s	0.033	0.038	0.035	0.032	0.014	0.182
__________________________________________________________________________________
Cosine Affinity
              precision    recall  f1-score   support

           0    0.28788   0.65517   0.40000        58
           1    0.33333   0.07778   0.12613        90
           2    0.10000   0.25000   0.14286        12
           3    0.00000   0.00000   0.00000        24

    accuracy                        0.26087       184
   macro avg    0.18030   0.24574   0.16725       184
weighted avg    0.26031   0.26087   0.19710       184

Euclidean Affinity
              precision    recall  f1-score   support

           0    0.31429   0.75862   0.44444        58
           1    0.51515   0.1888

In [6]:
import pickle

modelfolder="../../RQ2.2/data/bert/fold1/"

X_train= pickle.load(open(modelfolder+'xtrain.pkl', 'rb'))
Y_train= pickle.load(open(modelfolder+'ytrain.pkl', 'rb'))
X_test= pickle.load(open(modelfolder+'xtest.pkl', 'rb'))
Y_test= pickle.load(open(modelfolder+'ytest.pkl', 'rb'))

data=X_train
labels=np.array(Y_train)
# print(labels)

# Do K-Means
data=X_test
labels=np.array(Y_test)
# print(labels)

# Do K-Means
for rstate in [16]:
    
    print("Random State", rstate)
    
    print(82 * '_')
    print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    km1_km=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="cosine"), name="Cosine", data=data,labels=labels)

    km1_random=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="euclidean"), name="Euclidean", data=data,labels=labels)
    
    km1_pca=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="cityblock"), name="city-block", data=data,labels=labels)


    
    print(82 * '_')


    print("Cosine Affinity")
    ypred=km1_km.labels_
    print(classification_report(np.array(Y_test),ypred,digits=5))

    print("Euclidean Affinity")
    ypred=km1_random.labels_
    print(classification_report(np.array(Y_test),ypred,digits=5))


    print("City Block Affinity")
    ypred=km1_pca.labels_
    print(classification_report(np.array(Y_test),ypred,digits=5))

Random State 16
__________________________________________________________________________________
init		time	homo	compl	v-meas	ARI	AMI	silhouette
Cosine   	0.03s	0.016	0.061	0.025	0.003	-0.008	0.204
Euclidean	0.03s	0.022	0.033	0.027	0.011	0.002	0.223
city-block	0.03s	0.016	0.018	0.017	-0.012	-0.007	0.193
__________________________________________________________________________________
Cosine Affinity
              precision    recall  f1-score   support

           0    0.30409   0.92857   0.45815        56
           1    0.33333   0.01111   0.02151        90
           2    0.00000   0.00000   0.00000        15
           3    0.11111   0.04348   0.06250        23

    accuracy                        0.29348       184
   macro avg    0.18713   0.24579   0.13554       184
weighted avg    0.26948   0.29348   0.15777       184

Euclidean Affinity
              precision    recall  f1-score   support

           0    0.29358   0.57143   0.38788        56
           1    0.50000   0.022