In [1]:
%matplotlib inline 
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report


#K-Means import
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

from time import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import LatentDirichletAllocation


from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances


In [2]:
def bench_hierarchical(estimator, name, data,labels=None):
    t0 = time()
    estimator.fit(data)
    print('%-9s\t%.2fs\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, (time() - t0), 
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=None)))
    return estimator

# Hierarchical Fold 1

In [8]:
from sklearn.dummy import DummyClassifier
traindata=pd.read_csv("../data/tsv/test1.tsv",delimiter="\t")
X_test=traindata.text.values
Y_test=np.array(traindata.label.values).astype(np.int32)

traindata=pd.read_csv("../data/tsv/train1.tsv",delimiter="\t")
X_train=traindata.text.values
Y_train=np.array(traindata.label.values).astype(np.int32)
clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2)))])
clf.fit(X_train)

X_train=(clf.transform(X_train)).toarray()
X_test=clf.transform(X_test)

lda = LatentDirichletAllocation(n_components=4,random_state=42)
lda.fit(X_train)

data=lda.transform(X_train)
labels=np.array(Y_train)-1
# print(labels)
X_test=lda.transform(X_test)
# Do K-Means
for rstate in [37]:
    print("Random State", rstate)
    
    print(82 * '_')
    print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    km1_km=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="cosine"), name="Cosine", data=X_test,labels=Y_test)

    km1_random=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="euclidean"), name="Euclidean", data=X_test,labels=Y_test)
    
    km1_pca=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="cityblock"), name="city-block", data=X_test,labels=Y_test)


    
    print(82 * '_')


    print("Cosine Affinity")
    ypred=km1_km.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))

    print("Euclidean Affinity")
    ypred=km1_random.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))


    print("City Block Affinity")
    ypred=km1_pca.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))

Random State 37
__________________________________________________________________________________
init		time	homo	compl	v-meas	ARI	AMI	silhouette
Cosine   	0.00s	0.042	0.073	0.054	-0.012	0.024	0.477
Euclidean	0.00s	0.040	0.072	0.052	-0.013	0.022	0.482
city-block	0.02s	0.026	0.047	0.034	-0.013	0.004	0.478
__________________________________________________________________________________
Cosine Affinity
              precision    recall  f1-score   support

           0    0.33333   0.86207   0.48077        58
           1    0.28571   0.02222   0.04124        90
           2    0.00000   0.00000   0.00000        12
           3    0.20000   0.12500   0.15385        24

    accuracy                        0.29891       184
   macro avg    0.20476   0.25232   0.16896       184
weighted avg    0.27091   0.29891   0.19178       184

Euclidean Affinity
              precision    recall  f1-score   support

           0    0.33775   0.87931   0.48804        58
           1    0.75000   0.100

# Hierarchical Fold 2

In [9]:
from sklearn.dummy import DummyClassifier
traindata=pd.read_csv("../data/tsv/test2.tsv",delimiter="\t")
X_test=traindata.text.values
Y_test=np.array(traindata.label.values).astype(np.int32)

traindata=pd.read_csv("../data/tsv/train2.tsv",delimiter="\t")
X_train=traindata.text.values
Y_train=np.array(traindata.label.values).astype(np.int32)
clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2)))])
clf.fit(X_train)

X_train=(clf.transform(X_train)).toarray()
X_test=clf.transform(X_test)

lda = LatentDirichletAllocation(n_components=4,random_state=42)
lda.fit(X_train)

data=lda.transform(X_train)
labels=np.array(Y_train)-1
# print(labels)
X_test=lda.transform(X_test)
# print(labels)

for rstate in [0]:
    
    print("Random State", rstate)
    
    print(82 * '_')
    print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    km1_km=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="cosine"), name="Cosine", data=X_test,labels=Y_test)

    km1_random=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="euclidean"), name="Euclidean", data=X_test,labels=Y_test)
    
    km1_pca=bench_hierarchical(AgglomerativeClustering(n_clusters=4,
                                    linkage="complete", affinity="cityblock"), name="city-block", data=X_test,labels=Y_test)


    
    print(82 * '_')


    print("Cosine Affinity")
    ypred=km1_km.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))

    print("Euclidean Affinity")
    ypred=km1_random.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))


    print("City Block Affinity")
    ypred=km1_pca.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))

Random State 0
__________________________________________________________________________________
init		time	homo	compl	v-meas	ARI	AMI	silhouette
Cosine   	0.00s	0.038	0.038	0.038	0.006	0.016	0.390
Euclidean	0.00s	0.045	0.042	0.043	0.017	0.022	0.346
city-block	0.00s	0.031	0.028	0.029	-0.008	0.009	0.353
__________________________________________________________________________________
Cosine Affinity
              precision    recall  f1-score   support

           0    0.26437   0.41071   0.32168        56
           1    0.52381   0.12222   0.19820        90
           2    0.01613   0.06667   0.02597        15
           3    0.14286   0.08696   0.10811        23

    accuracy                        0.20109       184
   macro avg    0.23679   0.17164   0.16349       184
weighted avg    0.35584   0.20109   0.21048       184

Euclidean Affinity
              precision    recall  f1-score   support

           0    0.24324   0.32143   0.27692        56
           1    0.47170   0.27778 