In [1]:
%matplotlib inline 
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report


#K-Means import
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

from time import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import LatentDirichletAllocation


from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from sklearn.cluster import SpectralClustering

In [2]:
def bench_hierarchical(estimator, name, data,labels=None):
    t0 = time()
    estimator.fit(data)
    print('%-9s\t%.2fs\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, (time() - t0), 
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=None)))
    return estimator

# Hierarchical Fold 1

In [3]:
from sklearn.dummy import DummyClassifier
traindata=pd.read_csv("./test1.tsv",delimiter="\t")
X_test=traindata.text.values
Y_test=np.array(traindata.label.values).astype(np.int32)

traindata=pd.read_csv("./train1.tsv",delimiter="\t")
X_train=traindata.text.values
Y_train=np.array(traindata.label.values).astype(np.int32)
clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2)))])
clf.fit(X_train)

X_train=(clf.transform(X_train)).toarray()
X_test=clf.transform(X_test)

lda = LatentDirichletAllocation(n_components=4,random_state=42)
lda.fit(X_train)

data=lda.transform(X_train)
labels=np.array(Y_train)-1
# print(labels)
X_test=lda.transform(X_test)
# Do K-Means
data=X_test
labels=np.array(Y_test)-1
# print(labels)

# Do K-Means
for rstate in range(0,50):
    
    print("Random State", rstate)
    
    print(82 * '_')
    print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    km1_km=bench_hierarchical(SpectralClustering(n_clusters=4,random_state=rstate,assign_labels="kmeans"), name="K-Means", data=data,labels=labels)

    km1_random=bench_hierarchical(SpectralClustering(n_clusters=4,random_state=rstate,assign_labels="discretize"), name="Discritize", data=data,labels=labels)
    
  
    print(82 * '_')


    print("Cosine Affinity")
    ypred=km1_km.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))

    print("Euclidean Affinity")
    ypred=km1_random.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))


Random State 1
__________________________________________________________________________________
init		time	homo	compl	v-meas	ARI	AMI	silhouette
K-Means  	0.09s	0.015	0.018	0.016	0.006	-0.009	0.473
Discritize	0.03s	0.015	0.014	0.014	-0.012	-0.009	0.460
__________________________________________________________________________________
Cosine Affinity
              precision    recall  f1-score   support

           0    0.28571   0.20690   0.24000        58
           1    0.50000   0.66667   0.57143        90
           2    0.00000   0.00000   0.00000        12
           3    0.16667   0.12500   0.14286        24

    accuracy                        0.40761       184
   macro avg    0.23810   0.24964   0.23857       184
weighted avg    0.35637   0.40761   0.37379       184

Euclidean Affinity
              precision    recall  f1-score   support

           0    0.23529   0.06897   0.10667        58
           1    0.50000   0.14444   0.22414        90
           2    0.09375   0.75

# Hierarchical Fold 2

In [4]:
from sklearn.dummy import DummyClassifier
traindata=pd.read_csv("./test2.tsv",delimiter="\t")
X_test=traindata.text.values
Y_test=np.array(traindata.label.values).astype(np.int32)

traindata=pd.read_csv("./train2.tsv",delimiter="\t")
X_train=traindata.text.values
Y_train=np.array(traindata.label.values).astype(np.int32)
clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2)))])
clf.fit(X_train)

X_train=(clf.transform(X_train)).toarray()
X_test=clf.transform(X_test)

lda = LatentDirichletAllocation(n_components=4,random_state=42)
lda.fit(X_train)

data=lda.transform(X_train)
labels=np.array(Y_train)-1
# print(labels)
X_test=lda.transform(X_test)
# print(labels)

data=X_test
labels=np.array(Y_test)-1
# print(labels)

# Do K-Means
for rstate in range(0,50):
    
    print("Random State", rstate)
    
    print(82 * '_')
    print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    km1_km=bench_hierarchical(SpectralClustering(n_clusters=4,random_state=rstate,assign_labels="kmeans"), name="K-Means", data=data,labels=labels)

    km1_random=bench_hierarchical(SpectralClustering(n_clusters=4,random_state=rstate,assign_labels="discretize"), name="Discritize", data=data,labels=labels)
    
  
    print(82 * '_')


    print("Cosine Affinity")
    ypred=km1_km.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))

    print("Euclidean Affinity")
    ypred=km1_random.labels_
    print(classification_report(np.array(Y_test)-1,ypred,digits=5))


Random State 1
__________________________________________________________________________________
init		time	homo	compl	v-meas	ARI	AMI	silhouette
K-Means  	0.06s	0.052	0.048	0.050	0.005	0.029	0.410
Discritize	0.03s	0.041	0.035	0.038	0.011	0.018	0.419
__________________________________________________________________________________
Cosine Affinity
              precision    recall  f1-score   support

           0    0.05882   0.01786   0.02740        56
           1    0.50000   0.38889   0.43750        90
           2    0.01667   0.06667   0.02667        15
           3    0.02703   0.04348   0.03333        23

    accuracy                        0.20652       184
   macro avg    0.15063   0.12922   0.13122       184
weighted avg    0.26721   0.20652   0.22867       184

Euclidean Affinity
              precision    recall  f1-score   support

           0    0.33333   0.26786   0.29703        56
           1    0.42105   0.26667   0.32653        90
           2    0.11111   0.20000