# Import Libraries

In [None]:
%matplotlib inline 
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report


#K-Means import
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

from time import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import LatentDirichletAllocation


from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from sklearn.cluster import SpectralClustering

# Benchmarking Spectral Clustering

In [None]:
def bench_spectral(estimator, name, data,labels=None):
    """Fit a clustering estimator and print one tab-separated row of scores.

    The printed row contains, in order: ``name``, the wall-clock time spent
    in ``estimator.fit(data)``, homogeneity, completeness, V-measure,
    adjusted Rand index, adjusted mutual information and the silhouette
    coefficient (euclidean metric, computed on all samples).

    Parameters
    ----------
    estimator : object
        Clustering estimator exposing ``fit(data)`` and, after fitting, a
        ``labels_`` attribute.
    name : str
        Label printed in the first column of the row.
    data : array-like
        Samples passed to ``estimator.fit`` and to the silhouette score.
    labels : array-like or None, optional
        Ground-truth labels used by the five supervised scores. When
        ``None`` those five columns are printed as ``nan`` and only the
        silhouette coefficient (which needs no ground truth) is computed.

    Returns
    -------
    estimator
        The same estimator instance, now fitted.
    """
    t0 = time()
    estimator.fit(data)
    # Stop the clock before scoring so only the fit itself is timed.
    elapsed = time() - t0
    predicted = estimator.labels_

    supervised_scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    if labels is None:
        # Without ground truth the supervised scores are undefined; report
        # nan instead of failing after the (possibly expensive) fit.
        supervised = (float('nan'),) * len(supervised_scorers)
    else:
        supervised = tuple(score(labels, predicted)
                           for score in supervised_scorers)

    silhouette = metrics.silhouette_score(data, predicted,
                                          metric='euclidean',
                                          sample_size=None)

    print('%-9s\t%.2fs\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % ((name, elapsed) + supervised + (silhouette,)))
    return estimator

# Spectral Fold 1

* This code runs spectral clustering on LDA topic features built from bag-of-words (unigram and bigram) counts, for phrase-rubric classification of keyphrases
    
* The training and testing data for each of the phases are separated.  

* The input files can be changed by editing the paths passed to ```pd.read_csv```: ```"../data/tsv/test1.tsv"``` (test split) and ```"../data/tsv/train1.tsv"``` (training split)

* Since randomness impacts results, the random state is taken from the list in ```for rstate in [7]:```; add more values to that list to search over several seeds


* Latent Dirichlet allocation is done using ```lda = LatentDirichletAllocation(n_components=4,random_state=42)``` followed by ```lda.fit(X_train)```

In [None]:
from sklearn.dummy import DummyClassifier

# --- Load fold 1 -----------------------------------------------------------
# Each TSV provides a `text` column (raw phrases) and a `label` column
# (integer class ids).
testdata=pd.read_csv("../data/tsv/test1.tsv",delimiter="\t")
X_test=testdata.text.values
Y_test=np.array(testdata.label.values).astype(np.int32)

traindata=pd.read_csv("../data/tsv/train1.tsv",delimiter="\t")
X_train=traindata.text.values
Y_train=np.array(traindata.label.values).astype(np.int32)

# --- Bag-of-words features (unigrams + bigrams) ----------------------------
# The vocabulary is learned on the training split only and reused for the
# test split.
clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2)))])
X_train=clf.fit_transform(X_train).toarray()
X_test=clf.transform(X_test)

# --- Topic features ---------------------------------------------------------
# LDA is fitted on the training counts; the test counts are then projected
# onto the same 4 topics.
lda = LatentDirichletAllocation(n_components=4,random_state=42)
lda.fit(X_train)
X_test=lda.transform(X_test)

# Clustering is run and scored on the test split. To benchmark on the
# training split instead, use lda.transform(X_train) and Y_train - 1 here.
data=X_test
# Shift labels down by one (presumably 1-based ids in the TSV -- confirm).
labels=Y_test-1

# Spectral clustering, once per random state in the list.
for rstate in [7]:

    print("Random State", rstate)

    print(82 * '_')
    print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    # The two runs differ only in how the spectral embedding is turned into
    # cluster labels (assign_labels); the affinity is the default in both.
    km1_km=bench_spectral(SpectralClustering(n_clusters=4,random_state=rstate,assign_labels="kmeans"), name="K-Means", data=data,labels=labels)

    km1_random=bench_spectral(SpectralClustering(n_clusters=4,random_state=rstate,assign_labels="discretize"), name="Discritize", data=data,labels=labels)

    print(82 * '_')

    # NOTE(review): cluster ids are arbitrary, so these per-class reports are
    # only meaningful when the cluster ids happen to line up with the class
    # ids. Consider mapping clusters to classes before reporting.
    print("K-Means label assignment")
    ypred=km1_km.labels_
    print(classification_report(labels,ypred,digits=5))

    print("Discretize label assignment")
    ypred=km1_random.labels_
    print(classification_report(labels,ypred,digits=5))


# Spectral clustering Fold 2

* This code runs spectral clustering on LDA topic features built from bag-of-words (unigram and bigram) counts, for phrase-rubric classification of keyphrases - fold 2
    
* The training and testing data for each of the phases are separated.  

* The input files can be changed by editing the paths passed to ```pd.read_csv```: ```"../data/tsv/test2.tsv"``` (test split) and ```"../data/tsv/train2.tsv"``` (training split)

* Since randomness impacts results, the random state is taken from the list in ```for rstate in [0]:```; add more values to that list to search over several seeds


* Latent Dirichlet allocation is done using ```lda = LatentDirichletAllocation(n_components=4,random_state=42)``` followed by ```lda.fit(X_train)```

In [None]:
from sklearn.dummy import DummyClassifier

# --- Load fold 2 -----------------------------------------------------------
# Each TSV provides a `text` column (raw phrases) and a `label` column
# (integer class ids).
testdata=pd.read_csv("../data/tsv/test2.tsv",delimiter="\t")
X_test=testdata.text.values
Y_test=np.array(testdata.label.values).astype(np.int32)

traindata=pd.read_csv("../data/tsv/train2.tsv",delimiter="\t")
X_train=traindata.text.values
Y_train=np.array(traindata.label.values).astype(np.int32)

# --- Bag-of-words features (unigrams + bigrams) ----------------------------
# The vocabulary is learned on the training split only and reused for the
# test split.
clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2)))])
X_train=clf.fit_transform(X_train).toarray()
X_test=clf.transform(X_test)

# --- Topic features ---------------------------------------------------------
# LDA is fitted on the training counts; the test counts are then projected
# onto the same 4 topics.
lda = LatentDirichletAllocation(n_components=4,random_state=42)
lda.fit(X_train)
X_test=lda.transform(X_test)

# Clustering is run and scored on the test split. To benchmark on the
# training split instead, use lda.transform(X_train) and Y_train - 1 here.
data=X_test
# Shift labels down by one (presumably 1-based ids in the TSV -- confirm).
labels=Y_test-1

# Spectral clustering, once per random state in the list.
for rstate in [0]:

    print("Random State", rstate)

    print(82 * '_')
    print('init\t\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    # The two runs differ only in how the spectral embedding is turned into
    # cluster labels (assign_labels); the affinity is the default in both.
    km1_km=bench_spectral(SpectralClustering(n_clusters=4,random_state=rstate,assign_labels="kmeans"), name="K-Means", data=data,labels=labels)

    km1_random=bench_spectral(SpectralClustering(n_clusters=4,random_state=rstate,assign_labels="discretize"), name="Discritize", data=data,labels=labels)

    print(82 * '_')

    # NOTE(review): cluster ids are arbitrary, so these per-class reports are
    # only meaningful when the cluster ids happen to line up with the class
    # ids. Consider mapping clusters to classes before reporting.
    print("K-Means label assignment")
    ypred=km1_km.labels_
    print(classification_report(labels,ypred,digits=5))

    print("Discretize label assignment")
    ypred=km1_random.labels_
    print(classification_report(labels,ypred,digits=5))
