# Import Libraries

In [None]:
%matplotlib inline 
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report


#K-Means import
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

from time import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import LatentDirichletAllocation

# K-Means Benchmarking Code


In [1]:

def bench_k_means(estimator, name, data,labels=None):
    """Fit ``estimator`` on ``data``, print one benchmark row, and return it.

    The printed row is tab-separated and contains, in order: ``name``, the
    wall-clock fit time, the estimator's ``inertia_`` (formatted as an
    integer), homogeneity, completeness, V-measure, adjusted Rand index,
    adjusted mutual information, and the euclidean silhouette score.

    Parameters
    ----------
    estimator : object
        Clustering estimator exposing ``fit(data)`` and, once fitted, the
        ``inertia_`` and ``labels_`` attributes.
    name : str
        Label printed in the first column of the row.
    data : array-like
        Samples passed to ``estimator.fit`` and to the silhouette score.
    labels : array-like or None, optional
        Ground-truth labels used by the five supervised scores. When omitted,
        those five columns are printed as ``nan`` instead of raising, since
        they cannot be computed without a reference labelling.

    Returns
    -------
    object
        The same ``estimator`` instance, now fitted.
    """
    t0 = time()
    estimator.fit(data)
    # Stop the clock right after fitting so scoring time is not included.
    elapsed = time() - t0

    if labels is None:
        # No ground truth supplied: the label-based scores are undefined.
        supervised = [float('nan')] * 5
    else:
        scorers = (
            metrics.homogeneity_score,
            metrics.completeness_score,
            metrics.v_measure_score,
            metrics.adjusted_rand_score,
            metrics.adjusted_mutual_info_score,
        )
        supervised = [score(labels, estimator.labels_) for score in scorers]

    # The silhouette score only needs the data and the predicted clusters.
    silhouette = metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=None)

    print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, elapsed, estimator.inertia_, *supervised, silhouette))
    return estimator

# K-Means Fold 1

* This code runs K-Means on LDA topic features computed from unigram/bigram count vectors for phrase-rubric classification of keyphrases
    
* The training and testing data for each of the phases are kept separate.  

* The input files can be changed using the lines ```traindata=pd.read_csv("../data/tsv/test1.tsv",delimiter="\t")``` and ```traindata=pd.read_csv("../data/tsv/train1.tsv",delimiter="\t")```

* Since randomness impacts the results, we search over the random-state parameter using ```for rstate in [27]:```

* Latent dirichlet allocation is done using ```lda = LatentDirichletAllocation(n_components=4,random_state=42) lda.fit(X_train)```

In [None]:
from sklearn.dummy import DummyClassifier  # imported here but not referenced in this cell

# --- Load the fold-1 split ---------------------------------------------------
# The same `traindata` name is reused for both files: test first, then train.
traindata = pd.read_csv("../data/tsv/test1.tsv", delimiter="\t")
X_test = traindata["text"].values
Y_test = np.array(traindata["label"].values).astype(np.int32)

traindata = pd.read_csv("../data/tsv/train1.tsv", delimiter="\t")
X_train = traindata["text"].values
Y_train = np.array(traindata["label"].values).astype(np.int32)

# --- Bag-of-words features (unigrams + bigrams), vocabulary from train only --
clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2)))])
clf.fit(X_train)

X_train = clf.transform(X_train).toarray()  # dense copy of the training counts
X_test = clf.transform(X_test)              # test counts stay sparse

# --- Project the counts onto 4 LDA topics ------------------------------------
lda = LatentDirichletAllocation(n_components=4,random_state=42)
lda.fit(X_train)

data = lda.transform(X_train)
X_test = lda.transform(X_test)
# Shift labels down by one (presumably 1-based in the TSV files -- confirm).
labels = np.array(Y_train) - 1

# --- K-Means with three initialisation strategies ----------------------------
for rstate in [27]:

    print("Random State", rstate)

    print(82 * '_')
    print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

    km1_km = bench_k_means(
        KMeans(init='k-means++', n_clusters=4, n_init=10, random_state=rstate),
        name="k-means++", data=data, labels=labels)

    km1_random = bench_k_means(
        KMeans(init='random', n_clusters=4, n_init=10, random_state=rstate),
        name="random", data=data, labels=labels)

    # Use the PCA components as explicit starting centres, hence a single run.
    pca = PCA(n_components=4).fit(data)
    km1_pca = bench_k_means(
        KMeans(init=pca.components_, n_clusters=4, n_init=1, random_state=rstate),
        name="PCA-based", data=data, labels=labels)
    print(82 * '_')

    # Score each fitted model on the held-out split; cluster ids are compared
    # directly against the shifted test labels.
    y_true = np.array(Y_test) - 1
    for heading, fitted in (
        ("kmeans++ Initialization", km1_km),
        ("Random Initialization", km1_random),
        ("PCA Components Initialization", km1_pca),
    ):
        print(heading)
        ypred = fitted.predict(X_test)
        print(classification_report(y_true, ypred, digits=5))

# K-Means Fold 2

* This code runs K-Means on LDA topic features computed from unigram/bigram count vectors for phrase-rubric classification of keyphrases
    
* The training and testing data for each of the phases are kept separate.  

* The input files can be changed using the lines ```traindata=pd.read_csv("../data/tsv/test2.tsv",delimiter="\t")``` and ```traindata=pd.read_csv("../data/tsv/train2.tsv",delimiter="\t")```

* Since randomness impacts the results, we search over the random-state parameter using ```for rstate in [27]:```


* Latent dirichlet allocation is done using ```lda = LatentDirichletAllocation(n_components=4,random_state=42) lda.fit(X_train)```

In [None]:
from sklearn.dummy import DummyClassifier  # imported here but not referenced in this cell

# --- Load the fold-2 split ---------------------------------------------------
# The same `traindata` name is reused for both files: test first, then train.
traindata = pd.read_csv("../data/tsv/test2.tsv", delimiter="\t")
X_test = traindata["text"].values
Y_test = np.array(traindata["label"].values).astype(np.int32)

traindata = pd.read_csv("../data/tsv/train2.tsv", delimiter="\t")
X_train = traindata["text"].values
Y_train = np.array(traindata["label"].values).astype(np.int32)

# --- Bag-of-words features (unigrams + bigrams), vocabulary from train only --
clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2)))])
clf.fit(X_train)

X_train = clf.transform(X_train).toarray()  # dense copy of the training counts
X_test = clf.transform(X_test)              # test counts stay sparse

# --- Project the counts onto 4 LDA topics ------------------------------------
lda = LatentDirichletAllocation(n_components=4,random_state=42)
lda.fit(X_train)

data = lda.transform(X_train)
X_test = lda.transform(X_test)
# Shift labels down by one (presumably 1-based in the TSV files -- confirm).
labels = np.array(Y_train) - 1

# --- K-Means with three initialisation strategies ----------------------------
for rstate in [27]:

    print("Random State", rstate)

    print(82 * '_')
    print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

    km1_km = bench_k_means(
        KMeans(init='k-means++', n_clusters=4, n_init=10, random_state=rstate),
        name="k-means++", data=data, labels=labels)

    km1_random = bench_k_means(
        KMeans(init='random', n_clusters=4, n_init=10, random_state=rstate),
        name="random", data=data, labels=labels)

    # Use the PCA components as explicit starting centres, hence a single run.
    pca = PCA(n_components=4).fit(data)
    km1_pca = bench_k_means(
        KMeans(init=pca.components_, n_clusters=4, n_init=1, random_state=rstate),
        name="PCA-based", data=data, labels=labels)
    print(82 * '_')

    # Score each fitted model on the held-out split; cluster ids are compared
    # directly against the shifted test labels.
    y_true = np.array(Y_test) - 1
    for heading, fitted in (
        ("kmeans++ Initialization", km1_km),
        ("Random Initialization", km1_random),
        ("PCA Components Initialization", km1_pca),
    ):
        print(heading)
        ypred = fitted.predict(X_test)
        print(classification_report(y_true, ypred, digits=5))


# K-Means with Learnt BERT Representations

* This code runs K-Means on learnt BERT representations for phrase-rubric classification of keyphrases in the fold-1 datasets
    
* The training and testing data for each of the phases are kept separate.  

* Fold selection and model selection could be done using 

```
modelfolder="../../RQ2.2/data/bert/fold1/"

X_train= pickle.load(open(modelfolder+'xtrain.pkl', 'rb'))
Y_train= pickle.load(open(modelfolder+'ytrain.pkl', 'rb'))
X_test= pickle.load(open(modelfolder+'xtest.pkl', 'rb'))
Y_test= pickle.load(open(modelfolder+'ytest.pkl', 'rb'))
```

* Since randomness impacts the results, we search over the random-state parameter using ```for rstate in [23]:```

In [None]:
import pickle

# Folder holding the pickled BERT features and labels for this fold.
modelfolder="../../RQ2.2/data/bert/fold1/"

# NOTE(review): unpickling can execute arbitrary code -- only load files from a
# trusted source. Each file is opened in a `with` block so its handle is closed
# as soon as the object has been read (previously the handles were left open).
with open(modelfolder+'xtrain.pkl', 'rb') as fh:
    X_train = pickle.load(fh)
with open(modelfolder+'ytrain.pkl', 'rb') as fh:
    Y_train = pickle.load(fh)
with open(modelfolder+'xtest.pkl', 'rb') as fh:
    X_test = pickle.load(fh)
with open(modelfolder+'ytest.pkl', 'rb') as fh:
    Y_test = pickle.load(fh)

# The pickled features are clustered as-is; labels are used without any shift.
data = X_train
labels = np.array(Y_train)

# --- K-Means with three initialisation strategies ----------------------------
for rstate in [23]:

    print("Random State", rstate)

    print(82 * '_')
    print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

    km1_km = bench_k_means(
        KMeans(init='k-means++', n_clusters=4, n_init=10, random_state=rstate),
        name="k-means++", data=data, labels=labels)

    km1_random = bench_k_means(
        KMeans(init='random', n_clusters=4, n_init=10, random_state=rstate),
        name="random", data=data, labels=labels)

    # Use the PCA components as explicit starting centres, hence a single run.
    pca = PCA(n_components=4).fit(data)
    km1_pca = bench_k_means(
        KMeans(init=pca.components_, n_clusters=4, n_init=1, random_state=rstate),
        name="PCA-based", data=data, labels=labels)
    print(82 * '_')

    # Score each fitted model on the held-out split; cluster ids are compared
    # directly against the test labels.
    y_true = np.array(Y_test)
    for heading, fitted in (
        ("kmeans++ Initialization", km1_km),
        ("Random Initialization", km1_random),
        ("PCA Components Initialization", km1_pca),
    ):
        print(heading)
        ypred = fitted.predict(X_test)
        print(classification_report(y_true, ypred, digits=5))

* This code runs K-MEANS with BERT model for phrase-rubric classification of keyphrases in fold-2 datasets
    
* The training and testing data for each of the phases are kept separate.  

* Fold selection and model selection could be done using 

```
modelfolder="../../RQ2.2/data/bert/fold1/"

X_train= pickle.load(open(modelfolder+'xtrain.pkl', 'rb'))
Y_train= pickle.load(open(modelfolder+'ytrain.pkl', 'rb'))
X_test= pickle.load(open(modelfolder+'xtest.pkl', 'rb'))
Y_test= pickle.load(open(modelfolder+'ytest.pkl', 'rb'))
```

* Since randomness impacts the results, we search over the random-state parameter using ```for rstate in [1]:```

In [None]:
import pickle

# Folder holding the pickled BERT features and labels for this fold.
modelfolder="../../RQ2.2/data/bert/fold2/"

# NOTE(review): unpickling can execute arbitrary code -- only load files from a
# trusted source. Each file is opened in a `with` block so its handle is closed
# as soon as the object has been read (previously the handles were left open).
with open(modelfolder+'xtrain.pkl', 'rb') as fh:
    X_train = pickle.load(fh)
with open(modelfolder+'ytrain.pkl', 'rb') as fh:
    Y_train = pickle.load(fh)
with open(modelfolder+'xtest.pkl', 'rb') as fh:
    X_test = pickle.load(fh)
with open(modelfolder+'ytest.pkl', 'rb') as fh:
    Y_test = pickle.load(fh)

# The pickled features are clustered as-is; labels are used without any shift.
data = X_train
labels = np.array(Y_train)

# --- K-Means with three initialisation strategies ----------------------------
for rstate in [1]:

    print("Random State", rstate)

    print(82 * '_')
    print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

    km1_km = bench_k_means(
        KMeans(init='k-means++', n_clusters=4, n_init=10, random_state=rstate),
        name="k-means++", data=data, labels=labels)

    km1_random = bench_k_means(
        KMeans(init='random', n_clusters=4, n_init=10, random_state=rstate),
        name="random", data=data, labels=labels)

    # Use the PCA components as explicit starting centres, hence a single run.
    pca = PCA(n_components=4).fit(data)
    km1_pca = bench_k_means(
        KMeans(init=pca.components_, n_clusters=4, n_init=1, random_state=rstate),
        name="PCA-based", data=data, labels=labels)
    print(82 * '_')

    # Score each fitted model on the held-out split; cluster ids are compared
    # directly against the test labels.
    y_true = np.array(Y_test)
    for heading, fitted in (
        ("kmeans++ Initialization", km1_km),
        ("Random Initialization", km1_random),
        ("PCA Components Initialization", km1_pca),
    ):
        print(heading)
        ypred = fitted.predict(X_test)
        print(classification_report(y_true, ypred, digits=5))