# Import Libraries

In [None]:
%matplotlib inline 
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.dummy import DummyClassifier

# Classification Fold 1

* This code trains a linear classifier (`SGDClassifier` with logistic loss) on TF-IDF features for classification of keyphrases
    
* The training and testing data for each of the phases are separated.  

* The input files can be changed by editing the paths passed to `pd.read_csv`, i.e. ```"../data/tsv/test1.tsv"``` and ```"../data/tsv/train1.tsv"```

* To run the same classifier on bag-of-words counts instead of TF-IDF, use ```clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('clf', SGDClassifier(loss='log', penalty='l2',alpha=0.001, max_iter=100, random_state=42,class_weight="balanced",warm_start=True))])```


In [None]:

# Fold 1: fit a bigram TF-IDF + SGD text classifier on train1.tsv and print
# per-class precision/recall/F1 on test1.tsv.

# Held-out split. The name `traindata` is reused for both files; once this
# cell has run it refers to the training frame.
traindata = pd.read_csv("../data/tsv/test1.tsv", delimiter="\t")
X_test = traindata["text"].values
Y_test = np.array(traindata["label"].values).astype(np.int32)

# Training split.
traindata = pd.read_csv("../data/tsv/train1.tsv", delimiter="\t")
X_train = traindata["text"].values
Y_train = np.array(traindata["label"].values).astype(np.int32)

# Unigram + bigram counts -> TF-IDF weighting -> linear model trained by SGD.
# NOTE: loss='log' is the logistic-regression loss; scikit-learn 1.1 renamed
# it to 'log_loss', so adjust the spelling if a newer release rejects it.
clf = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1, 2))),
        ("tfidf", TfidfTransformer(use_idf=True)),
        (
            "clf",
            SGDClassifier(
                loss="log",
                penalty="l2",
                alpha=0.001,
                max_iter=100,
                random_state=42,
                class_weight="balanced",
                warm_start=True,
            ),
        ),
    ]
)

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
ypred = clf.fit(X_train, Y_train).predict(X_test)

print(classification_report(Y_test, ypred, digits=5))



# Classification Fold 2

* This code trains a linear classifier (`SGDClassifier` with logistic loss) on TF-IDF features for classification of keyphrases
    
* The training and testing data for each of the phases are separated.  

* The input files can be changed by editing the paths passed to `pd.read_csv`, i.e. ```"../data/tsv/train2.tsv"``` and ```"../data/tsv/test2.tsv"```

* To run the same classifier on bag-of-words counts instead of TF-IDF, use ```clf2 = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('clf', SGDClassifier(loss='log', penalty='l2',alpha=0.001, max_iter=100, random_state=42,class_weight="balanced",warm_start=True))])```


In [None]:
# Fold 2: same model as fold 1, fitted on train2.tsv and scored on test2.tsv.

# Training split.
traindata = pd.read_csv("../data/tsv/train2.tsv", delimiter="\t")
X_train = traindata["text"].values
Y_train = np.array(traindata["label"].values).astype(np.int32)

# Held-out split. `traindata` is reused, so after this cell it refers to the
# test frame.
traindata = pd.read_csv("../data/tsv/test2.tsv", delimiter="\t")
X_test = traindata["text"].values
Y_test = np.array(traindata["label"].values).astype(np.int32)

# Unigram + bigram counts -> TF-IDF weighting -> linear model trained by SGD.
# NOTE: loss='log' is the logistic-regression loss; scikit-learn 1.1 renamed
# it to 'log_loss', so adjust the spelling if a newer release rejects it.
clf2 = Pipeline(
    [
        ("vect", CountVectorizer(ngram_range=(1, 2))),
        ("tfidf", TfidfTransformer(use_idf=True)),
        (
            "clf",
            SGDClassifier(
                loss="log",
                penalty="l2",
                alpha=0.001,
                max_iter=100,
                random_state=42,
                class_weight="balanced",
                warm_start=True,
            ),
        ),
    ]
)

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
ypred = clf2.fit(X_train, Y_train).predict(X_test)

print(classification_report(Y_test, ypred, digits=5))


# Interpretability

* This code evaluates a previously trained model for interpretability using the *eli5* library
    
* The training and testing data for each of the phases can be changed by editing the pickle paths that are loaded into `xtrain`, `xtest`, `ytrain` and `ytest`:
```'../data/summary_pkl/xtrain.pkl'```
```'../data/summary_pkl/xtest.pkl'```
```'../data/summary_pkl/ytrain.pkl'```
```'../data/summary_pkl/ytest.pkl'```

* The following function returns the features for LIME-generated texts by looking each text up in `xtraindict`
```
def returnfeatures(text):
    global xtraindict
    res=[]
    for i in text:
        res.append(xtraindict[i])
    return np.array(res)
```

In [None]:
import eli5
from eli5.lime import TextExplainer

In [None]:
import pickle
import eli5
from eli5.lime import TextExplainer
from xgboost import XGBClassifier

# Load the pickled feature/label collections for both splits. Each file is
# opened in a `with` block so its handle is closed as soon as it has been read
# (the previous `pickle.load(open(...))` form never closed the files).
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('../data/summary_pkl/xtrain.pkl', 'rb') as fh:
    xtrain = pickle.load(fh)
with open('../data/summary_pkl/xtest.pkl', 'rb') as fh:
    xtest = pickle.load(fh)

with open('../data/summary_pkl/ytrain.pkl', 'rb') as fh:
    ytrain = pickle.load(fh)
with open('../data/summary_pkl/ytest.pkl', 'rb') as fh:
    ytest = pickle.load(fh)


# Raw texts and integer labels of fold 2. `traindata` is reused for both
# files; after this cell it refers to the training frame.
traindata = pd.read_csv("../data/tsv/test2.tsv", delimiter="\t")
X_test = traindata.text.values
Y_test = np.array(traindata.label.values).astype(np.int32)

traindata = pd.read_csv("../data/tsv/train2.tsv", delimiter="\t")
X_train = traindata.text.values
Y_train = np.array(traindata.label.values).astype(np.int32)


# Map every raw text to its pickled feature entry: training pairs first, then
# test pairs, so a text present in both splits keeps the test-split features.
# Assumes the pickled features are row-aligned with the TSV files (zip stops
# silently at the shorter input) -- TODO confirm.
xtraindict = dict(zip(X_train, xtrain))
xtraindict.update(zip(X_test, xtest))

def returnfeatures(text):
    """Return the stored features for every item of *text* as a NumPy array.

    Each item is used as a key into the module-level ``xtraindict`` mapping
    and the looked-up values are collected in input order. A ``KeyError`` is
    raised for any item that is not a key of ``xtraindict``.
    """
    return np.array([xtraindict[item] for item in text])


# Two classifiers trained directly on the pickled features (not on raw text).
# clf11: linear model with logistic loss (loss='log' was renamed 'log_loss' in
# scikit-learn 1.1). A text-input variant was sketched as
#   Pipeline([('custom', FunctionTransformer(returnfeatures)),
#             ('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3, max_iter=200,
#                                   random_state=42, class_weight="balanced", warm_start=True))])
clf11 = SGDClassifier(
    loss="log",
    penalty="l2",
    alpha=1e-3,
    max_iter=200,
    random_state=42,
    class_weight="balanced",
    warm_start=True,
)

# clf12: gradient-boosted trees.
clf12 = XGBClassifier(
    n_estimators=200,
    random_state=10,
    max_depth=3,
    learning_rate=0.1,
)

# Fit each model on the training features and print its report on the test
# features; `ypred` ends up holding the predictions of the last model (clf12).
for model in (clf11, clf12):
    model.fit(np.array(xtrain), np.array(ytrain))
    ypred = model.predict(xtest)
    print(classification_report(ytest, ypred, digits=5))



# Display Interpretability Results

* To explain a different test file, change the path in ```traindata=pd.read_csv("../data/tsv/test2.tsv",delimiter="\t")```

In [None]:
from IPython.display import display

# Fit a LIME text explainer on each test document in turn and render the
# resulting explanation in the notebook.
te = TextExplainer(random_state=42)

#Change file names here
traindata = pd.read_csv("../data/tsv/test2.tsv", delimiter="\t")
X_test = np.array(traindata["text"].values)
Y_test = np.array(traindata["label"].values).astype(np.int32)

for doc in X_test:
    print(doc)
    # NOTE(review): TextExplainer passes text samples to this callable, while
    # clf11 was fitted on the pickled feature arrays -- confirm it accepts raw
    # strings (a text pipeline such as clf2 may have been intended).
    te.fit(doc, clf11.predict_proba)
    explanation = te.show_prediction(target_names=[1, 2, 3, 4])
    display(explanation)