In [1]:
%matplotlib inline 
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

In [None]:
# Read the tab-separated dataset into a DataFrame.
df = pd.read_csv("./data_kbai.tsv", sep="\t")

# Get X and Y

In [None]:
# Keep only the rows whose `groundtruth` value is non-zero:
# X holds their texts, Y their labels as int32.
X = df.loc[df.groundtruth != 0, "text"].values
Y = df.loc[df.groundtruth != 0, "groundtruth"].values.astype(np.int32)

# Label Statistics

In [None]:
# Histogram of the `groundtruth` column over every row of df
# (rows with label 0 are included here, unlike in X / Y).
df["groundtruth"].plot.hist()

# Grid Search and Cross Validation

In [None]:
# Y=labels
from sklearn.svm import SVC
from sklearn.model_selection import ShuffleSplit,KFold
from sklearn.model_selection import GridSearchCV

# Outer evaluation splitter. NOTE: despite the name `loo`, this is a ShuffleSplit
# producing 2 random splits with 40% of the samples held out, not LeaveOneOut.
loo = ShuffleSplit(n_splits=2, test_size=0.4, random_state=43)

# Hyper-parameter grid searched on every outer split. It does not depend on the
# split, so it is built once instead of on each loop iteration.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# Per-split results; all five lists are index-aligned (one entry per outer split).
params = []
ypreds = []
train_indexes = []
test_indexes = []
ytests = []

for train_index, test_index in loo.split(X, Y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # Token counts -> TF-IDF -> class-balanced SGD classifier with logistic loss.
    # NOTE: scikit-learn >= 1.1 renames loss='log' to 'log_loss' (and later removes
    # 'log'); update the string when running on a recent release.
    clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3, max_iter=20,
                              random_state=42, class_weight="balanced", warm_start=True)),
    ])

    # Inner grid search (GridSearchCV's default cross-validation) on the training part only.
    gs_clf_svm = GridSearchCV(clf, parameters, n_jobs=-1)
    gs_clf_svm = gs_clf_svm.fit(X_train, y_train)

    # Evaluate the refit best estimator on the held-out part of this split.
    ypred = gs_clf_svm.predict(X_test)
    print(classification_report(y_test, ypred, digits=5))
    print(gs_clf_svm.best_score_)
    print(gs_clf_svm.best_params_)

    params.append(gs_clf_svm.best_params_)
    train_indexes.append(train_index)
    test_indexes.append(test_index)
    ypreds.append(ypred)
    # Previously `ytests` was declared but never filled; keep the true labels
    # next to the predictions so later cells can use them directly.
    ytests.append(y_test)

In [None]:
from sklearn.metrics import confusion_matrix
# For every outer split: the selected hyper-parameters, the confusion matrix
# on the held-out samples, and the held-out texts themselves.
for fold, fold_pred in enumerate(ypreds):
    print(params[fold])
    held_out = test_indexes[fold]
    print(confusion_matrix(Y[held_out], fold_pred))
    print(X[held_out])

# Classification Fold 1

In [3]:
from sklearn.dummy import DummyClassifier
# Fold 1 data. `traindata` is reused for both files: it first holds the test
# split and ends up holding the training split.
traindata = pd.read_csv("./test1.tsv", sep="\t")
X_test = traindata.text.values
Y_test = traindata.label.values.astype(np.int32)

traindata = pd.read_csv("./train1.tsv", sep="\t")
X_train = traindata.text.values
Y_train = traindata.label.values.astype(np.int32)

# Uni-/bi-gram counts -> TF-IDF (with IDF) -> class-balanced SGD classifier
# with logistic loss.
clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(
        loss='log',
        penalty='l2',
        alpha=0.001,
        max_iter=100,
        random_state=42,
        class_weight="balanced",
        warm_start=True,
    )),
])

# Train on the fold-1 training split and report per-class metrics on its test split.
clf.fit(X_train, Y_train)
ypred = clf.predict(X_test)

print(classification_report(Y_test, ypred, digits=5))



              precision    recall  f1-score   support

           1    0.70000   0.36207   0.47727        58
           2    0.61268   0.96667   0.75000        90
           3    0.50000   0.08333   0.14286        12
           4    0.50000   0.20833   0.29412        24

    accuracy                        0.61957       184
   macro avg    0.57817   0.40510   0.41606       184
weighted avg    0.61816   0.61957   0.56497       184



# Classification Fold 2

In [4]:
from collections import Counter

# Fold 2 data. `traindata` is reused for both files: it first holds the
# training split and ends up holding the test split.
traindata = pd.read_csv("./train2.tsv", sep="\t")
X_train = traindata.text.values
Y_train = traindata.label.values.astype(np.int32)

traindata = pd.read_csv("./test2.tsv", sep="\t")
X_test = traindata.text.values
Y_test = traindata.label.values.astype(np.int32)

# Same architecture as the fold-1 model: uni-/bi-gram counts -> TF-IDF ->
# class-balanced SGD classifier with logistic loss.
clf2 = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(
        loss='log',
        penalty='l2',
        alpha=0.001,
        max_iter=100,
        random_state=42,
        class_weight="balanced",
        warm_start=True,
    )),
])

# Train on the fold-2 training split and report per-class metrics on its test split.
clf2.fit(X_train, Y_train)
ypred = clf2.predict(X_test)

print(classification_report(Y_test, ypred, digits=5))

# Label counts of the test and training splits, for reading the report above.
print(Counter(Y_test))
print(Counter(Y_train))


              precision    recall  f1-score   support

           1    0.58537   0.42857   0.49485        56
           2    0.59677   0.82222   0.69159        90
           3    0.00000   0.00000   0.00000        15
           4    0.31579   0.26087   0.28571        23

    accuracy                        0.56522       184
   macro avg    0.37448   0.37792   0.36804       184
weighted avg    0.50953   0.56522   0.52460       184

Counter({2: 90, 1: 56, 4: 23, 3: 15})
Counter({2: 140, 1: 71, 4: 43, 3: 22})


  _warn_prf(average, modifier, msg_start, len(result))


# Interpretability

In [None]:
import eli5
from eli5.lime import TextExplainer



In [13]:
import pickle
import eli5
from eli5.lime import TextExplainer
from xgboost import XGBClassifier

def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards.

    SECURITY: unpickling can execute arbitrary code contained in the file, so
    only load pickles that come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pickled features and labels used by the classifiers below.
# NOTE(review): presumably xtrain/xtest are row-aligned with train2.tsv/test2.tsv -- confirm.
xtrain = _load_pickle('./xtrain.pkl')
xtest = _load_pickle('./xtest.pkl')

ytrain = _load_pickle('./ytrain.pkl')
ytest = _load_pickle('./ytest.pkl')

# Raw texts and labels of fold 2. `traindata` is reused for both files and
# ends up holding the training split.
traindata = pd.read_csv("./test2.tsv", delimiter="\t")
X_test = traindata.text.values
Y_test = np.array(traindata.label.values).astype(np.int32)

traindata = pd.read_csv("./train2.tsv", delimiter="\t")
X_train = traindata.text.values
Y_train = np.array(traindata.label.values).astype(np.int32)

# Lookup from raw text to its pickled feature entry, paired by position.
# Training pairs go in first, then test pairs; when the same text occurs more
# than once, the last pair wins. zip() stops at the shorter sequence.
xtraindict = dict(zip(X_train, xtrain))
xtraindict.update(zip(X_test, xtest))

def returnfeatures(text):
    """Map each string in `text` to its entry in the module-level `xtraindict`.

    The looked-up values are returned as a numpy array, in the same order as
    `text`. A string that is not a key of `xtraindict` raises KeyError.
    """
    return np.array([xtraindict[doc] for doc in text])


# Class-balanced SGD classifier with logistic loss, trained on the pickled features.
clf11 = SGDClassifier(
    loss='log',
    penalty='l2',
    alpha=1e-3,
    max_iter=200,
    random_state=42,
    class_weight="balanced",
    warm_start=True,
)
clf11.fit(np.array(xtrain), np.array(ytrain))
ypred = clf11.predict(xtest)
print(classification_report(ytest, ypred, digits=5))


# Gradient-boosted trees on the same features, for comparison.
clf12 = XGBClassifier(
    n_estimators=200,
    random_state=10,
    max_depth=3,
    learning_rate=0.1,
)
clf12.fit(np.array(xtrain), np.array(ytrain))
ypred = clf12.predict(xtest)
print(classification_report(ytest, ypred, digits=5))



              precision    recall  f1-score   support

           0    0.65385   0.60714   0.62963        56
           1    0.62992   0.88889   0.73733        90
           2    0.00000   0.00000   0.00000        15
           3    0.66667   0.08696   0.15385        23

    accuracy                        0.63043       184
   macro avg    0.48761   0.39575   0.38020       184
weighted avg    0.59044   0.63043   0.57151       184

              precision    recall  f1-score   support

           0    0.55263   0.37500   0.44681        56
           1    0.59690   0.85556   0.70320        90
           2    0.50000   0.06667   0.11765        15
           3    0.33333   0.21739   0.26316        23

    accuracy                        0.56522       184
   macro avg    0.49572   0.37865   0.38270       184
weighted avg    0.54258   0.56522   0.52243       184



In [None]:
from IPython.display import display
# LIME via eli5's TextExplainer: for each document it generates perturbed
# variants of the raw text and calls the supplied `predict_proba` with a list
# of strings, so the black-box model must accept text.
te = TextExplainer(random_state=42)
traindata = pd.read_csv("./test2.tsv", delimiter="\t")
X_test = traindata.text.values
Y_test = np.array(traindata.label.values).astype(np.int32)

# `clf11` was fit on the pickled numeric feature arrays and cannot score raw
# strings, so it cannot serve as the black box here. `clf2` is the text
# pipeline trained on train2.tsv; its classes 1-4 also match `target_names`.
for doc in X_test:
    print(doc)
    te.fit(doc, clf2.predict_proba)
    a = te.show_prediction(target_names=[1, 2, 3, 4])
    display(a)