In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import jsonlines
import json
import re

from random import randrange

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from itertools import chain
from scipy import sparse
%matplotlib inline
seed = 42

In [None]:
# Location of the SNLI 1.0 dump and the two splits read by this notebook.
path = './snli_1.0/'
train_file = '{}snli_1.0_train.jsonl'.format(path)
test_file = '{}snli_1.0_test.jsonl'.format(path)

In [None]:
# Load train and test data: every record yielded by the jsonlines reader
# becomes one DataFrame row.
with jsonlines.open(train_file) as f:
    train = pd.DataFrame(list(f.iter()))
with jsonlines.open(test_file) as f:
    test = pd.DataFrame(list(f.iter()))

In [None]:
# Print one randomly chosen training pair with its label.
# .iloc is positional, so this cell stays valid even when `train` no longer
# has a contiguous 0..n-1 index (e.g. when re-run after the filtering and
# splitting cells further down); label-based `train[col][rdm]` would raise
# KeyError in that situation.
rdm = randrange(len(train))
sample = train.iloc[rdm]
print("Premise: " + sample['sentence1'])
print("Hypothesis: " + sample['sentence2'])
print("Gold Label: " + sample['gold_label'])

In [None]:
# Data set sizes before any filtering.
print('Number of rows in the train data set:', train.shape[0])
print('Number of rows in the test data set:', test.shape[0])

In [None]:
# Label distribution of the raw test set (still includes "-" labels).
# The Series must be passed as `x=`: seaborn deprecated positional data
# vectors in 0.11 and from 0.12 the only positional argument is `data`.
sns.countplot(x=test['gold_label'], palette='magma')

In [None]:
# Remove samples without gold_label (marked with "-")
train = train.loc[train['gold_label'] != "-"]
test = test.loc[test['gold_label'] != "-"]

In [None]:
# Data set sizes after dropping the "-" labelled rows.
print('Number of rows in the train data set:', train.shape[0])
print('Number of rows in the test data set:', test.shape[0])

In [None]:
# splitting train and validate sets
from sklearn.model_selection import train_test_split
# Hold out 2% of the training pairs for validation. random_state pins the
# shuffle to the notebook-wide `seed`, so the split (and every score computed
# from it) is identical across runs.
train, validate = train_test_split(train, test_size=0.02, shuffle=True, random_state=seed)
print('Number of rows in the train data set:', len(train))
print('Number of rows in the validate data set:', len(validate))

In [None]:
# Label distribution of the training split.
# Pass the Series as `x=`: from seaborn 0.12 the only positional argument
# accepted by countplot is `data`.
sns.countplot(x=train['gold_label'], palette='magma')

In [None]:
# Label distribution of the validation split.
# Pass the Series as `x=`: from seaborn 0.12 the only positional argument
# accepted by countplot is `data`.
sns.countplot(x=validate['gold_label'], palette='magma')

In [None]:
# Label distribution of the test set after the "-" rows were removed.
# Pass the Series as `x=`: from seaborn 0.12 the only positional argument
# accepted by countplot is `data`.
sns.countplot(x=test['gold_label'], palette='magma')

In [None]:
# For preprocessing: fetch the NLTK resources and build the shared helpers.
nltk.download('stopwords')
nltk.download('punkt')

# English stop words plus every ASCII punctuation character.
STOP_WORDS = stopwords.words('english') + list(string.punctuation)
snowBallStemmer = SnowballStemmer("english")

print(len(STOP_WORDS))

In [None]:
# Text Preprocessing
def preprocess(sentence):
    """Turn a sentence into a list of stemmed, lowercase, letters-only tokens.

    The text is lowercased, every character other than ``a``-``z`` or a space
    (digits, punctuation, tabs, newlines, ...) is replaced by a space, the
    result is split on spaces with empty pieces discarded, and each remaining
    token is passed through the module-level ``snowBallStemmer``.
    Stop words are NOT removed.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order; empty when the sentence
        contains no ASCII letters.
    """
    letters_only = re.sub('[^a-z ]', ' ', sentence.lower())
    # Only letters and spaces remain, so a bare split() both separates the
    # tokens and drops the empty strings produced by runs of spaces.
    return [snowBallStemmer.stem(token) for token in letters_only.split()]

In [None]:
# token_pattern=None: the custom tokenizer replaces the default regex
# tokenisation, and leaving the default pattern in place makes scikit-learn
# warn that "token_pattern will not be used since tokenizer is not None".
tfidf = TfidfVectorizer(tokenizer=preprocess, token_pattern=None)
# Fit on premises and hypotheses interleaved as s1[0], s2[0], s1[1], s2[1], ...
# so that rows 2i and 2i+1 of the result belong to pair i.
tfidf_train = tfidf.fit_transform(
    list(chain.from_iterable(zip(train['sentence1'], train['sentence2'])))
)

In [None]:
# Vocabulary learned by the vectorizer, indexed by column.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
len(feature_names) #24672 #21456  #20734

In [None]:
# Vectorize the validation pairs with the already-fitted vocabulary, using the
# same premise/hypothesis interleaving as the training matrix.
tfidf_validate = tfidf.transform(
    list(chain.from_iterable(zip(validate['sentence1'], validate['sentence2'])))
)

In [None]:
# Show the TF-IDF weights of one randomly chosen validation premise.
rdm = randrange(len(validate))

print(validate['sentence1'].iloc[rdm])
# Premises and hypotheses are interleaved, so the premise of pair `rdm` is
# row 2*rdm. Slicing that single row avoids materialising the non-zero
# coordinates of the whole matrix and scanning them in a Python loop.
premise_row = tfidf_validate[2 * rdm]
for col in premise_row.nonzero()[1]:
    print(feature_names[col], ' - ', premise_row[0, col])

In [None]:
# Rows 2i / 2i+1 of tfidf_train hold the premise / hypothesis of pair i, so
# reshaping to one row per pair (default C order) places the two vectors side
# by side: one row of 2 * vocabulary-size features per pair.
X_train = tfidf_train.reshape((train.shape[0], 2 * len(feature_names)))
y_train = train['gold_label'].tolist()

In [None]:
# Sanity check: expect (len(train), 2 * len(feature_names)) from the reshape above.
X_train.shape

In [None]:
# Same pairing trick as for the training matrix: one row per validation pair,
# premise features followed by hypothesis features.
X_validate = tfidf_validate.reshape((validate.shape[0], 2 * len(feature_names)))
y_validate = validate['gold_label'].tolist()

In [None]:
# Sanity check: expect (len(validate), 2 * len(feature_names)) from the reshape above.
X_validate.shape

In [None]:
# Spot check that y_train is aligned with the rows of `train`: the last two
# printed lines should show the same label.
rdm = randrange(len(train))
picked = train.iloc[rdm]
print(picked['sentence1'])
print(picked['sentence2'])
print(picked['gold_label'])
print(y_train[rdm])

# Logistic regression

In [None]:
def plot_cm(cm, labels):
    """Draw a confusion matrix as an annotated heatmap titled with its accuracy.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are labelled "Actual label" and columns
        "Predicted label" on the plot.
    labels : sequence
        Tick labels used for both axes.

    The colour scale runs from 0 to the largest row sum of ``cm``; the title
    shows ``trace(cm) / sum(cm)``. A new 9x9 inch figure is created on each
    call; nothing is returned.
    """
    plt.figure(figsize=(9, 9))

    largest_row_total = max(np.sum(cm, axis=1))
    sns.heatmap(
        cm,
        annot=True,
        fmt=".3f",
        vmin=0,
        vmax=largest_row_total,
        linewidths=.5,
        square=True,
        cmap='plasma',
        xticklabels=labels,
        yticklabels=labels,
    )

    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    # Correct predictions sit on the diagonal.
    accuracy = np.trace(cm) / np.sum(cm)
    plt.title('Accuracy Score: {0}'.format(accuracy), size=15)

In [None]:
# Fit the logistic-regression classifier on the paired TF-IDF features.
log_reg = LogisticRegression(
    C=10,
    max_iter=500,
    n_jobs=-1,
)
log_reg.fit(X_train, y_train)

In [None]:
# Class labels as stored by the fitted model; the confusion-matrix plots use
# this order for their tick labels.
log_reg.classes_

In [None]:
# Shape of the learned coefficient matrix.
log_reg.coef_.shape

In [None]:
# Learned intercept term(s) of the fitted model.
log_reg.intercept_

In [None]:
# Iterations the solver actually ran; compare against max_iter=500 to see
# whether the fit stopped before hitting the cap.
log_reg.n_iter_

In [None]:
# Train score: value of log_reg.score() on the data the model was fitted on
score = log_reg.score(X_train, y_train)
print(score)

In [None]:
# Validate score: value of log_reg.score() on the held-out validation split
score = log_reg.score(X_validate, y_validate)
print(score)

In [None]:
# Train cm
predictions = log_reg.predict(X_train)
# labels= pins the row/column order to log_reg.classes_, which is also what
# plot_cm receives as tick labels, so matrix and axis labels cannot drift apart.
cm = metrics.confusion_matrix(y_train, predictions, labels=log_reg.classes_)
print(cm)
plot_cm(cm, log_reg.classes_)

In [None]:
# Validate cm
predictions = log_reg.predict(X_validate)
# labels= pins the row/column order to log_reg.classes_. Without it the matrix
# is built from the labels that happen to occur in y_validate/predictions,
# which would not line up with the tick labels if a class were absent from
# this split.
cm = metrics.confusion_matrix(y_validate, predictions, labels=log_reg.classes_)
print(cm)
plot_cm(cm, log_reg.classes_)