# Transparent ML in Text Classification using the IMDB Large Movie Review dataset

The source of this dataset can be accessed through the following link: http://ai.stanford.edu/~amaas/data/sentiment/
The dataset provides separate training and test sets, each containing 25,000 instances.

In [5]:
############ Compulsory Standard Library #################
import glob
import numpy as np
import matplotlib.pyplot as plt
import csv

############ Utility Library #################
from prettytable import PrettyTable
import xlsxwriter
import time
import graphviz

############ Sklearn pre-processing Library #################
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split, ShuffleSplit
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics.pairwise import cosine_similarity

############ Sklearn model Library #################
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV

### Load the dataset

In [6]:
def _read_labeled_reviews(pattern, label, corpus, labels):
    """Append every file matching ``pattern`` to ``corpus`` and ``label`` to ``labels``."""
    # glob returns files in filesystem-dependent order; sorting makes the
    # resulting corpus order reproducible across machines and runs.
    for review_path in sorted(glob.glob(pattern)):
        # The context manager closes the handle even if read()/decoding raises.
        with open(review_path, 'r', encoding="utf8") as f:
            corpus.append(f.read())
        labels.append(label)


def load_imdb(path):
    """Load the raw IMDB review texts and their sentiment labels.

    Reads every ``*.txt`` file under ``path/train/{neg,pos}`` and
    ``path/test/{neg,pos}``. Negative reviews are labelled 0 and positive
    reviews 1; within each split the negatives come first.

    Parameters
    ----------
    path : str
        Root directory of the extracted dataset.

    Returns
    -------
    X_train_corpus : list of str
        Full text of each training review.
    y_train : numpy.ndarray
        Label (0 or 1) for each training review.
    X_test_corpus : list of str
        Full text of each test review.
    y_test : numpy.ndarray
        Label (0 or 1) for each test review.
    """
    print("Loading the imdb data")

    X_train_corpus, y_train = [], []
    _read_labeled_reviews(path+"/train/neg/*.txt", 0, X_train_corpus, y_train)
    _read_labeled_reviews(path+"/train/pos/*.txt", 1, X_train_corpus, y_train)

    print("Train Data loaded.")

    X_test_corpus, y_test = [], []
    _read_labeled_reviews(path+"/test/neg/*.txt", 0, X_test_corpus, y_test)
    _read_labeled_reviews(path+"/test/pos/*.txt", 1, X_test_corpus, y_test)

    print("Test Data loaded.")

    return X_train_corpus, np.array(y_train), X_test_corpus, np.array(y_test)

In [9]:
# Load raw review texts and 0/1 sentiment labels for both splits; the dataset
# root is given relative to the notebook's working directory.
X_train_corpus , y_train, X_test_corpus , y_test = load_imdb('../../aclImdb')

Loading the imdb data
Train Data loaded.
Test Data loaded.


### Binarize the Dataset

In [10]:
# Re-wrap the labels as NumPy arrays (load_imdb already returns arrays, so
# this only makes fresh copies).
y_train, y_test = np.array(y_train), np.array(y_test)

# Binary (presence/absence) bigram features, lower-cased; bigrams occurring in
# fewer than 5 training documents are dropped.
tf_vectorizer = CountVectorizer(
    lowercase=True,
    max_df=1.0,
    min_df=5,
    binary=True,
    ngram_range=(2, 2),
)

print('Data Vectorizer Transform start')
print()
X_train = tf_vectorizer.fit_transform(X_train_corpus)

# The test split is intentionally not transformed with this vectorizer; the
# shape printouts and tf_vectorizer.transform(X_test_corpus) step that used to
# follow here are disabled.

Data Vectorizer Transform start



In [11]:
# Feature names of the fitted vectorizer (one string per bigram); `words` is
# what the vocabulary-filtering cells below search through.
words = tf_vectorizer.get_feature_names()
# Last expression: display the number of distinct bigrams that were kept.
len(tf_vectorizer.vocabulary_)

129549

In [None]:
# tf_vectorizer.stop_words_

#### "Not" and "Nothing"

In [12]:
# Vocabulary entries containing the substring "nothing".
voc_nothing = [gram for gram in words if "nothing" in gram]
print(len(voc_nothing))

# Dump one entry per line; the with-block closes the file on exit.
with open('nothing-2grams.txt', mode='w', encoding='utf8') as out_file:
    out_file.writelines(gram + '\n' for gram in voc_nothing)

203


In [13]:
# Vocabulary entries containing the substring "never".
voc_never = [gram for gram in words if "never" in gram]
print(len(voc_never))

# Dump one entry per line; the with-block closes the file on exit.
with open('never-2grams.txt', mode='w', encoding='utf8') as out_file:
    out_file.writelines(gram + '\n' for gram in voc_never)

343


In [14]:
# Vocabulary entries containing "not " (with the trailing space), i.e. entries
# where "not" is immediately followed by a space rather than more letters.
voc_not = [gram for gram in words if "not " in gram]
print(len(voc_not))

# Dump one entry per line; the with-block closes the file on exit.
with open('not-2grams.txt', mode='w', encoding='utf8') as out_file:
    out_file.writelines(gram + '\n' for gram in voc_not)

663


In [15]:
# Vocabulary entries containing the substring "slightly".
voc_slightly = [gram for gram in words if "slightly" in gram]
print(len(voc_slightly))

# Dump one entry per line; the with-block closes the file on exit.
with open('slightly-2grams.txt', mode='w', encoding='utf8') as out_file:
    out_file.writelines(gram + '\n' for gram in voc_slightly)

29


In [16]:
# Vocabulary entries containing "may " (with the trailing space), which keeps
# "maybe" out of this list.
voc_may = [gram for gram in words if "may " in gram]
print(len(voc_may))

# Dump one entry per line; the with-block closes the file on exit.
with open('may-2grams.txt', mode='w', encoding='utf8') as out_file:
    out_file.writelines(gram + '\n' for gram in voc_may)

68


In [17]:
# Vocabulary entries containing the substring "maybe".
voc_maybe = [gram for gram in words if "maybe" in gram]
print(len(voc_maybe))

# Dump one entry per line; the with-block closes the file on exit.
with open('maybe-2grams.txt', mode='w', encoding='utf8') as out_file:
    out_file.writelines(gram + '\n' for gram in voc_maybe)

128


In [18]:
# Vocabulary entries containing the substring "always".
voc_always = [gram for gram in words if "always" in gram]
print(len(voc_always))

# Dump one entry per line; the with-block closes the file on exit.
with open('always-2grams.txt', mode='w', encoding='utf8') as out_file:
    out_file.writelines(gram + '\n' for gram in voc_always)

175


In [19]:
# Vocabulary entries containing the substring "almost".
voc_almost = [gram for gram in words if "almost" in gram]
print(len(voc_almost))

# Dump one entry per line; the with-block closes the file on exit.
with open('almost-2grams.txt', mode='w', encoding='utf8') as out_file:
    out_file.writelines(gram + '\n' for gram in voc_almost)

187


In [20]:
# Re-vectorize both splits using only the "nothing" entries as a fixed vocabulary.
# NOTE(review): scikit-learn documents max_df/min_df as ignored when an explicit
# vocabulary is supplied — confirm these two arguments are meant to be inert here.
vectorizer = CountVectorizer(
    lowercase=True,
    max_df=1.0,
    min_df=5,
    binary=True,
    ngram_range=(2, 2),
    vocabulary=voc_nothing,
)

X_train = vectorizer.fit_transform(X_train_corpus)
X_test = vectorizer.transform(X_test_corpus)

# From here on `words` holds the restricted feature names, replacing the full
# vocabulary list bound earlier in the notebook.
words = vectorizer.get_feature_names()

### Test the Data

In [None]:
# Fit an L2-regularized logistic regression (C=0.5) on the training features.
clf_l2 = LogisticRegression(penalty='l2', C=0.5)
clf_l2.fit(X_train, y_train)
# Print the estimator that was just fitted; `clf_noz_l2` is not defined anywhere
# in this notebook and raised NameError on a fresh kernel.
print(clf_l2)

In [None]:
# Accuracy of the L2 model on the training split, then on the test split.
# Arguments follow accuracy_score's (y_true, y_pred) order; the score itself
# does not depend on the order.
y_pred_1 = clf_l2.predict(X_train)
print(accuracy_score(y_train, y_pred_1))

y_pred_2 = clf_l2.predict(X_test)
print(accuracy_score(y_test, y_pred_2))

In [None]:
# Fit an L1-regularized logistic regression on the training features.
# The solver is pinned to liblinear, which supports the L1 penalty; relying on
# the library default fails on scikit-learn releases whose default solver is
# lbfgs (it rejects penalty='l1'). On older releases liblinear was already the
# default, so results there are unchanged.
clf_l1 = LogisticRegression(penalty='l1', solver='liblinear')
clf_l1.fit(X_train, y_train)

In [None]:
# Accuracy of the L1 model on the training split, then on the test split.
# Arguments follow accuracy_score's (y_true, y_pred) order; the score itself
# does not depend on the order.
y_pred_1 = clf_l1.predict(X_train)
print(accuracy_score(y_train, y_pred_1))

y_pred_2 = clf_l1.predict(X_test)
print(accuracy_score(y_test, y_pred_2))

In [None]:
def print_features(clf_L1, clf_L2, f_names, msg, iter_range=10):
    """Print and compare the highest-weighted features of two linear models.

    Features are ranked by the absolute value of their coefficient in row 0 of
    each model's ``coef_``. The top ``iter_range`` features of both models are
    printed side by side in a PrettyTable, followed by the features that occur
    in both top lists.

    Parameters
    ----------
    clf_L1, clf_L2 : objects with a 2-D ``coef_`` array
        The L1- and L2-regularized models to compare.
    f_names : sequence of str
        Feature name for each coefficient column.
    msg : str
        Text appended to the printed header line.
    iter_range : int, default 10
        Number of top features to show per model; clamped to the number of
        available coefficients.

    Returns
    -------
    list of str
        Names of the features present in both top lists, in L1 rank order.
    """
    # Never ask for more rows than there are coefficients (or fewer than zero).
    n_available = min(clf_L1.coef_.shape[1], clf_L2.coef_.shape[1])
    top_n = max(0, min(iter_range, n_available))

    # Plain Python ints: a fixed-width int16 buffer would wrap around for
    # feature indices above 32767.
    top_L2 = [int(i) for i in np.argsort(np.absolute(clf_L2.coef_)[0, :])[::-1][:top_n]]
    top_L1 = [int(i) for i in np.argsort(np.absolute(clf_L1.coef_)[0, :])[::-1][:top_n]]

    ### Print on Pretty Table
    table_features = PrettyTable(['Rank', 'L2 Features', 'L2 Weight', 'L1 Features', 'L1 Weight'])
    for rank, (i2, i1) in enumerate(zip(top_L2, top_L1), start=1):
        table_features.add_row([rank,
                                f_names[i2],
                                np.around(clf_L2.coef_[0, i2], decimals=4),
                                f_names[i1],
                                np.around(clf_L1.coef_[0, i1], decimals=4)])

    print('L2 and L1-regularized Logistic Regression Classifier', msg)
    # Report the number of rows actually shown rather than a fixed "10".
    print('Top {} features and weights (with absolute value)'.format(top_n))
    print()
    print(table_features)
    print(' ')
    print('List of features in both L1 and L2 penalty :')

    # Set lookup replaces the quadratic pairwise comparison; L1 rank order is kept.
    in_top_L2 = set(top_L2)
    f_list = [f_names[i] for i in top_L1 if i in in_top_L2]

    for v in f_list:
        print(v)
    print(len(f_list), 'Similar Features')

    return f_list

In [None]:
# Compare the top 20 features of the L1 and L2 models. `words` holds the feature
# names of the fitted vectorizer; the previously passed `f_names` is not defined
# anywhere in this notebook and raised NameError.
f_noz_list = print_features(clf_l1, clf_l2, words, 'without Z-score scaling', iter_range=20)

In [None]:

# Column sums of the binary training matrix = number of training documents
# containing each feature; .A1 flattens the 1 x n matrix result to a 1-D array.
freq = np.sum(X_train, axis=0).A1
weights = clf_l2.coef_[0]
# Feature indices ordered by ascending frequency.
fi = np.argsort(np.absolute(freq))

# (feature, document count, L2 weight) triples, most frequent first.
[(words[idx], freq[idx], weights[idx]) for idx in fi[::-1]]

In [None]:
# NOTE(review): this import sits at the bottom of the notebook; consider moving
# it into the import cell at the top with the other dependencies.
from termcolor import colored
# Quick check that coloured terminal output renders: two words in different
# colours, then a whole sentence in red.
print(colored('hello', 'red'), colored('world', 'green'))
print(colored("hello red world", 'red'))