In [4]:
'''
    Author - Oyesh Mann Singh
    Date - 11/05/2018
    Description 
        - Practising NLP
        - Text preprocessing
        - Analyzing various ML algorithms
'''

import os
import sys
import time
import re
import string
import unicodecsv as csv
import unicodedata as un
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [5]:
# Checking the raw file
file = open('./data/aclImdb/train/pos/0_9.txt')
print(file.read())

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


In [6]:
# Reading train/text files
pd_train = pd.read_csv('./data/aclImdb/train.csv')
pd_test = pd.read_csv('./data/aclImdb/test.csv')

In [7]:
pd_train.shape

(25000, 2)

In [8]:
pd_train.head()

Unnamed: 0,label,data
0,1,story of a man who has unnatural feelings for ...
1,1,airport starts as a brand new luxury plane ...
2,1,this film lacked something i couldn t put my f...
3,1,sorry everyone i know this is supposed to b...
4,1,when i was little my parents took me along to ...


In [9]:
train_token_count = 0
test_token_count = 0

for each_row in pd_train['data']:
    train_token_count += len(each_row)
    
for each_row in pd_test['data']:
    test_token_count += len(each_row)

print("Number of tokens in train = ", train_token_count)
print("Number of tokens in test = ", test_token_count)

Number of tokens in train =  32430009
Number of tokens in test =  31661792


In [10]:
print("Avg Number of tokens in each sentence in train = ", train_token_count/pd_train.shape[0])
print("Avg Number of tokens in each sentence in test = ", test_token_count/pd_test.shape[0])

Avg Number of tokens in each sentence in train =  1297.20036
Avg Number of tokens in each sentence in test =  1266.47168


In [11]:
from nltk.util import ngrams
from collections import Counter

# Putting all data into a list
train_tokens_list = pd_train.data.tolist()
test_tokens_list = pd_test.data.tolist()

In [12]:
# Testing if the token_list can be easily tokenized
train_tokens_list[10].split()[5]

'pant'

In [13]:
# Testing if the token_list can be easily tokenized
test_tokens_list[10].split()[5]

'gets'

In [14]:
tokens = []
for each in train_tokens_list:
    for i in each.split():
        tokens.append(i)

In [15]:
# Checking the most commong trigrams
text_trigrams = ngrams(tokens, 3)

Counter(text_trigrams).most_common(10)

[(('one', 'of', 'the'), 4941),
 (('i', 'don', 't'), 2705),
 (('this', 'movie', 'is'), 2674),
 (('of', 'the', 'film'), 2611),
 (('this', 'is', 'a'), 2379),
 (('it', 's', 'a'), 2376),
 (('a', 'lot', 'of'), 2276),
 (('of', 'the', 'movie'), 2182),
 (('some', 'of', 'the'), 1909),
 (('the', 'film', 'is'), 1872)]

In [16]:
# Checking the most commong trigrams
text_trigrams = ngrams(tokens, 4)

Counter(text_trigrams).most_common(10)

[(('is', 'one', 'of', 'the'), 1100),
 (('the', 'rest', 'of', 'the'), 1060),
 (('one', 'of', 'the', 'most'), 872),
 (('one', 'of', 'the', 'best'), 749),
 (('the', 'end', 'of', 'the'), 746),
 (('i', 'don', 't', 'know'), 703),
 (('this', 'is', 'one', 'of'), 623),
 (('i', 'have', 'ever', 'seen'), 616),
 (('i', 'don', 't', 'think'), 527),
 (('i', 've', 'ever', 'seen'), 521)]

In [17]:
# Checking the most commong trigrams
text_unigram = ngrams(tokens, 1)

c = Counter(text_unigram)

# Get total count of unique words
len(c)

74870

In [18]:
from nltk import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a tokenizer
tokenizer = TweetTokenizer()

In [19]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)

In [20]:
# Merge train and test list
full_text = train_tokens_list + test_tokens_list

In [21]:
unique_count = Counter(full_text)

In [22]:
vectorizer.fit(full_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x0000026F390F8BA8>>,
        use_idf=True, vocabulary=None)

In [23]:
len(vectorizer.get_feature_names())

2342118

In [24]:
print(vectorizer.get_feature_names()[1:10])

['a a', 'a aa', 'a abandoned', 'a about', 'a absent', 'a absolute', 'a absolutely', 'a absolutley', 'a absurd']


In [25]:
# Shuffle the data
pd_train = pd_train.sample(frac=1).reset_index(drop=True)
pd_test = pd_test.sample(frac=1).reset_index(drop=True)

In [26]:
# Check the data after shuffled
pd_train.head()

Unnamed: 0,label,data
0,2,gene tierney and dana andrews who were both s...
1,2,one of the best amitabh comeback movies i li...
2,1,star rating the works just misses ...
3,2,its about time that gunga din is released on d...
4,2,trapped buried alive brings us to a resort th...


In [27]:
# Transform our input train/test data
train_vectorized = vectorizer.transform(pd_train.data)
test_vectorized = vectorizer.transform(pd_test.data)

In [28]:
# Check the shape of transformed vector
print(train_vectorized.shape)

(25000, 2342118)


In [85]:
# Testing with random new review
review = ["This movie is fine", "The movie is not fine"]

# Print the prediction of new reviews
def print_prediction(review, est):
    review_vectorized = vectorizer.transform(review)
    y_pred = est.predict(review_vectorized)
    for each_review, each_y_pred in zip(review, y_pred):
        if each_y_pred == 1:
            pred_review = "Negative"
        else:
            pred_review = "Positive"
        print("{0}: {1}".format(each_review, pred_review))
        
        
def print_score(est, acc):
    print("Accuracy of {0}: {1:2.3%}".format(est, acc))

# MACHINE LEARNING SECTION STARTS

In [29]:
%%time
'''
    Logistic Regression
'''
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)

ovr.fit(train_vectorized, pd_train.label.values)
ovr_acc = ovr.score(test_vectorized, pd_test.label.values)

Wall time: 7.13 s


In [30]:
print("One vs Rest Logisitic Regression accuracy = ", ovr_acc)

One vs Rest Logisitic Regression accuracy =  0.88616


In [33]:
print_prediction(review, ovr)

This movie is fine: Positive
The movie is not fine: Negative


In [34]:
%%time
'''
    Cross Validation Logistic Regression
'''
from sklearn.model_selection import cross_validate, cross_val_score
import numpy as np

cv_ovr = LogisticRegression()

cv_ovr = cross_validate(cv_ovr, train_vectorized, pd_train.label.values, scoring='accuracy', n_jobs=-1, cv=10)

Wall time: 47 s


In [35]:
print(cv_ovr['test_score'])

[0.8768 0.8872 0.8908 0.8928 0.894  0.886  0.8932 0.8904 0.8828 0.894 ]


In [36]:
print('Cross-validation mean (test) accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(cv_ovr['test_score']) * 100, np.std(cv_ovr['test_score']) * 100))

Cross-validation mean (test) accuracy 88.88%, std 0.54.


In [92]:
%%time

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

ovr = LogisticRegression()

param_grid = {'C': [1, 10, 100, 1000] }

ovr_grid = GridSearchCV(ovr, param_grid=param_grid, cv=5, refit=True, verbose=1)

ovr_grid_fit = ovr_grid.fit(train_vectorized, pd_train.label.values)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  4.9min finished


Wall time: 5min 16s


In [95]:
ovr_grid_score = ovr_grid_fit.score(test_vectorized, pd_test.label.values)
print_acc(acc=ovr_grid_score, est='Logistic Regression CV')

Accuracy of Logistic Regression CV: 90.384%


In [96]:
# Testing with random new review
print_prediction(review, ovr_grid_fit)

Movie was good: Positive
There was normal acting: Negative


In [38]:
%%time
'''
    SVM
'''

from sklearn.svm import SVC

my_svm = SVC()

my_svm.fit(train_vectorized, pd_train.label.values)

svm_acc = my_svm.score(test_vectorized, pd_test.label.values)
print('SVM accuracy', svm_acc)

SVM accuracy 0.6966
Wall time: 1h 2min 31s


In [39]:
'''
    SVM accuracy 0.6966
    Wall time: 1h 2min 31s
'''

# Evaluate the trained algorithm with new data
print_prediction(review, my_svm)

This movie is fine: Positive
The movie is not fine: Positive


In [None]:
%%time
'''
    Grid Search CV SVM
'''
from sklearn.model_selection import GridSearchCV

svm_grid = SVC()

param_grid = {'C': [1, 10, 100, 1000], 'gamma': [1e-3, 1e-4]}

svm_grid = GridSearchCV(svm_grid, param_grid=param_grid, cv=5, refit=True, verbose=1)

svm_grid_fit = svm_grid.fit(train_vectorized, pd_train.label.values)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [89]:
%%time
'''
    Naive Bayes MultinomialNB
'''

from sklearn.naive_bayes import MultinomialNB, BernoulliNB 

nb_clf = MultinomialNB()
nb_clf.fit(X=train_vectorized, y=pd_train.label.values)
nb_score = nb_clf.score(test_vectorized, pd_test.label.values)

print_acc('MultinomialNB', nb_score)

review = ['Movie was good', 'There was normal acting']
print_prediction(review, nb_clf)

Accuracy of MultinomialNB: 86.336%
Movie was good: Negative
There was normal acting: Negative
Wall time: 2.12 s


In [90]:
%%time
'''
    Naive Bayes BernoulliNB
'''

nb_bern_clf = BernoulliNB()
nb_bern_clf.fit(X=train_vectorized, y=pd_train.label.values)
nb_bern_score = nb_bern_clf.score(test_vectorized, pd_test.label.values)

print_acc('MultinomialNB', nb_bern_score)

print_prediction(review, nb_bern_clf)

Accuracy of MultinomialNB: 85.084%
Movie was good: Negative
There was normal acting: Negative
Wall time: 1.25 s


In [97]:
%%time
'''
    KNN Classifier
'''

from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
knn_clf.fit(X=train_vectorized, y=pd_train.label.values)
knn_score = knn_clf.score(test_vectorized, pd_test.label.values)

print_acc('KNN Classifier', knn_score)

print_prediction(review, knn_clf)

Accuracy of KNN Classifier: 68.012%
Movie was good: Positive
There was normal acting: Negative
Wall time: 5min 32s


In [None]:
'''
    - Perceptron (NN)
    - Ensemble
    - PCA
'''