In [3]:
import os
import sys
import time
import re
import string
import unicodecsv as csv
import unicodedata as un
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [4]:
# Checking the raw file
file = open('./data/aclImdb/train/pos/0_9.txt')
print(file.read())

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


In [5]:
# Reading train/text files
pd_train = pd.read_csv('./data/aclImdb/train.csv')
pd_test = pd.read_csv('./data/aclImdb/test.csv')

In [44]:
pd_train.shape

(25000, 2)

In [6]:
pd_train.head()

Unnamed: 0,label,data
0,1,story of a man who has unnatural feelings for ...
1,1,airport starts as a brand new luxury plane ...
2,1,this film lacked something i couldn t put my f...
3,1,sorry everyone i know this is supposed to b...
4,1,when i was little my parents took me along to ...


In [7]:
train_token_count = 0
test_token_count = 0

for each_row in pd_train['data']:
    train_token_count += len(each_row)
    
for each_row in pd_test['data']:
    test_token_count += len(each_row)

print("Number of tokens in train = ", train_token_count)
print("Number of tokens in test = ", test_token_count)

Number of tokens in train =  32430009
Number of tokens in test =  31661792


In [8]:
print("Avg Number of tokens in each sentence in train = ", train_token_count/pd_train.shape[0])
print("Avg Number of tokens in each sentence in test = ", test_token_count/pd_test.shape[0])

Avg Number of tokens in each sentence in train =  1297.20036
Avg Number of tokens in each sentence in test =  1266.47168


In [9]:
from nltk.util import ngrams
from collections import Counter

# Putting all data into a list
train_tokens_list = pd_train.data.tolist()
test_tokens_list = pd_test.data.tolist()

In [10]:
# Testing if the token_list can be easily tokenized
train_tokens_list[10].split()[5]

'pant'

In [11]:
# Testing if the token_list can be easily tokenized
test_tokens_list[10].split()[5]

'gets'

In [12]:
tokens = []
for each in train_tokens_list:
    for i in each.split():
        tokens.append(i)

In [13]:
# Checking the most commong trigrams
text_trigrams = ngrams(tokens, 3)

Counter(text_trigrams).most_common(10)

[(('one', 'of', 'the'), 4941),
 (('i', 'don', 't'), 2705),
 (('this', 'movie', 'is'), 2674),
 (('of', 'the', 'film'), 2611),
 (('this', 'is', 'a'), 2379),
 (('it', 's', 'a'), 2376),
 (('a', 'lot', 'of'), 2276),
 (('of', 'the', 'movie'), 2182),
 (('some', 'of', 'the'), 1909),
 (('the', 'film', 'is'), 1872)]

In [14]:
# Checking the most commong trigrams
text_trigrams = ngrams(tokens, 4)

Counter(text_trigrams).most_common(10)

[(('is', 'one', 'of', 'the'), 1100),
 (('the', 'rest', 'of', 'the'), 1060),
 (('one', 'of', 'the', 'most'), 872),
 (('one', 'of', 'the', 'best'), 749),
 (('the', 'end', 'of', 'the'), 746),
 (('i', 'don', 't', 'know'), 703),
 (('this', 'is', 'one', 'of'), 623),
 (('i', 'have', 'ever', 'seen'), 616),
 (('i', 'don', 't', 'think'), 527),
 (('i', 've', 'ever', 'seen'), 521)]

In [43]:
# Checking the most commong trigrams
text_unigram = ngrams(tokens, 1)

c = Counter(text_unigram)

# Get total count of unique words
len(c)

74870

In [17]:
from nltk import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a tokenizer
tokenizer = TweetTokenizer()

In [18]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)

In [47]:
# Merge train and test list
full_text = train_tokens_list + test_tokens_list

In [48]:
unique_count = Counter(full_text)

In [20]:
vectorizer.fit(full_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x0000020F2C7D7C50>>,
        use_idf=True, vocabulary=None)

In [52]:
len(vectorizer.get_feature_names())

2342118

In [56]:
print(vectorizer.get_feature_names()[1:10])

['a a', 'a aa', 'a abandoned', 'a about', 'a absent', 'a absolute', 'a absolutely', 'a absolutley', 'a absurd']


In [58]:
# Shuffle the data
pd_train = pd_train.sample(frac=1).reset_index(drop=True)
pd_test = pd_test.sample(frac=1).reset_index(drop=True)

In [59]:
# Check the data after shuffled
pd_train.head()

Unnamed: 0,label,data
0,1,i began watching this movie with my girl frien...
1,1,yes mtv there really is a way to market daria...
2,2,really i think this movie is more an example ...
3,2,gbs wrote his own screen adaptation of this no...
4,1,visually speaking this film is stunning it h...


In [61]:
# Transform our input train/test data
train_vectorized = vectorizer.transform(pd_train.data)
test_vectorized = vectorizer.transform(pd_test.data)

In [62]:
# Check the shape of transformed vector
print(train_vectorized.shape)

(25000, 2342118)


In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)

In [None]:
%%time

ovr.fit(train_vectorized, pd_train.label.values)
acc = ovr.score(test_vectorized, pd_test.label.values)

In [136]:
print("Testing accuracy: {:2.3%}".format(acc))
# print(f"Testing accuracy: {acc}")

Testing accuracy: 88.616%


In [153]:
# Testing with random new review
review = ["This movie is fine", "Shah Rukh acting was well"]
review_vectorized = vectorizer.transform(review)
y_pred = ovr.predict(review_vectorized)

In [154]:
pred_review = "Positive"

for each_review, each_y_pred in zip(review, y_pred):
    if each_y_pred == 1:
        pred_review = "Negative"
    else:
        pred_review = "Positive"
    print("{0}: {1}".format(each_review, pred_review))

This movie is fine: Positive
Shah Rukh acting was well: Positive


In [74]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(ovr, train_vectorized, pd_train.label.values, scoring='accuracy', n_jobs=-1, cv=10)

In [76]:
import numpy as np
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 88.91%, std 0.63.


In [77]:
y_pred = cross_val_predict(ovr, X, y, cv=3)

[0.882  0.8896 0.8884 0.9036 0.8864 0.8824 0.8856 0.8876 0.8884 0.8972]


In [None]:
'''
    Try with different algorithms
        - LogReg/SVM/NB/DT/NN
        - CrossVal
    
    Report the accuracy of each algorithms
    
    
'''