In [80]:
from IPython.display import clear_output, HTML
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import zipfile
from bs4 import BeautifulSoup
import re
import nltk

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
#nltk.download()
from nltk.corpus import stopwords

### Data preprocessing

In [7]:
# read the Kaggle training set directly from the zipped TSV
# (the archive path is relative to the current working directory)
with zipfile.ZipFile('labeledTrainData.tsv.zip') as archive:
    first_member = archive.namelist()[0]
    with archive.open(first_member) as tsv_handle:
        # quoting=3 (csv.QUOTE_NONE): quote characters are kept as ordinary text
        train = pd.read_csv(tsv_handle, header=0, delimiter="\t", quoting=3)

In [10]:
def review_to_words(raw_review, remove_stopwords=False):
    '''Normalise one raw review into a single space-separated string of words.

    The markup is stripped with BeautifulSoup, every character that is not an
    ASCII letter is replaced by a space, and the text is lower-cased and split
    on whitespace. With remove_stopwords=True the NLTK English stop words are
    dropped as well.
    '''
    # strip html markup
    plain_text = BeautifulSoup(raw_review, 'lxml').get_text()
    # keep letters only, then lower-case and tokenise
    tokens = re.sub('[^a-zA-Z]', " ", plain_text).lower().split()
    if not remove_stopwords:
        return " ".join(tokens)
    # set membership is cheaper than scanning the stop-word list per token
    stop_set = set(stopwords.words("english"))
    return " ".join(tok for tok in tokens if tok not in stop_set)

In [20]:
# Example: one raw review, as loaded from the file
print(train["review"][5])

"I dont know why people think this is such a bad movie. Its got a pretty good plot, some good action, and the change of location for Harry does not hurt either. Sure some of its offensive and gratuitous but this is not the only movie like that. Eastwood is in good form as Dirty Harry, and I liked Pat Hingle in this movie as the small town cop. If you liked DIRTY HARRY, then you should see this one, its a lot better than THE DEAD POOL. 4/5"


In [21]:
# the same review after passing through review_to_words
print(review_to_words(train["review"][5]))

i dont know why people think this is such a bad movie its got a pretty good plot some good action and the change of location for harry does not hurt either sure some of its offensive and gratuitous but this is not the only movie like that eastwood is in good form as dirty harry and i liked pat hingle in this movie as the small town cop if you liked dirty harry then you should see this one its a lot better than the dead pool


In [22]:
# build the list of cleaned reviews, refreshing a progress line every 5000 items
num_reviews = len(train)
clean_reviews = []

for idx in range(num_reviews):
    done = idx + 1
    if done % 5000 == 0:
        # replace the previous progress message instead of stacking lines
        clear_output()
        print("Review {0:d} of {1:d}".format(done, num_reviews))
    cleaned = review_to_words(train["review"][idx])
    clean_reviews.append(cleaned)

Review 25000 of 25000


In [23]:
# spot-check: entry 5 of the cleaned list
print(clean_reviews[5])

i dont know why people think this is such a bad movie its got a pretty good plot some good action and the change of location for harry does not hurt either sure some of its offensive and gratuitous but this is not the only movie like that eastwood is in good form as dirty harry and i liked pat hingle in this movie as the small town cop if you liked dirty harry then you should see this one its a lot better than the dead pool


### Creating Bag of Words

In [26]:
# CountVectorizer converts a collection of text documents to a matrix of token counts;
# max_features caps the vocabulary at 5000 terms
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# learn the vocabulary, count tokens per review, and convert the result to a dense array
train_data_features = (
    vectorizer
    .fit_transform(clean_reviews)
    .toarray()
)

In [30]:
# matrix shape: (number of reviews, vocabulary size)
print(train_data_features.shape)

(25000, 5000)


In [33]:
# get vocabulary
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# list(...) keeps `vocab` a plain list in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = list(vectorizer.get_feature_names())
# total count of every vocabulary word over all reviews (column sums)
dist  = np.sum(train_data_features,axis=0)
print('Vocab size =', len(vocab), len(vocab)==len(dist))

Vocab size = 5000 True


In [208]:
# print how many times each word occurs in the text
#for tag,count in zip(vocab,dist):
#    print("{0:12s}: {1:d}".format(tag, count))

In [44]:
# features: bag-of-words counts; target: the sentiment column of the training frame
X = train_data_features
y = train.sentiment

# hold out 20% for testing; a fixed random_state makes the split (and every
# score computed from it) reproducible across kernel restarts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Modelling 1: Random Forests (Sklearn)

In [206]:
%%time
# 100-tree random forest. oob_score=True keeps the out-of-bag predictions used
# for evaluation below, n_jobs=-1 uses all available cores, and random_state
# pins the bootstrap / feature sampling so the reported AUCs are reproducible.
forest = RandomForestClassifier(
    n_estimators=100, oob_score=True, n_jobs=-1, random_state=42
)
forest.fit(X_train,y_train)

Wall time: 29.3 s


In [207]:
# scores are taken from column 1 of the probability matrices
# (presumably the positive-sentiment class -- confirm against forest.classes_)
p_test = forest.predict_proba(X_test)[:, 1]
p_oob = forest.oob_decision_function_[:, 1]

# out-of-bag scores are compared with the training labels, test scores with the held-out labels
for label, truth, scores in (
    ('AUC on OOB data:  ', y_train, p_oob),
    ('AUC on test data: ', y_test, p_test),
):
    print(label, metrics.roc_auc_score(truth, scores))

AUC on OOB data:   0.901645449592
AUC on test data:  0.91476558905


### Modelling 2: Feed-Forward Neural Network (PyTorch)

In [203]:
# build a model
class BOW_classifier(nn.Module):
    '''Feed-forward classifier over bag-of-words count vectors.

    Three fully connected layers (vocab_size -> 100 -> 20 -> num_labels) with
    ReLU after the first two. The output is log-probabilities, so it pairs
    with nn.NLLLoss.

    Args:
        num_labels: number of output classes.
        vocab_size: length of each input bag-of-words vector.
    '''
    def __init__(self,num_labels,vocab_size):
        super(BOW_classifier,self).__init__()
        self.linear1 = nn.Linear(vocab_size,100)
        self.linear2 = nn.Linear(100,20)
        self.linear3 = nn.Linear(20,num_labels)

    def forward(self,bow_vec):
        '''Return log-probabilities over the labels for a float tensor of counts.

        bow_vec has vocab_size entries along its last axis; the result has
        num_labels entries along its last axis.
        '''
        hidden1 = F.relu(self.linear1(bow_vec))
        hidden2 = F.relu(self.linear2(hidden1))
        # normalise over the label axis (the last one) explicitly: relying on
        # the implicit dim is deprecated and emits a warning on every call
        return F.log_softmax(self.linear3(hidden2), dim=-1)

model = BOW_classifier(2,vocab_size=5000)

In [204]:
%%time

# mini-batch SGD; only whole batches are used, so rows past
# num_batches * batch_size are never visited
batch_size = 200
num_batches = X_train.shape[0] // batch_size

# NLLLoss consumes the log-probabilities produced by the model's log_softmax output
loss_fun = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(50):
    for start in range(0, num_batches * batch_size, batch_size):
        stop = start + batch_size
        # clear gradients accumulated by the previous step
        model.zero_grad()
        batch = Variable(torch.from_numpy(X_train[start:stop, :]).type(torch.FloatTensor))
        target = Variable(torch.from_numpy(y_train.values[start:stop]))

        loss = loss_fun(model(batch), target)
        loss.backward()
        optimizer.step()
    # report the loss of the last batch every 10th epoch
    if epoch % 10 == 0:
        print(epoch, loss.data.numpy())

0 [ 0.56808984]
10 [ 0.36756164]
20 [ 0.32868358]
30 [ 0.20889403]
40 [ 0.20131597]
Wall time: 46.1 s


In [205]:
def _class1_probability(features):
    '''Run the model on a numpy feature matrix and return exp(log-prob) of column 1 as a numpy array.'''
    inputs = Variable(torch.from_numpy(features).type(torch.FloatTensor))
    log_probs = model(inputs).data
    # the model outputs log-probabilities, so exponentiate to get probabilities
    return torch.exp(log_probs[:, 1]).numpy()

p_test = _class1_probability(X_test)
p_train = _class1_probability(X_train)

print('AUC on train data: ',metrics.roc_auc_score(y_train,p_train))
print('AUC on test data:  ',metrics.roc_auc_score(y_test,p_test))

AUC on train data:  0.989408156187
AUC on test data:   0.939915973916
