In [6]:
import torch
import torch.nn as nn
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split
import os
import sys
import tarfile
import time
import urllib.request

In [7]:


# Remote location of the IMDB review archive and the local file name it is saved under.
source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
target = 'aclImdb_v1.tar.gz'

# Remove any file already stored under the archive name. Because of this, the
# `os.path.isfile(...)` part of the download guard further down in this cell is
# evaluated after the archive has been deleted.
if os.path.exists(target):
    os.remove(target)

def reporthook(count, block_size, total_size):
    """Progress callback for ``urllib.request.urlretrieve``.

    Writes a single, carriage-return-refreshed status line to stdout showing
    the percentage done, the amount transferred, the average speed and the
    elapsed time.

    Parameters
    ----------
    count : int
        Number of blocks transferred so far. A value of 0 only starts the
        timer (stored in the module-level ``start_time``) and prints nothing.
    block_size : int
        Size of one block in bytes.
    total_size : int
        Total size of the download in bytes. Values <= 0 mean the size is
        unknown; the percentage is then shown as ``?%`` instead of dividing
        by zero or printing a negative number.
    """
    global start_time
    if count == 0:
        start_time = time.time()
        return
    duration = time.time() - start_time
    progress_size = int(count * block_size)
    if total_size > 0:
        # The last block is usually partial, so count * block_size can
        # overshoot the real size; clamp so the display never exceeds 100%.
        progress_size = min(progress_size, total_size)
        percent_text = f'{int(progress_size * 100. / total_size)}%'
    else:
        percent_text = '?%'
    # The small constant keeps the division defined when no time has elapsed.
    speed = progress_size / (1024.**2 * duration+0.01)

    sys.stdout.write(f'\r{percent_text} | {progress_size / (1024.**2):.2f} MB '
                     f'| {speed:.2f} MB/s | {duration:.2f} sec elapsed')
    sys.stdout.flush()


# Download only when neither the extracted folder nor the archive is present.
# `target` is reused so the existence check and the download always refer to
# the same file name.
if not os.path.isdir('aclImdb') and not os.path.isfile(target):
    urllib.request.urlretrieve(source, target, reporthook)

100% | 80.23 MB | 3.66 MB/s | 21.93 sec elapsed

In [8]:
# Unpack the archive into the working directory unless that was already done.
if not os.path.isdir('aclImdb'):

    with tarfile.open(target, 'r:gz') as tar:
        # The archive is fetched over plain HTTP, so treat its member names as
        # untrusted: use the 'data' extraction filter where the running Python
        # provides it (it rejects absolute paths, path traversal and special
        # files) and fall back to the old behavior on interpreters without it.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(filter='data')
        else:
            tar.extractall()

In [9]:
!pip install pyprind



In [10]:
import pyprind
import pandas as pd
import os
import sys


In [11]:
basepath = 'aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000, stream=sys.stdout)

# Collect (review text, label) pairs in a plain list and build the frame once.
# Appending to a DataFrame row by row copies the whole frame on every
# iteration, and `DataFrame.append` no longer exists in pandas 2.x.
records = []
for split in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        path = os.path.join(basepath, split, label_name)
        # Sorted so the row order does not depend on the file system.
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file),
                      'r', encoding='utf-8') as infile:
                records.append((infile.read(), labels[label_name]))
            pbar.update()
df = pd.DataFrame(records, columns=['review', 'sentiment'])

In [12]:
import numpy as np

# Shuffle the rows with a fixed seed so the permutation is repeatable.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
print(df.shape)
df.head()

(50000, 2)


Unnamed: 0,review,sentiment
11841,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
19602,OK... so... I really like Kris Kristofferson a...,0
45519,"***SPOILER*** Do not read this, if you think a...",0
25747,hi for all the people who have seen this wonde...,1
42642,"I recently bought the DVD, forgetting just how...",0


In [13]:
# Replace the shuffled index with 0..n-1; the old index is kept as an 'index' column.
df = df.reset_index()
n_positive = df.sentiment.sum()
n_negative = df.shape[0] - df.sentiment.sum()
print('Number of positive/negative reviews are {}/{}'.format(n_positive, n_negative))

Number of positive/negative reviews are 25000/25000


In [14]:
# Discard the 'index' column left behind by the earlier `reset_index` call.
# `errors='ignore'` keeps this cell re-runnable: a second execution finds the
# column already gone and would otherwise raise a KeyError.
df = df.drop(columns=['index'], errors='ignore')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [15]:
# Show the first ten raw reviews, each followed by an 80-character rule.
separator = '-' * 80
for idx in range(10):
    print(df['review'].values[idx], separator, sep='\n')

In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70's, they discover the criminal and a net of power and money to cover the murder.<br /><br />"Murder in Greenwich" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich family 

In [16]:
import re
def preprocessor(text):
    """Clean one raw review and return the normalized string.

    Steps, in order:
      1. strip anything that looks like an HTML tag (``<...>``);
      2. collect emoticons such as ``:)``, ``;-(``, ``=D`` from the tag-free
         text (before lower-casing, so ``D``/``P`` keep their case);
      3. lower-case the text and collapse every run of non-word characters
         into a single space;
      4. append the emoticons, space-separated and with the ``-`` nose removed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text followed by the emoticons found in it.
    """
    # Raw strings: the patterns contain `\)`, `\(` and `\W`, which are invalid
    # escape sequences in ordinary string literals.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_suffix = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon would be glued onto the last word
    # whenever the cleaned text ends in a word character
    # (e.g. 'good :) movie' -> 'good movie:)').
    if emoticon_suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_suffix

In [17]:
# Spot-check the cleaner on a single review (row 9).
sample_review = df['review'].values[9]
preprocessor(sample_review)

'this movie is directed by renny harlin the finnish miracle stallone is gabe walker cat and mouse on the mountains with ruthless terrorists renny harlin knows how to direct actionmovie stallone needed this role to get back on track snowy mountain is very good place for action movie and who is better to direct movie where is snow ice cold and bad weather than finnish man action is good music in the film is spectacular the bad guy is john litghow other stars micheal rooker the portrait of serialkiller janine turner strong medicine the is placed in beautiful place and it is very exciting movie overall good movie remember extreme ääliöt special collectors edition with good extras comig soon in finland straight to video '

In [18]:
# Clean every review and write the result back into the 'review' column.
cleaned_reviews = df['review'].apply(preprocessor)
df['review'] = cleaned_reviews

In [19]:
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords

# Stemmer instance used by `tokenizer_porter` defined later in this cell.
porter = PorterStemmer()

# Fetch the NLTK stop-word corpus before it is read by the next statement.
nltk.download('stopwords')

# English stop-word list from NLTK.
stop = stopwords.words('english')

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens


def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem each token with the module-level ``porter`` stemmer."""
    return list(map(porter.stem, text.split()))



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


In [21]:
# TF-IDF features with the vocabulary capped at 1000 terms. Lower-casing is
# switched off here because the reviews were already lower-cased by `preprocessor`.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
    max_features=1000,
)

# Vectorizer followed by a liblinear logistic regression.
lr_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', LogisticRegression(solver='liblinear')),
])

# Search space: n-gram range, stop-word removal, plain vs. stemming tokenizer,
# regularization type and strength.
small_param_grid = dict(
    vect__ngram_range=[(1, 1), (2, 3)],
    vect__stop_words=[stop, None],
    vect__tokenizer=[tokenizer, tokenizer_porter],
    clf__penalty=['l1', 'l2'],
    clf__C=[1.0, 10.0],
)

# 5-fold cross-validated search scored on accuracy.
gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=small_param_grid,
    scoring='accuracy',
    cv=5,
    verbose=3,
    n_jobs=-1,
)

In [22]:
# Hold out 25% of the reviews for testing; the split is seeded for repeatability.
# The 'review' column is split directly, which yields the same Series as
# splitting a one-column frame and selecting 'review' afterwards.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.25,
    random_state=53,
)

In [23]:
# Run the cross-validated grid search for the logistic-regression pipeline on the training split.
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False,
                                                        max_features=1000)),
                                       ('clf',
                                        LogisticRegression(solver='liblinear'))]),
             n_jobs=-1,
             param_grid={'clf__C': [1.0, 10.0], 'clf__penalty': ['l1', 'l2'],
                         'vect__ngram_range': [(1, 1), (2, 3)],
                         'vect__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                               'our', 'ours', 'ourselves',
                                               'you', "you're", "you've",
                                               "you'll", "you'd", 'your',
                                               'yours', 'yourself',
                                               'yourselves', 'he', 'him', 'his',
                                   

In [24]:
# Parameter combination selected by the logistic-regression grid search.
gs_lr_tfidf.best_params_

{'clf__C': 1.0,
 'clf__penalty': 'l1',
 'vect__ngram_range': (1, 1),
 'vect__stop_words': None,
 'vect__tokenizer': <function __main__.tokenizer_porter>}

In [25]:
# Cross-validated score (the search uses scoring='accuracy') of the selected combination.
gs_lr_tfidf.best_score_

0.8717066666666666

In [26]:
# Standalone logistic-regression pipeline with hand-set hyper-parameters,
# fitted on the training split and used to predict the held-out reviews.
lr = LogisticRegression(penalty='l1', C=1.0, solver='liblinear')
vectorizer = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
    max_features=1000,
    ngram_range=(1, 1),
    stop_words=None,
    tokenizer=tokenizer_porter,
)

lr_model = Pipeline(steps=[('vect', vectorizer), ('lr', lr)])
lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_test)

In [29]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for the current `y_pred` on the held-out reviews.
lr_report = classification_report(y_test, y_pred)
print(lr_report)

              precision    recall  f1-score   support

           0       0.88      0.86      0.87      6232
           1       0.87      0.88      0.88      6268

    accuracy                           0.87     12500
   macro avg       0.87      0.87      0.87     12500
weighted avg       0.87      0.87      0.87     12500



In [38]:
# Logistic-regression coefficient per vocabulary term, ordered by magnitude
# so the most influential terms (either sign) come first.
feature_wts = pd.Series(
    data=lr.coef_[0],
    index=vectorizer.get_feature_names_out(),
)
feature_wts = feature_wts.sort_values(key=np.abs, ascending=False)
feature_wts.head(50)

worst        -18.047730
wast         -15.108444
aw           -14.450483
excel         11.381040
poorli       -10.762677
bore         -10.299208
7             10.092335
terribl       -9.286034
dull          -9.130798
bad           -9.106864
poor          -8.702173
perfect        8.658887
great          8.643317
lame          -8.203307
fail          -8.192961
8              8.106995
disappoint    -7.993324
wors          -7.865706
horribl       -7.853387
hilari         7.825840
superb         7.400235
unless        -7.231151
noth          -7.039854
save          -7.031693
ridicul       -6.854794
brilliant      6.853368
annoy         -6.830216
enjoy          6.782606
mess          -6.621321
lack          -6.396593
avoid         -6.307048
amaz           6.252759
best           6.180539
unfortun      -6.165745
touch          5.803233
fantast        5.666849
favorit        5.653430
today          5.582319
highli         5.577750
perfectli      5.452709
stupid        -5.452686
4             -5

In [18]:
# Raw term counts with the vocabulary capped at 1000 terms.
count = CountVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
    max_features=1000,
)

# Count vectorizer followed by a multinomial Naive Bayes classifier.
nb_count = Pipeline(steps=[
    ('vect', count),
    ('nb', MultinomialNB()),
])

# NOTE: rebinds `small_param_grid`, the name used earlier for the
# logistic-regression search space.
small_param_grid = dict(
    vect__ngram_range=[(1, 1), (2, 3)],
    vect__stop_words=[stop, None],
    vect__tokenizer=[tokenizer, tokenizer_porter],
    nb__alpha=[0.2, 0.4, 0.8, 1.0],
)

In [19]:
# 5-fold, accuracy-scored grid search over the Naive Bayes pipeline,
# using whatever `small_param_grid` is bound to when this cell runs.
gs_nb_count = GridSearchCV(nb_count, small_param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=3,
                           n_jobs=-1)

In [21]:
# Run the cross-validated grid search for the Naive Bayes pipeline on the training split.
gs_nb_count.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(lowercase=False,
                                                        max_features=1000)),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': [0.2, 0.4, 0.8, 1.0],
                         'vect__ngram_range': [(1, 1), (2, 3)],
                         'vect__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                               'our', 'ours', 'ourselves',
                                               'you', "you're", "you've",
                                               "you'll", "you'd", 'your',
                                               'yours', 'yourself',
                                               'yourselves', 'he', 'him', 'his',
                                               'himself', 'she', "she's", 'her',
                                    

In [22]:
# Parameter combination selected by the Naive Bayes grid search.
gs_nb_count.best_params_

{'nb__alpha': 0.8,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down',
  'in',
  'out'

In [45]:
# Cross-validated score (the search uses scoring='accuracy') of the selected combination.
gs_nb_count.best_score_

0.8343466666666666

In [39]:
# Standalone Naive Bayes pipeline with hand-set hyper-parameters,
# fitted on the training split and used to predict the held-out reviews.
c_vectorizer = CountVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
    max_features=1000,
    ngram_range=(1, 1),
    tokenizer=tokenizer,
    stop_words=stop,
)

nb = MultinomialNB(alpha=0.8)

nb_model = Pipeline(steps=[('vect', c_vectorizer), ('nb', nb)])

nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)

In [40]:
# Per-class precision/recall/F1 for the current `y_pred` on the held-out reviews.
nb_report = classification_report(y_test, y_pred)
print(nb_report)

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      6232
           1       0.83      0.85      0.84      6268

    accuracy                           0.84     12500
   macro avg       0.84      0.84      0.84     12500
weighted avg       0.84      0.84      0.84     12500



In [41]:
# Per-term log-probabilities of the second class (row 1 of
# `feature_log_prob_`, i.e. `nb.classes_[1]`). For a two-class model this is
# the same array the old `coef_[0, :]` exposed; `coef_` has been removed from
# MultinomialNB in current scikit-learn, so the attribute is read directly.
# Log-probabilities are <= 0, so sorting by magnitude lists first the terms
# that are least likely under that class.
neg_feature_wts = pd.Series(nb.feature_log_prob_[1, :], c_vectorizer.get_feature_names_out())
neg_feature_wts = neg_feature_wts.sort_values(ascending=False, key=np.abs)
neg_feature_wts.head(50)



poorly        -9.423960
wasted        -9.261146
lame          -9.236413
waste         -9.180975
mess          -9.044367
trash         -8.998397
fails         -8.966808
badly         -8.954447
dumb          -8.800999
avoid         -8.800999
bored         -8.764782
unless        -8.710394
dull          -8.710394
joke          -8.700815
crap          -8.627320
flat          -8.618502
awful         -8.618502
yeah          -8.588236
zombie        -8.526309
producers     -8.510423
plain         -8.506491
neither       -8.467995
project       -8.464226
90            -8.460471
horrible      -8.449289
ridiculous    -8.441903
annoying      -8.416475
van           -8.395183
filmmakers    -8.381235
spent         -8.374334
cheap         -8.370901
nudity        -8.367480
value         -8.367480
potential     -8.364071
sadly         -8.357286
brain         -8.353911
weak          -8.347195
concept       -8.343854
sorry         -8.337205
remake        -8.327314
biggest       -8.317520
cover         -8

In [23]:
from collections import Counter, OrderedDict

# Carve the last 20% of the training split off as a validation set.
# NOTE: this rebinds X_train / y_train from objects exposing `.values` to the
# sliced results, so the cell is not meant to be executed twice in a row.
valid_size = int(X_train.shape[0] * 0.2)

train_texts, train_labels = X_train.values, y_train.values
X_train, X_val = train_texts[:-valid_size], train_texts[-valid_size:]
y_train, y_valid = train_labels[:-valid_size], train_labels[-valid_size:]

In [24]:
# Count how often each whitespace-separated token occurs in the training texts.
token_counts = Counter(
    token
    for text in X_train
    for token in tokenizer(text)
)

print('Vocab-size:', len(token_counts))

Vocab-size: 83824


In [25]:
from torchtext.vocab import vocab

# Most frequent tokens first; ties keep their first-seen order.
sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda pair: -pair[1])
ordered_dict = OrderedDict(sorted_by_freq_tuples)

# NOTE: from here on the name `vocab` refers to the vocabulary object,
# no longer to the imported factory function.
vocab = vocab(ordered_dict)

# Reserve index 0 for padding and index 1 for unknown tokens; lookups of
# tokens that are not in the vocabulary fall back to index 1.
for position, special_token in enumerate(("<pad>", "<unk>")):
    vocab.insert_token(special_token, position)
vocab.set_default_index(1)

print([vocab[word] for word in ('this', 'is', 'an', 'example')])

[11, 7, 35, 463]


In [26]:
class imdb_dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each review text with its sentiment label.

    Parameters
    ----------
    text : sequence
        Indexable collection of review texts.
    label : sequence
        Indexable collection of labels, aligned with ``text``.
    """

    def __init__(self, text, label):
        self.text = text
        self.label = label

    def __len__(self):
        # len() counts entries along the first axis, which is what
        # __getitem__ indexes; unlike `.size` it also works for lists and
        # torch tensors, and gives the same answer for 1-D NumPy arrays.
        return len(self.label)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at position ``idx``."""
        return self.text[idx], self.label[idx]


In [35]:
# Wrap each split in an `imdb_dataset`. The test split is converted with
# `.values` here, while the train/validation objects are passed unchanged.
train_dataset = imdb_dataset(X_train, y_train)
valid_dataset = imdb_dataset(X_val, y_valid)
test_dataset = imdb_dataset(X_test.values, y_test.values)

In [36]:
# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def text_pipeline(x):
    """Tokenize ``x`` and look each token up in the module-level ``vocab``."""
    return [vocab[token] for token in tokenizer(x)]


def label_pipeline(x):
    """Return ``1.`` when ``x`` equals 1 and ``0.`` for anything else."""
    if x == 1:
        return 1.
    return 0.


def collate_batch(batch):
    """Turn a list of ``(text, label)`` samples into padded tensors.

    Each text is encoded with ``text_pipeline`` and each label with
    ``label_pipeline``; the encoded sequences are right-padded with zeros to
    the length of the longest one in the batch.

    Returns
    -------
    tuple
        ``(padded_texts, labels, lengths)``, all moved to ``device``, where
        ``lengths`` holds the unpadded length of every sequence.
    """
    labels = torch.tensor([label_pipeline(raw_label) for _, raw_label in batch])
    encoded = [
        torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        for raw_text, _ in batch
    ]
    lengths = torch.tensor([sequence.size(0) for sequence in encoded])
    padded = nn.utils.rnn.pad_sequence(encoded, batch_first=True)
    return padded.to(device), labels.to(device), lengths.to(device)

In [29]:
from torch.utils.data import DataLoader

# Pull one small, unshuffled batch to inspect what `collate_batch` produces.
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
text_batch, label_batch, length_batch = next(iter(dataloader))
for shown in (text_batch, label_batch, length_batch, text_batch.shape):
    print(shown)

tensor([[   2,  178, 1081,  ...,    0,    0,    0],
        [  11,   20,    7,  ...,    0,    0,    0],
        [  45,   21,   39,  ...,    5, 1268,  182],
        [2538,   41, 3904,  ...,    0,    0,    0]], device='cuda:0')
tensor([1., 1., 1., 1.], device='cuda:0')
tensor([168,  83, 549, 104], device='cuda:0')
torch.Size([4, 549])


In [37]:
batch_size = 32

# Options shared by all three loaders; only the training loader is shuffled.
loader_options = dict(batch_size=batch_size, collate_fn=collate_batch)

train_dl = DataLoader(train_dataset, shuffle=True, **loader_options)
valid_dl = DataLoader(valid_dataset, shuffle=False, **loader_options)
test_dl = DataLoader(test_dataset, shuffle=False, **loader_options)

In [31]:
class RNN(nn.Module):
    """Embedding -> LSTM -> two-layer head producing one sigmoid output per sequence.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table; index 0 is the padding index.
    embed_dim : int
        Size of each embedding vector.
    rnn_hidden_size : int
        Hidden size of the LSTM.
    fc_hidden_size : int
        Width of the hidden fully connected layer.
    """

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embed_dim,
                                      padding_idx=0)
        self.rnn = nn.LSTM(input_size=embed_dim,
                           hidden_size=rnn_hidden_size,
                           batch_first=True)
        self.fc1 = nn.Linear(in_features=rnn_hidden_size, out_features=fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=fc_hidden_size, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        """Return a ``(batch, 1)`` tensor of sigmoid outputs.

        ``text`` holds padded token indices (batch first) and ``lengths`` the
        unpadded length of every row; packing makes the LSTM stop at each
        sequence's true end instead of running over the padding.
        """
        embedded = self.embedding(text)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded,
            lengths.cpu().numpy(),
            batch_first=True,
            enforce_sorted=False,
        )
        _, (hidden, _) = self.rnn(packed)
        # Final hidden state of the last LSTM layer.
        last_hidden = hidden[-1]
        activated = self.relu(self.fc1(last_hidden))
        return self.sigmoid(self.fc2(activated))
         
# Model hyper-parameters; the embedding table gets one row per vocabulary entry.
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

# Seed before constructing the model so the initial weights are repeatable,
# then move the model to the selected device.
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)

In [32]:
def train(dataloader):
    """Run one training epoch over ``dataloader``.

    Uses the module-level ``model``, ``optimizer`` and ``loss_fn``. A
    prediction counts as correct when thresholding it at 0.5 matches the label.

    Returns
    -------
    tuple
        ``(accuracy, mean_loss)``, both averaged over ``len(dataloader.dataset)``.
    """
    model.train()
    n_correct = 0
    loss_sum = 0
    for tokens, targets, seq_lengths in dataloader:
        optimizer.zero_grad()
        probs = model(tokens, seq_lengths)[:, 0]
        batch_loss = loss_fn(probs, targets)
        batch_loss.backward()
        optimizer.step()
        hits = (probs >= 0.5).float() == targets
        n_correct += hits.float().sum().item()
        # Weight by batch size: the loss value is multiplied back up so the
        # final division by the dataset size yields a per-sample average.
        loss_sum += batch_loss.item() * targets.size(0)
    n_samples = len(dataloader.dataset)
    return n_correct / n_samples, loss_sum / n_samples
 
def evaluate(dataloader):
    """Score the module-level ``model`` on ``dataloader`` without updating it.

    Puts the model in eval mode and disables gradient tracking. A prediction
    counts as correct when thresholding it at 0.5 matches the label.

    Returns
    -------
    tuple
        ``(accuracy, mean_loss)``, both averaged over ``len(dataloader.dataset)``.
    """
    model.eval()
    n_correct = 0
    loss_sum = 0
    with torch.no_grad():
        for tokens, targets, seq_lengths in dataloader:
            probs = model(tokens, seq_lengths)[:, 0]
            batch_loss = loss_fn(probs, targets)
            hits = (probs >= 0.5).float() == targets
            n_correct += hits.float().sum().item()
            loss_sum += batch_loss.item() * targets.size(0)
    n_samples = len(dataloader.dataset)
    return n_correct / n_samples, loss_sum / n_samples

In [33]:
# Binary cross-entropy on the model's sigmoid outputs, optimized with Adam.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10 

# Seed right before training starts.
torch.manual_seed(1)
 
# One pass over the training loader, then a validation pass, per epoch.
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 0.6151 val_accuracy: 0.6711
Epoch 1 accuracy: 0.7227 val_accuracy: 0.6448
Epoch 2 accuracy: 0.7713 val_accuracy: 0.7437
Epoch 3 accuracy: 0.8345 val_accuracy: 0.8328
Epoch 4 accuracy: 0.8447 val_accuracy: 0.8487
Epoch 5 accuracy: 0.9017 val_accuracy: 0.8657
Epoch 6 accuracy: 0.9254 val_accuracy: 0.8769
Epoch 7 accuracy: 0.9386 val_accuracy: 0.8728
Epoch 8 accuracy: 0.9558 val_accuracy: 0.8841
Epoch 9 accuracy: 0.9685 val_accuracy: 0.8847


In [38]:
# Accuracy on the held-out test loader; the loss returned alongside it is discarded.
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}') 

test_accuracy: 0.8842
