In [1]:
import os
import cv2
import time
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

IMDB Movie Review Dataset (cleaned)
- Originally from [here](https://ai.stanford.edu/~amaas/data/sentiment/)
- Cleaned into a csv [here](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)

In [2]:
df = pd.read_csv('course_data/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Automatic Tokenization with Spacy

In [3]:
# tool for text
import spacy

# load information about words
!python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[K     |████████████████████████████████| 13.6 MB 3.0 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
some_text = df.iloc[9]['review']
print(some_text)

# automatically tokenize the text
tokenized_text = nlp(some_text)

If you like original gut wrenching laughter you will like this movie. If you are young or old then you will love this movie, hell even my mom liked it.<br /><br />Great Camp!!!


In [5]:
# it's not perfect
for token in tokenized_text:
    print(token.text)

If
you
like
original
gut
wrenching
laughter
you
will
like
this
movie
.
If
you
are
young
or
old
then
you
will
love
this
movie
,
hell
even
my
mom
liked
it.<br
/><br
/>Great
Camp
!
!
!


In [6]:
idx = 5

token = tokenized_text[idx]

# lemmatization
print('Lemmatization of', token.text, 'is', token.lemma_)

# part of speech tagging
print(token.text, 'is a', token.pos_)

# is it a stop word?
print('The fact that', token.text, 'is a stop word is', token.is_stop)

Lemmatization of wrenching is wrench
wrenching is a VERB
The fact that wrenching is a stop word is False


In [7]:
# sentence segmentation
for sentence in tokenized_text.sents:
    print(sentence)

If you like original gut wrenching laughter you will like this movie.
If you are young or old then you will love this movie, hell even my mom liked it.<br /><br />Great Camp!!!


- tons more fancy features!
- Let's do a simple pipeline where we ignore non-alphabetic characters

In [8]:
import re

a_review = df.iloc[9]['review']

# remove those <br />s
a_review = a_review.replace('<br />', ' ')
print(a_review)

# remove non-alphabetic characters
a_review = re.sub("[^A-Za-z']+", ' ', a_review)
print(a_review)

If you like original gut wrenching laughter you will like this movie. If you are young or old then you will love this movie, hell even my mom liked it.  Great Camp!!!
If you like original gut wrenching laughter you will like this movie If you are young or old then you will love this movie hell even my mom liked it Great Camp 


In [9]:
# disabling some fancy features of spacy for speed
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser'])

rows = []
for idx in tqdm(range(len(df))):
    row = df.iloc[idx].copy()
    
    # first we remove numeric characters and lowercase everything
    cleaned_review = re.sub("[^A-Za-z']+", ' ', row['review'].replace('<br />', ' ')).lower()
    
    # we let spaCy tokenize and lemmatize the text for us
    tokenized_review = nlp(cleaned_review)
    cleaned_tokenized = [token.lemma_ for token in tokenized_review if ((not token.is_stop) or (' ' in token.text))]
    
    if len(cleaned_tokenized) > 1:
        row['cleaned'] = ' '.join(cleaned_tokenized)
    rows.append(row)
df_clean = pd.DataFrame(rows)
df_clean.head()
df_clean.to_csv('course_data/IMDB_cleaned.csv')

  0%|          | 0/50000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [10]:
df_clean = pd.read_csv('course_data/IMDB_cleaned.csv')
df_clean.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment,cleaned
0,0,One of the other reviewers has mentioned that ...,positive,reviewer mention watch oz episode hook right e...
1,1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,2,I thought this was a wonderful way to spend ti...,positive,think wonderful way spend time hot summer week...
3,3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...
4,4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...


In [14]:
print(df_clean.iloc[9]['review'])
print('\n')
df_clean.iloc[9]['cleaned']

If you like original gut wrenching laughter you will like this movie. If you are young or old then you will love this movie, hell even my mom liked it.<br /><br />Great Camp!!!




'like original gut wrench laughter like movie young old love movie hell mom like great camp'

## Prepare for Training

In [15]:
# count words, send infrequent to unknown

# let's get an idea of word frequency
from collections import Counter

reviews = [review.split(' ') for review in list(df_clean['cleaned'])]
word_freq = Counter([token for review in reviews for token in review]).most_common()

In [16]:
# no surprises here
word_freq[:10]

[('movie', 103272),
 ('film', 95394),
 ('like', 44224),
 ('good', 41452),
 ("'", 34217),
 ('time', 31502),
 ('character', 28356),
 ('watch', 27821),
 ('bad', 26610),
 ('story', 25272)]

In [17]:
# words only seen once
word_freq[-25:]

[('softheade', 1),
 ('macrophile', 1),
 ('arilin', 1),
 ('iraquis', 1),
 ('yaaayyyy', 1),
 ('ballplaying', 1),
 ('skynet', 1),
 ('homunculi', 1),
 ('eucharist', 1),
 ('escadrille', 1),
 ('yeeeecchh', 1),
 ('demono', 1),
 ('rohna', 1),
 ('pintos', 1),
 ('mottos', 1),
 ('kman', 1),
 ('farrellys', 1),
 ('nolin', 1),
 ('angelyne', 1),
 ('censorial', 1),
 ('burtolucci', 1),
 ('jossi', 1),
 ('ashknenazi', 1),
 ('effortful', 1),
 ('clatter', 1)]

In [18]:
# remove words that appear infrequently
word_freq = dict(word_freq)
print(len(word_freq))
min_freq = 5
word_dict = {}

# sending all the unknowns to 0
i = 1
for word in word_freq:
    if word_freq[word] > min_freq:
        word_dict[word] = i
        i += 1
    else:
        word_dict[word] = 0

# dictionary length        
dict_length = max(word_dict.values()) + 1
dict_length

84642


28510

In [19]:
# to collate the tensors into batches, sentence need to be the same size
# we could overwrite the collate function, or we could pick a max sentence size and pad

max_length = 0
for idx in tqdm(range(len(df_clean))):
    row = df_clean.iloc[idx]
    length = len(row['cleaned'].split(' '))
    if length > max_length:
        max_length = length
print(max_length)

  0%|          | 0/50000 [00:00<?, ?it/s]

1296


In [20]:
class IMDBDataset(Dataset):
    def __init__(self, df, word_dict, max_length):
        self.df = df
        self.word_dict = word_dict
        self.sent_dict = {'negative': 0, 'positive': 1}
        self.max_len = max_length
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        review = row['cleaned'].split(' ')
        x = torch.zeros(self.max_len)
        
        # get review as a list of integers
        for idx in range(len(review)):
            
            # we want to front pad for RNN
            x[self.max_len - len(review) + idx] = self.word_dict[review[idx]]
            
        y = torch.tensor(self.sent_dict[row['sentiment']]).float()
        
        # embedding likes long tensors
        return x.long(), y
ds = IMDBDataset(df_clean, word_dict, max_length)
next(iter(ds))

(tensor([  0,   0,   0,  ...,  56, 437, 271]), tensor(1.))

## Models

In [21]:
# CBOW model for sentiment analysis
# train the embedding during training
class CBOW(nn.Module):
    def __init__(self, dict_length, embedding_size):
        super(CBOW, self).__init__()
        # padding index turns off gradient for unknown tokens
        self.word_emb = nn.Embedding(dict_length, embedding_size, padding_idx=0)
        self.linear = nn.Linear(embedding_size, 1)
        self.emb_size = embedding_size
        
    def forward(self, x):
        sent_length = x.shape[1]
        x = self.word_emb(x)
        sent_length = torch.count_nonzero(x, dim=1)
        x = torch.sum(x, dim=1) / sent_length
        x = self.linear(x)
        return torch.squeeze(x)

In [22]:
dl = DataLoader(ds, batch_size=1000, shuffle=True)
x, y = next(iter(dl))

cbow_model = CBOW(dict_length, 100)
cbow_model(x).shape

torch.Size([1000])

In [23]:
def one_pass(model, dataloader, optimizer, lossFun, backwards=True, print_loss=False):
    
    if backwards == True:
        model.train()
    else:
        model.eval()
    
    total_loss = 0.0
    for x, y in tqdm(dataloader):
        
        y_pred = model(x)
        loss = lossFun(y_pred, y)
        total_loss += loss.item()
        
        if backwards == True:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    avg_loss = total_loss / len(dataloader)
    
    if print_loss == True:
        print(avg_loss)
    
    return avg_loss

def one_pass_acc(model, dataloader, num_points):
    model.eval()
    total_incorrect = 0
        
    for x, y in dataloader:
        y_pred = (torch.sigmoid(model(x)) > 0.5).float()
        
        total_incorrect += torch.count_nonzero(y - y_pred).item()
        
    percent_wrong = total_incorrect / num_points
    return 1 - percent_wrong

In [24]:
lossFun = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(cbow_model.parameters(), lr = 0.01)

num_epochs = 5

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)
    
    loss = one_pass(cbow_model, dl, optimizer, lossFun)
    print('Loss: ', loss)
    
    acc = one_pass_acc(cbow_model, dl, len(ds))
    print('Accuracy: ', acc)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch:  0


  0%|          | 0/50 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [25]:
# RNN model for sentiment analysis (read Documentation for nn.RNN!)
# train the embedding during training
class RNN(nn.Module):
    def __init__(self, dict_length, embedding_size):
        super(RNN, self).__init__()
        # padding index turns off gradient for unknown tokens
        self.word_emb = nn.Embedding(dict_length, embedding_size, padding_idx=0)
        
        # RNN doesn't care about length of sequence
        # RNN does care about the size of the word embedding
        # hidden size dictates dimension of output of RNN
        self.rnn = nn.RNN(input_size=embedding_size, hidden_size=1, batch_first=True)
        
        # PyTorch RNN outputs a sequence of same length as input
        # For many to one, we can either use the final hidden state OR
        # slap a linear layer on the output, taking in all the hidden states
        
    def forward(self, x):
        x = self.word_emb(x)
        
        # RNN layer outputs a tuple, the output and the final hidden state
        # taking the final hidden state as output
        x = self.rnn(x)[1]
    
        return torch.squeeze(x)

x, y = next(iter(dl))
rnn_model = RNN(dict_length, 100)
rnn_model(x).shape

torch.Size([1000])

In [41]:
rnn_model

RNN(
  (word_emb): Embedding(28510, 100, padding_idx=0)
  (rnn): RNN(100, 1, batch_first=True)
)

In [None]:
# does way better
# hidden state updates each time it sees a new word
# intuition: probably gets excited when it sees a word like bad/good and ignores the rest
lossFun = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.01)

num_epochs = 5

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)
    
    loss = one_pass(model, dl, optimizer, lossFun)
    print('Loss: ', loss)
    
    acc = one_pass_acc(model, dl, len(ds))
    print('Accuracy: ', acc)

## Tools for Word Embeddings

In [26]:
# gensim is a great package for word embeddings
# easy to train your own!
import gensim.downloader

# twitter embedding might be helpful for doing NLP related to social media!
print(list(gensim.downloader.info()['models'].keys()))



['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [27]:
# let's get the glove-wiki-gigaword-100
# you can freeze the embedding for a model, finetune the embedding, or use it as a starting point for an embedding layer
glove_emb = gensim.downloader.load('glove-wiki-gigaword-100')

In [28]:
# you can easily perform all the fancy features of word embeddings
glove_emb.most_similar('cat')

[('dog', 0.8798074126243591),
 ('rabbit', 0.7424427270889282),
 ('cats', 0.732300341129303),
 ('monkey', 0.7288709878921509),
 ('pet', 0.7190139889717102),
 ('dogs', 0.7163873314857483),
 ('mouse', 0.6915250420570374),
 ('puppy', 0.6800068616867065),
 ('rat', 0.6641027331352234),
 ('spider', 0.6501135230064392)]

In [29]:
# normed cat vector
cat_vec = glove_emb.get_vector('cat')
cat_vec / np.linalg.norm(cat_vec)

array([ 0.04581573,  0.05612466,  0.12537412, -0.11789493, -0.1162836 ,
        0.12552294,  0.04842322, -0.02799585,  0.0120681 , -0.15672757,
       -0.05774988,  0.02835106,  0.1434202 ,  0.04053723,  0.02792045,
        0.19597296,  0.10424626,  0.01933913,  0.17506339,  0.10164274,
        0.07978065,  0.04200767, -0.00260134, -0.14211448,  0.10990973,
        0.22725299, -0.17471413, -0.09964842, -0.045272  ,  0.00473973,
        0.02127272,  0.01661713,  0.10917152,  0.11604548,  0.15044893,
        0.09069879, -0.05556507,  0.05005638,  0.13685384, -0.12099257,
        0.0388505 ,  0.00877281, -0.06178614, -0.13657802, -0.04508745,
        0.09164931, -0.15311994,  0.02025671,  0.11040384,  0.0133782 ,
       -0.11352132,  0.04709963,  0.09360395,  0.16423851, -0.05806937,
       -0.2663456 , -0.01970048,  0.05583891,  0.0825588 ,  0.02100086,
        0.12343537,  0.17759547, -0.04652614,  0.10189674,  0.19720726,
        0.23507151, -0.03247265,  0.04098373,  0.14655556,  0.04

In [30]:
# get the weights
weights = glove_emb.get_normed_vectors()
weights.shape

(400000, 100)

In [31]:
# PyTorch makes it easy to load the weights
glove_emb_layer = nn.Embedding.from_pretrained(torch.tensor(weights))

In [32]:
cat_idx = glove_emb.get_index('cat')
glove_emb_layer(torch.tensor(cat_idx))

tensor([ 0.0458,  0.0561,  0.1254, -0.1179, -0.1163,  0.1255,  0.0484, -0.0280,
         0.0121, -0.1567, -0.0577,  0.0284,  0.1434,  0.0405,  0.0279,  0.1960,
         0.1042,  0.0193,  0.1751,  0.1016,  0.0798,  0.0420, -0.0026, -0.1421,
         0.1099,  0.2273, -0.1747, -0.0996, -0.0453,  0.0047,  0.0213,  0.0166,
         0.1092,  0.1160,  0.1504,  0.0907, -0.0556,  0.0501,  0.1369, -0.1210,
         0.0389,  0.0088, -0.0618, -0.1366, -0.0451,  0.0916, -0.1531,  0.0203,
         0.1104,  0.0134, -0.1135,  0.0471,  0.0936,  0.1642, -0.0581, -0.2663,
        -0.0197,  0.0558,  0.0826,  0.0210,  0.1234,  0.1776, -0.0465,  0.1019,
         0.1972,  0.2351, -0.0325,  0.0410,  0.1466,  0.0477, -0.1914,  0.0268,
        -0.0014,  0.0655, -0.0245,  0.0540, -0.0813,  0.0043, -0.1204,  0.0809,
         0.0388, -0.0830,  0.0370, -0.0065, -0.1559, -0.0275,  0.0087, -0.0168,
         0.0097,  0.0478,  0.0898, -0.0371,  0.0916,  0.0177, -0.0361, -0.0030,
        -0.1462, -0.0288,  0.0300, -0.14

In [33]:
# make sure you turn off the gradients when training!
for param in glove_emb_layer.parameters():
    print(param.requires_grad)

False


In [34]:
glove_emb_layer = nn.Embedding.from_pretrained(torch.tensor(weights), freeze=True)
for param in glove_emb_layer.parameters():
    print(param.requires_grad)

False


In [None]:
# define a word2vec model
from gensim.models import Word2Vec

# different options for how to perform word2vec training
# check out documentation for more options related to sampling frequent vs. infrequent words
w2v_model = Word2Vec(# only consider words that show up at least a 100 times
                     min_count = 100, 
                     
                     # context window
                     window = 2,
                     
                     #size of embedding
                     vector_size = 300)
# has methods build_vocab and train