# LOAD DATA! TRAIN MODEL!

In [1]:
import numpy as np
import pandas as pd
import torch

from functools import reduce
from gensim.models import KeyedVectors
from nltk.tokenize import RegexpTokenizer
from torch import functional as F
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm_notebook as tqdm

In [2]:
raw_train_data = pd.read_csv('data/imdb_reviews/train.csv', header=None)
raw_val_data = pd.read_csv('data/imdb_reviews/val.csv', header=None)
raw_test_data = pd.read_csv('data/imdb_reviews/test.csv', header=None)
raw_val_data

Unnamed: 0,0,1
0,"""MirrorMask"" was a terribly disappointing film...",0
1,I suppose you could say this film has a grain ...,0
2,I think that this was one of the most trite fi...,0
3,"This movie is terrible, it was so difficult to...",0
4,Classic author C.S. Lewis once wrote an essay ...,0
...,...,...
7495,Dogtown and Z-Boys<br /><br />Summary: Dogtown...,1
7496,Possibly the best John Travolta role ever. Sat...,1
7497,The only way to truly understand and relate to...,1
7498,I saw this at the San Francisco Independent Fi...,1


In [3]:
def strip_whitespace(df: pd.DataFrame):
    stripped_df = df.apply(lambda x : x.str.strip() if x.dtype == 'object' else x)
    return stripped_df

In [4]:
raw_train_data = strip_whitespace(raw_train_data)
raw_val_data = strip_whitespace(raw_val_data)
raw_test_data = strip_whitespace(raw_test_data)

In [5]:
embedding_file_path = 'data/pretrained_models/GoogleNews-vectors-negative300.bin'
word2vec = KeyedVectors.\
            load_word2vec_format(embedding_file_path, binary=True)
word2vec['<UNK>'] = np.zeros(300)
word2vec['<PAD>'] = np.zeros(300)

In [6]:
word_to_int = {}
int_to_word = {}
for i, x in enumerate(word2vec.vocab):
    word_to_int[x] = i
    int_to_word[i] = x

In [7]:
def map_word_to_int(s: str):
    if s in word_to_int:
        return word_to_int[s]
    else:
        return word_to_int['<UNK>']

def map_int_to_word(n: int):
    if n in int_to_word:
        return int_to_word[n]
    else:
        return int_to_word[len(int_to_word) - 2]
    
def str_to_int_array(s: str):
    tokenizer = RegexpTokenizer(r'\w+')
    return np.array([map_word_to_int(x) for x in tokenizer.tokenize(s)])

def int_array_to_str(a: np.array):
    return reduce(lambda x, y : '{} {}'.format(x, map_int_to_word(y)), a, '')

In [8]:
class MovieDataset(Dataset):
    
    def __init__(self, features: list, targets: list):
        super(MovieDataset, self).__init__()
        
        self.features = features
        self.targets = targets
        
    def __getitem__(self, idx):
        return torch.tensor(self.features[idx]), torch.tensor(self.targets[idx], 
                                                              dtype=torch.float32)
    
    def __len__(self):
        return len(self.targets)

In [9]:
def refine_raw_data(df: pd.DataFrame):
    
    features = [str_to_int_array(x) for x in df.iloc[:, 0]]
    targets = df.iloc[:, 1].to_numpy().tolist()
    
    return features, targets

def generate_dataset(df: pd.DataFrame):
    features, targets = refine_raw_data(df)
    
    return MovieDataset(features, targets)

In [10]:
train_dataset = generate_dataset(raw_train_data)
val_dataset = generate_dataset(raw_val_data)
test_dataset = generate_dataset(raw_test_data)

In [11]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=8)
val_loader = DataLoader(val_dataset, batch_size=1, num_workers=8)
test_loader = DataLoader(test_dataset, batch_size=1, num_workers=8)

In [22]:
class SentimentAnalysisCNN(nn.Module):
    
    def __init__(self):
        super(SentimentAnalysisCNN, self).__init__()
        
        ### Hyperparameters ###
        self.filter_sizes = [2, 3, 5]
        self.num_filters = 32
        #######################
        
        # Embed the input
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(word2vec.vectors))
        
        # Time to convoulte!
        self.convs = nn.ModuleList([nn.Conv2d(1, 
                                              self.num_filters, 
                                              kernel_size=(h, 300))
                                    for h in self.filter_sizes])
        
        # Feed forward
        fc_input = self.num_filters * len(self.filter_sizes)
        self.fc_layers = nn.Sequential(
                                        nn.Linear(fc_input, 512),
                                        nn.ReLU(),
                                        nn.Dropout(0.3),
                                        nn.Linear(512, 1),
                                        nn.Sigmoid()
                                       )
    
    def forward(self, x):
        x = self.embedding(x)
        x = x.unsqueeze(1)
        conv_outputs = [self.convolute_and_pool(x, c) for c in self.convs]
        x = torch.cat(conv_outputs, 1)
        x = self.fc_layers(x)
        return x
    
    def convolute_and_pool(self, x, conv):
        x = torch.relu(conv(x))
        x = x.squeeze(3)
        return torch.max_pool1d(x, x.size(2)).squeeze(2)

In [29]:
def evaluate_model_metrics(model, validation=False):
    
    gpu_is_available = False
    if torch.cuda.is_available():
        model = model.cuda()
        gpu_is_available = True
    
    if validation:
        eval_loader = val_loader
    else:
        eval_loader = test_loader
    
    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0
    
    for data, targets in eval_loader:
        if gpu_is_available:
            data, targets = data.cuda(), targets.cuda()
        predictions = model.forward(data)
        predictions = torch.round(predictions)
        for (p, t) in zip(predictions, targets.reshape(-1, 1)):
            if p == t:
                if t == 1:
                    true_positives += 1
                else:
                    true_negatives += 1
            else:
                if t == 1:
                    false_positives += 1
                else:
                    false_negatives += 1
    accuracy = (true_positives + true_negatives) / (true_positives
                                                    + true_negatives
                                                    + false_positives
                                                    + false_negatives)
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1 = 2 * ((precision * recall) / (precision + recall))
    
    return (accuracy, precision, recall, f1)

def evaluate_model(model, validation=False):
    
    gpu_is_available = False
    if torch.cuda.is_available():
        model = model.cuda()
        gpu_is_available = True
    
    if validation:
        eval_loader = val_loader
    else:
        eval_loader = test_loader
    # Evaluation
    model.eval()
    accuracy = 0.
    num_correct = 0
    for features, targets in eval_loader:
        if gpu_is_available:
            features, targets = features.cuda(), targets.cuda()

        output = model.forward(features)
        if round(output.item()) == targets:
            num_correct += 1
    accuracy = (num_correct / len(val_loader.dataset)) * 100.0
    return accuracy

def train_model(model, epochs=5, lr=1e-4):
    
    gpu_is_available = False
    if torch.cuda.is_available():
        model = model.cuda()
        gpu_is_available = True
    
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    for e in range(1, epochs+1):
        
        model.train()
        running_loss = 0.
        for features, targets in tqdm(train_loader):
            
            if gpu_is_available:
                features, targets = features.cuda(), targets.cuda()
            
            optimizer.zero_grad()
            output = model.forward(features)
            loss = criterion(output.squeeze(), targets)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        accuracy = evaluate_model(model)
        
       
        
        print('Epoch: {}\t Train Loss: {:.2f}\tValidation Accuracy: {:.2f}%'\
              .format(e,
                      running_loss,
                      accuracy))

In [24]:
model = SentimentAnalysisCNN()
train_model(model, epochs=5, lr=1e-4)

HBox(children=(IntProgress(value=0, max=35000), HTML(value='')))


Epoch: 1	 Train Loss: 11680.64	Validation Accuracy: 88.49%


HBox(children=(IntProgress(value=0, max=35000), HTML(value='')))


Epoch: 2	 Train Loss: 7458.83	Validation Accuracy: 88.04%


HBox(children=(IntProgress(value=0, max=35000), HTML(value='')))


Epoch: 3	 Train Loss: 4867.90	Validation Accuracy: 89.45%


HBox(children=(IntProgress(value=0, max=35000), HTML(value='')))


Epoch: 4	 Train Loss: 2788.15	Validation Accuracy: 89.00%


HBox(children=(IntProgress(value=0, max=35000), HTML(value='')))


Epoch: 5	 Train Loss: 1775.74	Validation Accuracy: 88.85%


In [25]:
print(evaluate_model(model))

88.85333333333332


In [32]:
metrics = evaluate_model_metrics(model)
print('Accuracy: {:.4f}\nPrecision: {:.4f}\nRecall: {:.4f}\nF1 Score: {:.4f}'\
     .format(*metrics))

Accuracy: 0.8885
Precision: 0.8997
Recall: 0.8800
F1 Score: 0.8898
