# Assignment 6

Develop RNN model in pytorch to solve the following problem:  
    
1. Detect sarcasm 
Data from https://www.kaggle.com/sherinclaudia/sarcastic-comments-on-reddit  
Your quality metric = accuracy  
Randomly select 20% of your data for test set. You can use it only for final perfomance estimation.   
 

Remember, you can use GPU resourses in kaggle kernels.

In [17]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import nltk
import gensim
import spacy
from tqdm import tqdm_notebook

from sklearn import metrics

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, BucketIterator, ReversibleField, TabularDataset



SEED = 42
np.random.seed(SEED)

In [9]:
data = pd.read_csv('train-balanced-sarcasm.csv')

In [10]:
data.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [14]:
data.label.value_counts()

1    505413
0    505413
Name: label, dtype: int64

In [11]:
import spacy


spacy_en = spacy.load('en')
spacy_en.remove_pipe('tagger')
spacy_en.remove_pipe('ner')
d
def tokenizer(text): # create a tokenizer function
    return [tok.lemma_ for tok in spacy_en.tokenizer(text) if tok.text.isalpha()]            

In [33]:
classes={
    '1': 1,
    '0': 0
    }

TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english')
            )

PARENT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english')
            )

LABEL = LabelField(dtype=tt.int64, use_vocab=True, preprocessing=lambda x: classes[x])

AUTH = Field(sequential=False,
             lower=True
            )

REDD = Field(sequential=False,
             lower=True
            )

SCORE = Field(sequential=False,
             dtype=tt.int
            )

UPS = Field(sequential=False,
             dtype=tt.int
            )

DOWNS = Field(sequential=False,
             dtype=tt.int
            )

dataset = TabularDataset('train-balanced-sarcasm.csv', format='csv', 
                         fields=[('label', LABEL), ('text', TEXT),('author', AUTH),('subreddit', REDD), 
                                 ('score', SCORE), ('ups', UPS), ('downs', DOWNS), (None, None), (None, None), 
                                 ('parent_comment', PARENT)],
                         skip_header=True)

In [34]:
TEXT.build_vocab(dataset, min_freq=10)
len(TEXT.vocab.itos)

21865

In [35]:
TEXT.vocab.itos[:10]

['<unk>',
 '<pad>',
 '<eos>',
 '-pron-',
 'get',
 'like',
 'much',
 'would',
 'well',
 'go']

In [36]:
LABEL.build_vocab(dataset)
AUTH.build_vocab(dataset)
REDD.build_vocab(dataset)
SCORE.build_vocab(dataset)
UPS.build_vocab(dataset)
DOWNS.build_vocab(dataset)
PARENT.build_vocab(dataset)

In [53]:
AUTH.vocab.itos[:10]

['<unk>',
 'biffingston',
 'pokemon_fetish',
 'neondisease',
 'shybidude89',
 'ivsciguy',
 'mad-n-fla',
 'mindlessrabble',
 'canada_girl',
 'chaoslab']

In [37]:
train, test = dataset.split(0.8, stratified=True)
train, valid = train.split(0.7, stratified=True)

In [38]:
np.unique([x.label for x in train.examples], return_counts=True)

(array([0, 1]), array([283031, 283031], dtype=int64))

In [73]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        
        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True,
                          )
        
        self.fc = nn.Linear(hidden_size * 2 * 2, 2)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, batch):
        
        x, x_lengths = batch.text
        
        x = self.embedding(x)

        if x_lengths is not None:
            x_lengths = x_lengths.view(-1).tolist()
            x = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True)
            
        _, (hidden, cell) = self.rnn(x)
        
        #hidden = hidden.transpose(0,1)
        cell = cell.transpose(0,1)
        #hidden = hidden.contiguous().view(hidden.size(0),-1)        
        hidden = self.dropout(tt.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))        
        cell = cell.contiguous().view(cell.size(0),-1)
        x = tt.cat([hidden, cell], dim=1).squeeze(1)
        x = self.fc(x)
        return x

In [82]:
tt.cuda.empty_cache()

batch_size = 32

device = tt.device('cuda' if tt.cuda.is_available() else 'cpu')

model = MyModel(len(TEXT.vocab.itos),
                embed_size=100,
                hidden_size=128,
               )

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device)

optimizer = optim.Adam(model.parameters())
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
criterion = nn.CrossEntropyLoss()

In [75]:
def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        pred = model(batch)
        loss = criterion(pred, batch.label)
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            pred = model(batch)
            loss = criterion(pred, batch.label)
            epoch_loss += loss.data.item()

    return epoch_loss / n_batches

def _test_final(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    acc_total = 0
    n_batches = len(iterator)
    
    with tt.no_grad():
        for batch in iterator:
            y_pred = tt.nn.functional.softmax(model(batch), dim=1).detach().numpy().argmax(axis=1)
            acc_score = metrics.accuracy_score(y_pred, batch.label)
            acc_total += acc_score

    return acc_total / n_batches


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        valid_loss = _test_epoch(model, valid_iterator, criterion)

        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)

#### Embeddings' length 200

In [68]:
nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler, 
        n_epochs=10, early_stopping=2)

HBox(children=(IntProgress(value=0, description='epoch 0', max=17690), HTML(value='')))

validation loss 0.58163


HBox(children=(IntProgress(value=0, description='epoch 1', max=17690), HTML(value='')))

validation loss 0.58148


HBox(children=(IntProgress(value=0, description='epoch 2', max=17690), HTML(value='')))

validation loss 0.59589


HBox(children=(IntProgress(value=0, description='epoch 3', max=17690), HTML(value='')))

validation loss 0.62684
Early stopping! best epoch: 1 val 0.58148


In [69]:
test_loss_200 = _test_final(model, test_iterator, criterion)
test_loss_200

0.6814224869780425

#### Embeddings' length 100

In [59]:
test_loss = _test_final(model, test_iterator, criterion)

In [61]:
test_loss 

0.6817615255115255

#### Dropout 0.5

In [76]:
nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler, 
        n_epochs=10, early_stopping=2)

HBox(children=(IntProgress(value=0, description='epoch 0', max=17690), HTML(value='')))

validation loss 0.58288


HBox(children=(IntProgress(value=0, description='epoch 1', max=17690), HTML(value='')))

validation loss 0.58026


HBox(children=(IntProgress(value=0, description='epoch 2', max=17690), HTML(value='')))

validation loss 0.58653


HBox(children=(IntProgress(value=0, description='epoch 3', max=17690), HTML(value='')))

validation loss 0.61867
Early stopping! best epoch: 1 val 0.58026


In [77]:
test_loss_drop = _test_final(model, test_iterator, criterion)
test_loss_drop

0.6839472898500676

#### Pretrained embeddings

In [78]:
TEXT.build_vocab(dataset, min_freq=10, vectors="glove.6B.100d")

In [79]:
pretrained_embeddings = TEXT.vocab.vectors

In [83]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.1388,  0.3492, -0.0994,  ...,  0.3838, -0.2139,  0.3234],
        [-0.6151, -0.2424,  0.4952,  ...,  0.2105, -0.0920, -0.5932],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [84]:
nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler, 
        n_epochs=10, early_stopping=2)

HBox(children=(IntProgress(value=0, description='epoch 0', max=17690), HTML(value='')))

validation loss 0.57370


HBox(children=(IntProgress(value=0, description='epoch 1', max=17690), HTML(value='')))

validation loss 0.57439


HBox(children=(IntProgress(value=0, description='epoch 2', max=17690), HTML(value='')))

validation loss 0.58116
Early stopping! best epoch: 0 val 0.57370


In [85]:
test_loss_pretrained = _test_final(model, test_iterator, criterion)
test_loss_pretrained

0.6930262381651271

#### Batch size 64

In [86]:
tt.cuda.empty_cache()

batch_size = 64

device = tt.device('cuda' if tt.cuda.is_available() else 'cpu')

model = MyModel(len(TEXT.vocab.itos),
                embed_size=100,
                hidden_size=128,
               )

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device)

optimizer = optim.Adam(model.parameters())
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
criterion = nn.CrossEntropyLoss()

In [87]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.1388,  0.3492, -0.0994,  ...,  0.3838, -0.2139,  0.3234],
        [-0.6151, -0.2424,  0.4952,  ...,  0.2105, -0.0920, -0.5932],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [88]:
nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler, 
        n_epochs=10, early_stopping=2)

HBox(children=(IntProgress(value=0, description='epoch 0', max=8845), HTML(value='')))

validation loss 0.57608


HBox(children=(IntProgress(value=0, description='epoch 1', max=8845), HTML(value='')))

validation loss 0.57261


HBox(children=(IntProgress(value=0, description='epoch 2', max=8845), HTML(value='')))

validation loss 0.58056


HBox(children=(IntProgress(value=0, description='epoch 3', max=8845), HTML(value='')))

validation loss 0.60312
Early stopping! best epoch: 1 val 0.57261


In [89]:
test_loss_64 = _test_final(model, test_iterator, criterion)
test_loss_64

0.6887722688262812

#### Minimum frequency 5

In [91]:
TEXT.build_vocab(dataset, min_freq=5, vectors="glove.6B.100d")

In [92]:
pretrained_embeddings = TEXT.vocab.vectors

In [93]:
tt.cuda.empty_cache()

batch_size = 32

device = tt.device('cuda' if tt.cuda.is_available() else 'cpu')

model = MyModel(len(TEXT.vocab.itos),
                embed_size=100,
                hidden_size=128,
               )

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device)

optimizer = optim.Adam(model.parameters())
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
criterion = nn.CrossEntropyLoss()

In [94]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.4624,  0.0663, -0.5151,  ..., -0.3431, -0.1127,  0.1836],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [None]:
nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler, 
        n_epochs=10, early_stopping=2)

HBox(children=(IntProgress(value=0, description='epoch 0', max=17690), HTML(value='')))

validation loss 0.57289


HBox(children=(IntProgress(value=0, description='epoch 1', max=17690), HTML(value='')))

validation loss 0.57479


HBox(children=(IntProgress(value=0, description='epoch 2', max=17690), HTML(value='')))

In [None]:
test_loss_mf5 = _test_final(model, test_iterator, criterion)
test_loss_mf5