In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['sarcasm', 'sarcastic-comments-on-reddit']


In [2]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import nltk
import gensim
import spacy
from tqdm import tqdm_notebook

from sklearn import metrics

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, BucketIterator, ReversibleField, TabularDataset, Iterator



SEED = 42
np.random.seed(SEED)

In [3]:
df = pd.read_csv('../input/sarcastic-comments-on-reddit/train-balanced-sarcasm.csv')

In [4]:
#df["comment"] = df["comment"] + ' ' + df["parent_comment"] + ' ' + df['subreddit']

In [5]:
df = df[['label', 'comment']]

In [6]:
df.head()

Unnamed: 0,label,comment
0,0,NC and NH.
1,0,You do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G..."
3,0,"This meme isn't funny none of the ""new york ni..."
4,0,I could use one of those tools.


In [7]:
df.to_csv('my_data.csv', index=False)

In [8]:
#df = pd.read_csv('my_data.csv')
#df.head()

In [9]:
import spacy


spacy_en = spacy.load('en')
spacy_en.remove_pipe('tagger')
spacy_en.remove_pipe('ner')

def tokenizer(text): # create a tokenizer function
    return [tok.lemma_ for tok in spacy_en.tokenizer(text)]

In [10]:
classes={
    '0':0,
    '1':1
}

TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english')
            )
LABEL = LabelField(dtype=tt.int64, use_vocab=True, preprocessing=lambda x: classes[x])

dataset = TabularDataset('my_data.csv', format='csv', 
                         fields=[('label', LABEL),('comment', TEXT)], 
                         skip_header=True)

In [11]:
TEXT.build_vocab(dataset, min_freq=5, vectors="glove.6B.300d")
# TEXT.build_vocab(dataset, min_freq=5)
len(TEXT.vocab.itos)

.vector_cache/glove.6B.zip: 862MB [01:26, 9.95MB/s]                               
100%|█████████▉| 399634/400000 [00:51<00:00, 7843.11it/s]

35493

In [12]:
TEXT.vocab.itos[:10]

['<unk>', '<pad>', '<eos>', '.', ',', '-pron-', '?', '!', '"', '...']

In [13]:
LABEL.build_vocab(dataset)

In [14]:
train, test = dataset.split(0.8, stratified=True)
train, valid = train.split(0.9, stratified=True)

In [15]:
print(np.unique([x.label for x in train.examples], return_counts=True))
print(np.unique([x.label for x in valid.examples], return_counts=True))
print(np.unique([x.label for x in test.examples], return_counts=True))

(array([0, 1]), array([363897, 363897]))
(array([0, 1]), array([40433, 40433]))
(array([0, 1]), array([101083, 101083]))


In [16]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, embedding, dropout=0):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding, freeze=True)
        
        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True,
                           dropout = dropout
                          )
        
        self.fc = nn.Linear(hidden_size * 2 * 2, 2)
        
    def forward(self, batch):
        
        x, x_lengths = batch.comment
        
        x = self.embedding(x)

        if x_lengths is not None:
            x_lengths = x_lengths.view(-1).tolist()
            x = nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True)
            
        _, (hidden, cell) = self.rnn(x)
        
        hidden = hidden.transpose(0,1)
        cell = cell.transpose(0,1)
        hidden = hidden.contiguous().view(hidden.size(0),-1)
        cell = cell.contiguous().view(cell.size(0),-1)
        x = tt.cat([hidden, cell], dim=1).squeeze(1)
        x = self.fc(x)
        return x

In [17]:
# tt.cuda.empty_cache()

batch_size = 64
pretrained_embedding = TEXT.vocab.vectors
model = MyModel(len(TEXT.vocab.itos),
                embed_size=300,
                hidden_size=128, 
                embedding=pretrained_embedding
               )

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.comment),
    sort_within_batch=True,
)

optimizer = optim.Adam(model.parameters())
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
criterion = nn.CrossEntropyLoss()

In [18]:
def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        pred = model(batch)
        loss = criterion(pred, batch.label)
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator, criterion, scheduler):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            pred = model(batch)
            loss = criterion(pred, batch.label)
            epoch_loss += loss.data.item()
    scheduler.step(epoch_loss)
    
    return epoch_loss / n_batches


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100, 
             scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        valid_loss = _test_epoch(model, valid_iterator, criterion, scheduler)

        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)

In [19]:
nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=50, 
         scheduler=scheduler, early_stopping=2)

HBox(children=(IntProgress(value=0, description='epoch 0', max=11372, style=ProgressStyle(description_width='i…

100%|█████████▉| 399634/400000 [01:10<00:00, 7843.11it/s]

validation loss 0.55972


HBox(children=(IntProgress(value=0, description='epoch 1', max=11372, style=ProgressStyle(description_width='i…

validation loss 0.55112


HBox(children=(IntProgress(value=0, description='epoch 2', max=11372, style=ProgressStyle(description_width='i…

validation loss 0.55390


HBox(children=(IntProgress(value=0, description='epoch 3', max=11372, style=ProgressStyle(description_width='i…

validation loss 0.55503
Early stopping! best epoch: 1 val 0.55112


In [20]:
from sklearn.metrics import accuracy_score

In [21]:
def evaluate(model, test_iterator, criterion):
    model.eval()
    epoch_acc = 0

    n_batches = len(test_iterator)
    with tt.no_grad():
        for batch in test_iterator:
            pred = model(batch)
            pred = tt.softmax(pred, dim=-1)
            pred = pred.detach().numpy()
            acc = accuracy_score(batch.label, pred.argmax(axis=1))
            epoch_acc += acc.item()

    return epoch_acc / n_batches

In [22]:
acc = evaluate(model, test_iterator, criterion)

In [23]:
print('Accuracy on test sample:', acc)

Accuracy on test sample: 0.711720920825859
