In [22]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import os
import random
import time
import string

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torch.nn as nn
from torchtext import data
import torch.optim as optim
from torchtext.vocab import Vectors
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv
/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip
/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip


In [30]:
!unzip '/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip'
!unzip '/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip'

Archive:  /kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip
  inflating: train.tsv               
Archive:  /kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip
  inflating: test.tsv                


In [33]:
import torch
import random
import pickle
from tqdm import tqdm
from collections import Counter
from torch.utils.data import DataLoader,Dataset
import torch.nn as nn
import torch.nn.functional as F

In [31]:
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')

In [34]:
def Corpus_Extr(df):
    print('Construct Corpus...')
    corpus = []
    for i in tqdm(range(len(df))):
        corpus.append(df.Phrase[i].lower().split())
    corpus = Counter(np.hstack(corpus))
    corpus = corpus
    corpus2 = sorted(corpus,key=corpus.get,reverse=True)
    print('Convert Corpus to Integers')
    vocab_to_int = {word: idx for idx,word in enumerate(corpus2,1)}
    print('Convert Phrase to Integers')
    phrase_to_int = []
    for i in tqdm(range(len(df))):
        phrase_to_int.append([vocab_to_int[word] for word in df.Phrase.values[i].lower().split()])
    return corpus,vocab_to_int,phrase_to_int
corpus,vocab_to_int,phrase_to_int = Corpus_Extr(train)

  3%|▎         | 4259/156060 [00:00<00:03, 42588.52it/s]

Construct Corpus...


100%|██████████| 156060/156060 [00:03<00:00, 41909.87it/s]
  5%|▌         | 8422/156060 [00:00<00:01, 82957.41it/s]

Convert Corpus to Integers
Convert Phrase to Integers


100%|██████████| 156060/156060 [00:02<00:00, 74229.74it/s]


In [35]:
def Pad_sequences(phrase_to_int,seq_length):
    pad_sequences = np.zeros((len(phrase_to_int), seq_length),dtype=int)
    for idx,row in tqdm(enumerate(phrase_to_int),total=len(phrase_to_int)):
        pad_sequences[idx, :len(row)] = np.array(row)[:seq_length]
    return pad_sequences

In [36]:
pad_sequences = Pad_sequences(phrase_to_int,30)

100%|██████████| 156060/156060 [00:00<00:00, 163022.39it/s]


In [37]:
train.sample(50)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
96584,96585,5045,"that Skins comes as a welcome , if downbeat , ...",3
87200,87201,4523,The Rock 's fighting skills,2
58251,58252,2935,rolling over in their graves,0
6661,6662,265,"fact , even better",3
15383,15384,658,sympathy,2
92337,92338,4806,for violence,1
78631,78632,4047,sustained fest of self-congratulation between ...,0
125370,125371,6737,characteristically complex Tom Clancy thriller,3
120529,120530,6445,truly edgy -- merely crassly flamboyant,2
43199,43200,2087,"understands , in a way that speaks forcefully ...",2


In [38]:
class PhraseDataset(Dataset):
    def __init__(self,df,pad_sequences):
        super().__init__()
        self.df = df
        self.pad_sequences = pad_sequences
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self,idx):
        if 'Sentiment' in self.df.columns:
            label = self.df['Sentiment'].values[idx]
            item = self.pad_sequences[idx]
            return item,label
        else:
            item = self.pad_sequences[idx]
            return item

In [39]:
class SentimentRNN(nn.Module):
    
    def __init__(self,corpus_size,output_size,embedd_dim,hidden_dim,n_layers):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(corpus_size,embedd_dim)
        self.lstm = nn.LSTM(embedd_dim, hidden_dim,n_layers,dropout=0.5, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim,output_size)
        self.act = nn.Sigmoid()
        
    def forward(self,x,hidden):
        batch_size = x.size(0)
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds,hidden)
        lstm_out = lstm_out.contiguous().view(-1,self.hidden_dim)
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.act(out)
        out = out.view(batch_size,-1)
        out = out[:,-5:]
        return out, hidden
    def init_hidden(self,batch_size):
        
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                   weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden

In [40]:
vocab_size = len(vocab_to_int)
output_size = 5
embedding_dim = 400
hidden_dim = 256
n_layers = 2
net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim,n_layers)

In [41]:
net.train()
clip=5
epochs = 200
counter = 0
print_every = 100
lr=0.01

def criterion(input, target, size_average=True):
    """Categorical cross-entropy with logits input and one-hot target"""
    l = -(target * torch.log(F.softmax(input, dim=1) + 1e-10)).sum(1)
    if size_average:
        l = l.mean()
    else:
        l = l.sum()
    return l

optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [42]:
import gc

In [43]:
batch_size=32
losses = []
accs=[]
for e in range(epochs):
    a = np.random.choice(len(train)-1, 1000)
    train_set = PhraseDataset(train.loc[train.index.isin(np.sort(a))],pad_sequences[a])
    train_loader = DataLoader(train_set,batch_size=32,shuffle=True)
    h = net.init_hidden(32)
    running_loss = 0.0
    running_acc = 0.0

    for idx,(inputs, labels) in enumerate(train_loader):
        counter += 1
        gc.collect()
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        optimizer.zero_grad()
        if inputs.shape[0] != batch_size:
            break
        output, h = net(inputs, h)
        labels=torch.nn.functional.one_hot(labels, num_classes=5)
        loss = criterion(output, labels)
        loss.backward()
        running_loss += loss.cpu().detach().numpy()
        running_acc += (output.argmax(dim=1) == labels.argmax(dim=1)).float().mean()
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()
        if idx%20 == 0:
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format((running_loss/(idx+1))))
            losses.append(float(running_loss/(idx+1)))
            print(f'acc:{running_acc/(idx+1)}')
            accs.append(running_acc/(idx+1))

Epoch: 1/200... Step: 1... Loss: 1.615720...
acc:0.0625
Epoch: 1/200... Step: 21... Loss: 1.393763...
acc:0.5059523582458496
Epoch: 2/200... Step: 33... Loss: 1.185964...
acc:0.71875
Epoch: 2/200... Step: 53... Loss: 1.372548...
acc:0.53125
Epoch: 3/200... Step: 65... Loss: 1.395137...
acc:0.5
Epoch: 3/200... Step: 85... Loss: 1.377932...
acc:0.5252976417541504
Epoch: 4/200... Step: 97... Loss: 1.375176...
acc:0.53125
Epoch: 4/200... Step: 117... Loss: 1.376256...
acc:0.5282738208770752
Epoch: 5/200... Step: 129... Loss: 1.435468...
acc:0.46875
Epoch: 5/200... Step: 149... Loss: 1.414878...
acc:0.4895833432674408
Epoch: 6/200... Step: 161... Loss: 1.465887...
acc:0.4375
Epoch: 6/200... Step: 181... Loss: 1.385069...
acc:0.5148809552192688
Epoch: 7/200... Step: 192... Loss: 1.399219...
acc:0.53125
Epoch: 7/200... Step: 212... Loss: 1.391453...
acc:0.511904776096344
Epoch: 8/200... Step: 224... Loss: 1.284347...
acc:0.5625
Epoch: 8/200... Step: 244... Loss: 1.400521...
acc:0.495535701513

Epoch: 65/200... Step: 2046... Loss: 1.431434...
acc:0.46875
Epoch: 65/200... Step: 2066... Loss: 1.410873...
acc:0.4955357015132904
Epoch: 66/200... Step: 2078... Loss: 1.404484...
acc:0.5
Epoch: 66/200... Step: 2098... Loss: 1.392849...
acc:0.511904776096344
Epoch: 67/200... Step: 2110... Loss: 1.405183...
acc:0.5
Epoch: 67/200... Step: 2130... Loss: 1.425551...
acc:0.4791666567325592
Epoch: 68/200... Step: 2142... Loss: 1.217843...
acc:0.6875
Epoch: 68/200... Step: 2162... Loss: 1.351954...
acc:0.5520833134651184
Epoch: 69/200... Step: 2174... Loss: 1.442985...
acc:0.4375
Epoch: 69/200... Step: 2194... Loss: 1.435093...
acc:0.4613095223903656
Epoch: 70/200... Step: 2206... Loss: 1.371393...
acc:0.53125
Epoch: 70/200... Step: 2226... Loss: 1.398099...
acc:0.5044642686843872
Epoch: 71/200... Step: 2238... Loss: 1.488005...
acc:0.40625
Epoch: 71/200... Step: 2258... Loss: 1.360670...
acc:0.5372023582458496
Epoch: 72/200... Step: 2270... Loss: 1.424821...
acc:0.46875
Epoch: 72/200... St

Epoch: 127/200... Step: 4029... Loss: 1.467637...
acc:0.4375
Epoch: 127/200... Step: 4049... Loss: 1.391148...
acc:0.5133928656578064
Epoch: 128/200... Step: 4061... Loss: 1.279876...
acc:0.625
Epoch: 128/200... Step: 4081... Loss: 1.413608...
acc:0.4910714328289032
Epoch: 129/200... Step: 4093... Loss: 1.311272...
acc:0.59375
Epoch: 129/200... Step: 4113... Loss: 1.366159...
acc:0.538690447807312
Epoch: 130/200... Step: 4125... Loss: 1.310872...
acc:0.59375
Epoch: 130/200... Step: 4145... Loss: 1.398157...
acc:0.5059523582458496
Epoch: 131/200... Step: 4157... Loss: 1.373784...
acc:0.53125
Epoch: 131/200... Step: 4177... Loss: 1.399990...
acc:0.5029761791229248
Epoch: 132/200... Step: 4189... Loss: 1.469207...
acc:0.4375
Epoch: 132/200... Step: 4209... Loss: 1.370046...
acc:0.5297619104385376
Epoch: 133/200... Step: 4221... Loss: 1.368869...
acc:0.5
Epoch: 133/200... Step: 4241... Loss: 1.365590...
acc:0.5342261791229248
Epoch: 134/200... Step: 4253... Loss: 1.342473...
acc:0.5625
Epo

Epoch: 188/200... Step: 6001... Loss: 1.401396...
acc:0.5014880895614624
Epoch: 189/200... Step: 6013... Loss: 1.471490...
acc:0.40625
Epoch: 189/200... Step: 6033... Loss: 1.400318...
acc:0.507440447807312
Epoch: 190/200... Step: 6045... Loss: 1.092496...
acc:0.8125
Epoch: 190/200... Step: 6065... Loss: 1.379598...
acc:0.5327380895614624
Epoch: 191/200... Step: 6077... Loss: 1.403026...
acc:0.5
Epoch: 191/200... Step: 6097... Loss: 1.383033...
acc:0.519345223903656
Epoch: 192/200... Step: 6109... Loss: 1.329665...
acc:0.5625
Epoch: 192/200... Step: 6129... Loss: 1.392921...
acc:0.5044642686843872
Epoch: 193/200... Step: 6141... Loss: 1.379810...
acc:0.5
Epoch: 193/200... Step: 6161... Loss: 1.393473...
acc:0.507440447807312
Epoch: 194/200... Step: 6173... Loss: 1.383736...
acc:0.53125
Epoch: 194/200... Step: 6193... Loss: 1.394698...
acc:0.5104166865348816
Epoch: 195/200... Step: 6205... Loss: 1.436604...
acc:0.46875
Epoch: 195/200... Step: 6225... Loss: 1.413411...
acc:0.491071432828