In [None]:
!pip install torchdata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchdata
  Downloading torchdata-0.3.0-py3-none-any.whl (47 kB)
[K     |████████████████████████████████| 47 kB 4.6 MB/s 
[?25hCollecting urllib3>=1.25
  Downloading urllib3-1.26.9-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 27.1 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 61.0 MB/s 
Installing collected packages: urllib3, torchdata
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Successfu

In [None]:
import torch
import pandas as pd
import torch.nn.functional as F
import torchtext

# 1. Data preparation

## Load IMDB Dataset

In [None]:
# Tokenizer applied to every review text in the next cell.
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
# Request the train and test splits of IMDB in a single call.
train_iter, test_iter = torchtext.datasets.IMDB(split=('train', 'test'))

In [None]:
def _tokenized_by_length(split_iter):
  """Tokenize every (label, text) pair of a split and order the pairs by token count.

  Returns a list of (label, tokenlist) tuples, shortest review first, so that
  reviews of similar length end up next to each other.
  """
  pairs = [(label, tokenizer(text)) for label, text in split_iter]
  return sorted(pairs, key=lambda pair: len(pair[1]))

train_data = _tokenized_by_length(train_iter)
test_data = _tokenized_by_length(test_iter)

## Make Vocab

In [None]:
def make_vocab(train_data, min_freq):
  """Build the vocabulary from tokenized training examples.

  Args:
    train_data: iterable of (label, tokenlist) pairs; the labels are ignored.
    min_freq: a token is kept only if it occurs at least this many times.

  Returns:
    A (vocablist, vocabidx) tuple.
    vocablist is a list of (token, freq) tuples in which the position of an
    entry is the id of its token. The first four entries are the reserved
    tokens '<unk>', '<pad>', '<cls>', '<eos>' (ids 0-3) with a recorded
    frequency of 0.
    vocabidx maps token -> id and is consistent with vocablist.
  """
  from collections import Counter  # local import keeps this cell self-contained

  specials = ['<unk>', '<pad>', '<cls>', '<eos>']

  counts = Counter()
  for _, tokenlist in train_data:
    counts.update(tokenlist)

  vocablist = [(token, 0) for token in specials]
  vocabidx = {token: idx for idx, token in enumerate(specials)}
  # Counter keeps first-seen order, so ids are assigned in order of first appearance.
  for token, freq in counts.items():
    # A corpus token that happens to equal a reserved token must not get a
    # second row: that would leave an entry in vocablist that no id points to.
    if freq >= min_freq and token not in vocabidx:
      vocabidx[token] = len(vocablist)
      vocablist.append((token, freq))
  return vocablist, vocabidx

# Tokens seen fewer than 10 times in the training reviews are left out of the vocabulary.
vocablist, vocabidx = make_vocab(train_data, min_freq=10)

## Preprocess

In [None]:
def preprocess(data, vocabidx):
  """Wrap each token list in '<cls>' ... '<eos>' and replace unknown tokens.

  Args:
    data: iterable of (label, tokenlist) pairs.
    vocabidx: container of known tokens; anything not in it becomes "<unk>".

  Returns:
    A new list of (label, tokenlist) pairs; the input token lists are not modified.
  """
  def known_or_unk(token):
    return token if token in vocabidx else "<unk>"

  return [
      (label, ['<cls>'] + [known_or_unk(token) for token in tokens] + ['<eos>'])
      for label, tokens in data
  ]

# Apply the same marker/unknown-token treatment to both splits (train first, then test).
train_data, test_data = (
    preprocess(split, vocabidx) for split in (train_data, test_data)
)

## Make Batch & Padding

In [None]:
# Settings shared by the batching, training and evaluation cells.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'  # use the GPU when one is visible
MODELNAME = 'IMDB_myLSTM.model'  # file the trained state_dict is saved to and loaded from
BATCHSIZE = 64  # reviews per batch
EPOCH = 20      # passes over the training batches
LR = 1e-3       # learning rate handed to Adam

In [None]:
def make_batch(data, batchsize):
  """Group consecutive (label, tokenlist) pairs into batches.

  Args:
    data: iterable of (label, tokenlist) pairs, consumed in order.
    batchsize: number of examples after which a batch is closed.

  Returns:
    A list of (tokenlists, labels) tuples. Every batch holds `batchsize`
    examples except possibly the last one, which holds the remainder.
    The token lists are the same objects as in `data` (not copies).
  """
  batches = []
  pending_tokens, pending_labels = [], []
  for label, tokenlist in data:
    pending_labels.append(label)
    pending_tokens.append(tokenlist)
    if len(pending_labels) < batchsize:
      continue
    # Batch is full: emit it and start collecting the next one.
    batches.append((pending_tokens, pending_labels))
    pending_tokens, pending_labels = [], []
  if pending_labels:
    batches.append((pending_tokens, pending_labels))
  return batches
  
# From here on each split is a list of (tokenlists, labels) batches.
train_data = make_batch(train_data, batchsize=BATCHSIZE)
test_data = make_batch(test_data, batchsize=BATCHSIZE)

In [None]:
def padding(bb):
  """Right-pad every token list in a batch with '<pad>' up to the batch's longest list.

  Args:
    bb: list of (tokenlists, labels) batches.

  Returns:
    The same `bb` object; the token lists are extended in place.
  """
  for tokenlists, _ in bb:
    target_len = max(map(len, tokenlists))
    for tokens in tokenlists:
      tokens.extend(['<pad>'] * (target_len - len(tokens)))
  return bb

# padding() extends the token lists in place and hands back the same list of batches.
train_data, test_data = (padding(batches) for batches in (train_data, test_data))

In [None]:
# Show the first five padded batches; each one is a (tokenlists, labels) tuple.
for batch_no in range(5):
    print(train_data[batch_no])

([['<cls>', 'this', 'movie', 'is', 'terrible', 'but', 'it', 'has', 'some', 'good', 'effects', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<cls>', 'i', 'wouldn', "'", 't', 'rent', 'this', 'one', 'even', 'on', 'dollar', 'rental', 'night', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<cls>', 'ming', 'the', 'merciless', 'does', 'a', 'little', '<unk>', 'and', 'a', 'movie', 'most', 'foul', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>

In [None]:
def word2id(bb, vocabidx):
  """Convert batches of tokens and string labels into batches of integers.

  Args:
    bb: list of (tokenlists, labels) batches.
    vocabidx: mapping token -> id; every token in `bb` must be a key.

  Returns:
    A new list of (id_tokenlists, id_labels) tuples. A label equal to 'pos'
    becomes 1, any other label becomes 0.
  """
  def encode_batch(tokenlists, labels):
    targets = [1 if label == 'pos' else 0 for label in labels]
    ids = [[vocabidx[token] for token in tokens] for tokens in tokenlists]
    return ids, targets

  return [encode_batch(tokenlists, labels) for tokenlists, labels in bb]

# Replace tokens by vocabulary ids and 'pos'/other labels by 1/0 in both splits.
train_data = word2id(bb=train_data, vocabidx=vocabidx)
test_data = word2id(bb=test_data, vocabidx=vocabidx)

In [None]:
import numpy as np
# Show the first ten batches after id conversion; each is (id_tokenlists, id_labels).
for batch_no in range(10):
    print(train_data[batch_no])

([[2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 15, 16, 17, 18, 19, 4, 20, 21, 22, 23, 24, 25, 14, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 26, 27, 28, 29, 30, 31, 0, 32, 30, 5, 33, 34, 35, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 36, 37, 6, 38, 6, 4, 39, 14, 40, 41, 30, 42, 43, 14, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 44, 17, 45, 46, 47, 48, 49, 17, 50, 21, 51, 44, 52, 53, 9, 14, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 54, 55, 56, 55, 57, 14, 58, 52, 15, 59, 60, 61, 62, 63, 64, 65, 66, 14, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 30, 67, 68, 69, 29, 70, 71, 62, 72, 73, 74, 55, 75, 32, 76, 77, 4, 5, 6, 14, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 4, 6, 27, 78, 5, 79, 68, 80, 14, 81, 82, 83, 55, 8,

# 2. LSTM + Dropout Model

In [None]:
class LSTM(torch.nn.Module):
  """Embedding -> dropout -> stacked LSTM -> linear classifier.

  Args:
    input_dim: number of rows of the embedding table (vocabulary size).
    output_dim: number of logits produced per example.
    emb_dim: width of the token embeddings.
    hidden_dim: hidden size of the LSTM.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability between stacked LSTM layers.
    emb_dropout: dropout probability applied to the embedded tokens.
    padding_idx: id of the padding token. The default 1 is the id that
      make_vocab assigns to '<pad>'; pass the real id if the vocabulary
      is built differently.
  """
  def __init__(self, input_dim, output_dim, emb_dim, hidden_dim, n_layers, dropout= 0.5,
               emb_dropout=0.25, padding_idx=1):
    super(LSTM, self).__init__()
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim
    self.emb = torch.nn.Embedding(input_dim, emb_dim, padding_idx = padding_idx)
    self.dropout = torch.nn.Dropout(emb_dropout)
    # Honor the constructor argument instead of a hard-coded rate.
    self.rnn = torch.nn.LSTM(emb_dim, hidden_dim, n_layers, dropout = dropout)
    self.fc = torch.nn.Linear(self.hidden_dim, output_dim)

  def forward(self, x):
    """Return class logits for a batch of token ids.

    Args:
      x: integer tensor of token ids laid out as (seq_len, batch), which is
        the layout the LSTM expects because batch_first is left at its default.

    Returns:
      Tensor of shape (batch, output_dim).
    """
    e = self.dropout(self.emb(x))
    out, (hidden, cell) = self.rnn(e)
    # NOTE(review): out[-1] is the last position of the padded sequence, so for
    # examples shorter than the batch maximum it is read after trailing '<pad>'
    # steps. Confirm this is intended before changing it, since it affects results.
    return self.fc(out[-1])

In [None]:
# Network sizes. train() and test() read these module-level names when they build their models.
input_dim = len(vocablist)  # one embedding row per vocabulary entry
emb_dim = 128
hidden_dim = 256
n_layers = 2
output_dim = 2              # two logits per review

# Build the network on the target device and show its layers.
model = LSTM(input_dim, output_dim, emb_dim, hidden_dim, n_layers, dropout=0.5).to(DEVICE)
print(model)

LSTM(
  (emb): Embedding(20439, 128, padding_idx=1)
  (dropout): Dropout(p=0.25, inplace=False)
  (rnn): LSTM(128, 256, num_layers=2, dropout=0.5)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)


# 3. Train Model

In [None]:
def train():
  """Train a fresh LSTM on `train_data` and save its weights to MODELNAME.

  Runs EPOCH passes over the pre-built batches with Adam (learning rate LR)
  and cross-entropy loss, printing the summed batch loss after every epoch.
  """
  net = LSTM(input_dim, output_dim, emb_dim, hidden_dim, n_layers).to(DEVICE)
  adam = torch.optim.Adam(net.parameters(), lr=LR)
  for epoch in range(EPOCH):
    running_loss = 0
    for batch_tokens, batch_labels in train_data:
      # Batches are stored as (batch, seq_len) lists; the model takes (seq_len, batch).
      inputs = torch.tensor(batch_tokens, dtype=torch.int64).transpose(0, 1).to(DEVICE)
      targets = torch.tensor(batch_labels, dtype=torch.int64).to(DEVICE)
      adam.zero_grad()
      logits = net(inputs)
      batchloss = F.cross_entropy(logits, targets)
      batchloss.backward()
      adam.step()
      running_loss += batchloss.item()
    print("epoch: {}, loss: {}".format(epoch, running_loss))
  torch.save(net.state_dict(), MODELNAME)
train()

epoch: 0, loss: 219.16124733537436
epoch: 1, loss: 193.1016277950257
epoch: 2, loss: 170.79890257492661
epoch: 3, loss: 144.36328473314643
epoch: 4, loss: 112.15216277167201
epoch: 5, loss: 101.40714793652296
epoch: 6, loss: 85.50334669649601
epoch: 7, loss: 79.64312739484012
epoch: 8, loss: 70.48879265785217
epoch: 9, loss: 61.847221137955785


# 4. Test Model

In [None]:
def test():
  """Load the weights saved under MODELNAME and report accuracy on `test_data`.

  Prints the number of correct predictions, the number of examples and their
  ratio. If `test_data` holds no examples the accuracy is reported as nan
  instead of raising.
  """
  total = 0
  correct = 0
  model = LSTM(input_dim, output_dim, emb_dim, hidden_dim, n_layers).to(DEVICE)
  # map_location lets a checkpoint written on a GPU be loaded on a CPU-only runtime.
  model.load_state_dict(torch.load(MODELNAME, map_location=DEVICE))
  model.eval()
  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    for tokenlists, labels in test_data:
      total += len(labels)
      # Batches are stored as (batch, seq_len) lists; the model takes (seq_len, batch).
      tokenlists = torch.tensor(tokenlists, dtype = torch.int64).transpose(0, 1).to(DEVICE)
      labels = torch.tensor(labels, dtype = torch.int64).to(DEVICE)
      y = model(tokenlists)
      y_pred = y.argmax(dim = 1)
      # Accumulate a plain int so the counter is valid even when no batch is seen.
      correct += (y_pred == labels).sum().item()
  accuracy = correct / total if total else float('nan')
  print("correct: {}".format(correct))
  print("total: {}".format(total))
  print("accuracy: {}".format(accuracy))

test()

correct: 22420
total: 25000
accuracy: 0.8968
