In [1]:
!pip install torchdata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchdata
  Downloading torchdata-0.3.0-py3-none-any.whl (47 kB)
[K     |████████████████████████████████| 47 kB 2.9 MB/s 
Collecting urllib3>=1.25
  Downloading urllib3-1.26.9-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 9.2 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 65.0 MB/s 
Installing collected packages: urllib3, torchdata
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Successfully ins

In [2]:
import torch
import torch.nn.functional as F
import torchtext

train_iter,test_iter=torchtext.datasets.IMDB(split=('train','test'))
tokenizer=torchtext.data.utils.get_tokenizer('basic_english')


In [3]:
MODELNAME='imdb-rnn.model'
EPOCH=10
BATCHSIZE=64
LR=1e-5
DEVICE="cuda" if torch.cuda.is_available() else "cpu"

In [4]:
train_data = [(label,tokenizer(line)) for label,line in train_iter]
train_data.sort(key=lambda x:len(x[1]))
test_data = [(label,tokenizer(line)) for label,line in test_iter]
test_data.sort(key=lambda x:len(x[1]))

for i in range(10):
  print(train_data[i])

('neg', ['this', 'movie', 'is', 'terrible', 'but', 'it', 'has', 'some', 'good', 'effects', '.'])
('neg', ['i', 'wouldn', "'", 't', 'rent', 'this', 'one', 'even', 'on', 'dollar', 'rental', 'night', '.'])
('neg', ['ming', 'the', 'merciless', 'does', 'a', 'little', 'bardwork', 'and', 'a', 'movie', 'most', 'foul', '!'])
('pos', ['adrian', 'pasdar', 'is', 'excellent', 'is', 'this', 'film', '.', 'he', 'makes', 'a', 'fascinating', 'woman', '.'])
('neg', ['you', "'", 'd', 'better', 'choose', 'paul', 'verhoeven', "'", 's', 'even', 'if', 'you', 'have', 'watched', 'it', '.'])
('neg', ['long', ',', 'boring', ',', 'blasphemous', '.', 'never', 'have', 'i', 'been', 'so', 'glad', 'to', 'see', 'ending', 'credits', 'roll', '.'])
('neg', ['a', 'rating', 'of', '1', 'does', 'not', 'begin', 'to', 'express', 'how', 'dull', ',', 'depressing', 'and', 'relentlessly', 'bad', 'this', 'movie', 'is', '.'])
('pos', ['this', 'is', 'the', 'definitive', 'movie', 'version', 'of', 'hamlet', '.', 'branagh', 'cuts', 'nothi

In [5]:
def make_vocab(train_data,min_freq):
  vocab={}
  for label, tokenlist in train_data:
    for token in tokenlist:
      if token not in vocab:
        vocab[token]=0
      vocab[token]+=1
  vocablist = [('<unk>',0),('<pad>',0),('<cls>',0),('<eos>',0)]
  vocabidx = {}
  for token,freq in vocab.items():
    if freq >= min_freq:
      idx = len(vocablist)
      vocablist.append((token,freq))
      vocabidx[token]=idx
  vocabidx['<unk>']=0
  vocabidx['<pad>']=1
  vocabidx['<cls>']=2
  vocabidx['<eos>']=3
  return vocablist,vocabidx

vocablist, vocabidx = make_vocab(train_data,10)
for i in range(10):
  print(train_data[i])

('neg', ['this', 'movie', 'is', 'terrible', 'but', 'it', 'has', 'some', 'good', 'effects', '.'])
('neg', ['i', 'wouldn', "'", 't', 'rent', 'this', 'one', 'even', 'on', 'dollar', 'rental', 'night', '.'])
('neg', ['ming', 'the', 'merciless', 'does', 'a', 'little', 'bardwork', 'and', 'a', 'movie', 'most', 'foul', '!'])
('pos', ['adrian', 'pasdar', 'is', 'excellent', 'is', 'this', 'film', '.', 'he', 'makes', 'a', 'fascinating', 'woman', '.'])
('neg', ['you', "'", 'd', 'better', 'choose', 'paul', 'verhoeven', "'", 's', 'even', 'if', 'you', 'have', 'watched', 'it', '.'])
('neg', ['long', ',', 'boring', ',', 'blasphemous', '.', 'never', 'have', 'i', 'been', 'so', 'glad', 'to', 'see', 'ending', 'credits', 'roll', '.'])
('neg', ['a', 'rating', 'of', '1', 'does', 'not', 'begin', 'to', 'express', 'how', 'dull', ',', 'depressing', 'and', 'relentlessly', 'bad', 'this', 'movie', 'is', '.'])
('pos', ['this', 'is', 'the', 'definitive', 'movie', 'version', 'of', 'hamlet', '.', 'branagh', 'cuts', 'nothi

In [6]:
def preprocess(data,vocabidx):
  rr=[]
  for label, tokenlist in data:
    tkl=['<cls>']
    for token in tokenlist:
      tkl.append(token if token in vocabidx else "<unk>")
    tkl.append("<eos>")
    rr.append((label,tkl))
  return rr

train_data = preprocess(train_data,vocabidx)
test_data=preprocess(test_data,vocabidx)
for i in range(10):
  print(train_data[i])

('neg', ['<cls>', 'this', 'movie', 'is', 'terrible', 'but', 'it', 'has', 'some', 'good', 'effects', '.', '<eos>'])
('neg', ['<cls>', 'i', 'wouldn', "'", 't', 'rent', 'this', 'one', 'even', 'on', 'dollar', 'rental', 'night', '.', '<eos>'])
('neg', ['<cls>', 'ming', 'the', 'merciless', 'does', 'a', 'little', '<unk>', 'and', 'a', 'movie', 'most', 'foul', '!', '<eos>'])
('pos', ['<cls>', 'adrian', 'pasdar', 'is', 'excellent', 'is', 'this', 'film', '.', 'he', 'makes', 'a', 'fascinating', 'woman', '.', '<eos>'])
('neg', ['<cls>', 'you', "'", 'd', 'better', 'choose', 'paul', 'verhoeven', "'", 's', 'even', 'if', 'you', 'have', 'watched', 'it', '.', '<eos>'])
('neg', ['<cls>', 'long', ',', 'boring', ',', 'blasphemous', '.', 'never', 'have', 'i', 'been', 'so', 'glad', 'to', 'see', 'ending', 'credits', 'roll', '.', '<eos>'])
('neg', ['<cls>', 'a', 'rating', 'of', '1', 'does', 'not', 'begin', 'to', 'express', 'how', 'dull', ',', 'depressing', 'and', 'relentlessly', 'bad', 'this', 'movie', 'is', '.

In [7]:
def make_batch(data,batchsize):
  bb=[]
  blabel=[]
  btokenlist=[]
  for label, tokenlist in data:
    blabel.append(label)
    btokenlist.append(tokenlist)
    if len(blabel) >= batchsize:
      bb.append((btokenlist,blabel))
      blabel=[]
      btokenlist=[]
  if len(blabel) > 0:
    bb.append((btokenlist,blabel))
  return bb

train_data = make_batch(train_data,BATCHSIZE)
test_data=make_batch(test_data,BATCHSIZE)
for i in range(10):
  print(train_data[i])

([['<cls>', 'this', 'movie', 'is', 'terrible', 'but', 'it', 'has', 'some', 'good', 'effects', '.', '<eos>'], ['<cls>', 'i', 'wouldn', "'", 't', 'rent', 'this', 'one', 'even', 'on', 'dollar', 'rental', 'night', '.', '<eos>'], ['<cls>', 'ming', 'the', 'merciless', 'does', 'a', 'little', '<unk>', 'and', 'a', 'movie', 'most', 'foul', '!', '<eos>'], ['<cls>', 'adrian', 'pasdar', 'is', 'excellent', 'is', 'this', 'film', '.', 'he', 'makes', 'a', 'fascinating', 'woman', '.', '<eos>'], ['<cls>', 'you', "'", 'd', 'better', 'choose', 'paul', 'verhoeven', "'", 's', 'even', 'if', 'you', 'have', 'watched', 'it', '.', '<eos>'], ['<cls>', 'long', ',', 'boring', ',', 'blasphemous', '.', 'never', 'have', 'i', 'been', 'so', 'glad', 'to', 'see', 'ending', 'credits', 'roll', '.', '<eos>'], ['<cls>', 'a', 'rating', 'of', '1', 'does', 'not', 'begin', 'to', 'express', 'how', 'dull', ',', 'depressing', 'and', 'relentlessly', 'bad', 'this', 'movie', 'is', '.', '<eos>'], ['<cls>', 'this', 'is', 'the', 'definitiv

In [8]:
def padding(bb):
  for tokenlists,labels in bb:
    maxlen=max([len(x)for x in tokenlists])
    for tkl in tokenlists:
      for i in range(maxlen-len(tkl)):
        tkl.append("<pad>")
  return bb
train_data = padding(train_data)
test_data=padding(test_data)

for i in range(10):
  print(train_data[i])

([['<cls>', 'this', 'movie', 'is', 'terrible', 'but', 'it', 'has', 'some', 'good', 'effects', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<cls>', 'i', 'wouldn', "'", 't', 'rent', 'this', 'one', 'even', 'on', 'dollar', 'rental', 'night', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'], ['<cls>', 'ming', 'the', 'merciless', 'does', 'a', 'little', '<unk>', 'and', 'a', 'movie', 'most', 'foul', '!', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>

In [9]:
def word2id(bb, vocabidx):
  rr=[]
  for tokenlists, labels in bb:
    id_labels = [1 if label=='pos' else 0 for label in labels]
    id_tokenlists=[]
    for tokenlist in tokenlists:
      id_tokenlists.append([vocabidx[token] for token in tokenlist])
    rr.append((id_tokenlists,id_labels))
  return rr
train_data=word2id(train_data,vocabidx)
test_data=word2id(test_data,vocabidx)
for i in range(5):
  print(train_data[i])

([[2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 15, 16, 17, 18, 19, 4, 20, 21, 22, 23, 24, 25, 14, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 26, 27, 28, 29, 30, 31, 0, 32, 30, 5, 33, 34, 35, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 36, 37, 6, 38, 6, 4, 39, 14, 40, 41, 30, 42, 43, 14, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 44, 17, 45, 46, 47, 48, 49, 17, 50, 21, 51, 44, 52, 53, 9, 14, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 54, 55, 56, 55, 57, 14, 58, 52, 15, 59, 60, 61, 62, 63, 64, 65, 66, 14, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 30, 67, 68, 69, 29, 70, 71, 62, 72, 73, 74, 55, 75, 32, 76, 77, 4, 5, 6, 14, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2, 4, 6, 27, 78, 5, 79, 68, 80, 14, 81, 82, 83, 55, 8,

In [10]:
class MyRNN(torch.nn.Module):
  def __init__(self):
    super(MyRNN,self).__init__()
    vocabsize=len(vocablist)
    self.emb=torch.nn.Embedding(vocabsize,300,padding_idx=vocabidx['<pad>'])
    self.l1=torch.nn.Linear(300,300)
    self.l2=torch.nn.Linear(300,2)
  def forward(self,x):
    e=self.emb(x)
    h=torch.zeros(e[0].size(), dtype=torch.float32).to(DEVICE)
    for i in range(x.size()[0]):
      h=F.relu(e[i]+self.l1(h))
    return self.l2(h)

In [11]:
def train():
  model=MyRNN().to(DEVICE)
  optimizer = torch.optim.Adam(model.parameters(),lr=LR)
  for epoch in range(EPOCH):
    loss=0
    for tokenlists, labels in train_data:
      tokenlists=torch.tensor(tokenlists,dtype=torch.int64).transpose(0,1).to(DEVICE)
      labels=torch.tensor(labels,dtype=torch.int64).to(DEVICE)
      optimizer.zero_grad()
      y=model(tokenlists)
      batchloss=F.cross_entropy(y,labels)
      batchloss.backward()
      optimizer.step()
      loss=loss+batchloss.item()

    print('epoch ',epoch," loss ",loss)
  torch.save(model.state_dict(),MODELNAME)

In [12]:
def test():
  total=0
  correct=0
  model=MyRNN().to(DEVICE)
  model.load_state_dict(torch.load(MODELNAME))
  model.eval()
  for tokenlists,labels in test_data:
    total+=len(labels)
    tokenlists=torch.tensor(tokenlists,dtype=torch.int64).transpose(0,1).to(DEVICE)
    labels=torch.tensor(labels,dtype=torch.int64).to(DEVICE)
    y=model(tokenlists)
    pred_labels=y.max(dim=1)[1]
    correct+=(pred_labels==labels).sum()
  print("correct ",correct.item())
  print("total ",total)
  print("accu", (correct.item()/float(total)))

In [13]:
train()

epoch  0  loss  254.5536586344242
epoch  1  loss  240.02610030770302
epoch  2  loss  238.14317044615746
epoch  3  loss  237.37758296728134
epoch  4  loss  236.8774155676365
epoch  5  loss  236.4855078458786
epoch  6  loss  236.14370599389076
epoch  7  loss  235.82157424092293
epoch  8  loss  235.5112748593092
epoch  9  loss  235.20881068706512


In [14]:
test()

correct  16944
total  25000
accu 0.67776
