<a href="https://colab.research.google.com/github/mk-a/NLP_Duty1/blob/master/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
import pickle
import torch.nn as nn
import time
import gc
import copy

In [16]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
data = pd.read_pickle("/content/drive/My Drive/Colab Notebooks/IFT6285/train_post_preprocessed.pkl")

In [0]:
def text2seq(X, y, vocab_max, seq_len, unk_tok='UNK', padd_tok='PAD' ):
  """ For a list of inputs X with different sequence length, associated to the labels y.
      Builds the list of the vocab_max-2 most common words. Maps each of them to a number.
      Replace the other words by the token unk_tok, which is mapped to the value 1.
      Then force the input to have a sequence length of seq_len with this policy:
        - If the sample x is short than seq_len, then adds padding at the begining.
          The padding token is set by the variable padd_tok. And mapped to the value 0.
        - If the sample x is longer than seq_len, then extracts (len(x)//seq_len)+1
          sequences from x.
  """
  count_words = Counter()
  for post in X:
    for tok in post:
      count_words[tok] += 1
  vocab = {w for w,_ in count_words.most_common(vocab_max-2)}.union( {unk_tok, padd_tok})
  word2val = {w : i+2 for i, (w,_) in enumerate( count_words.most_common(vocab_max-2) )}
  word2val[padd_tok] = 0
  word2val[unk_tok] = 1

  X2 = []
  y2 = []
  for i, x in enumerate(X):
    if len(x) < seq_len:
      tmp = x.copy()
      for _ in range(seq_len - len(x)):
        tmp.insert(0, padd_tok)
      X2.append([word2val[tok] if tok in vocab else 1 for tok in tmp])
      y2.append(y[i])
    elif len(x) == seq_len:
      X2.append([word2val[tok] if tok in vocab else 1 for tok in x]) 
      y2.append(y[i])
    else :
      for j in range(len(x)//seq_len):
        X2.append([word2val[tok] if tok in vocab else 1 for tok in  x[j*seq_len:(j+1)*seq_len]])
        y2.append(y[i])
      X2.append([word2val[tok] if tok in vocab else 1 for tok in  x[-seq_len:]])
      y2.append(y[i])
  return word2val, X2, y2

In [21]:
vocab_max = 284467
word2val, X, y = text2seq(data.text.values, data.label.values, vocab_max, 200) 
del data
gc.collect()

0

In [0]:
with open('/content/drive/My Drive/Colab Notebooks/IFT6285/word2val_{}.pkl'.format(vocab_max), 'wb') as handle:
    pickle.dump(word2val, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('/content/drive/My Drive/Colab Notebooks/IFT6285/X_v{}_s200.pkl'.format(vocab_max), 'wb') as handle:
    pickle.dump(X, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('/content/drive/My Drive/Colab Notebooks/IFT6285/y_s200.pkl', 'wb') as handle:
    pickle.dump(y, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
# with open('/content/drive/My Drive/Colab Notebooks/IFT6285/word2val_80k.pkl', 'rb') as handle:
#     word2val = pickle.load(handle)
# with open('/content/drive/My Drive/Colab Notebooks/IFT6285/X_v80k_s200.pkl', 'rb') as handle:
#     X = pickle.load(handle)
# with open('/content/drive/My Drive/Colab Notebooks/IFT6285/y_s200.pkl', 'rb') as handle:
#     y = pickle.load(handle)
# vocab_max = 80000

In [0]:
def downsample(df, colum_name):
    size_min = float('inf')
    list_df = []
    labels = df[colum_name].unique()
    for label in labels:
        size = len(df.loc[df[colum_name] == label])
        if size < size_min:
            size_min = size
    for label in labels:
        list_df.append(df.loc[df[colum_name] == label].sample(size_min))
    return pd.concat(list_df).sample(frac=1).reset_index(drop=True)

In [28]:
# df = downsample(pd.DataFrame({'X':X, 'y':y}), 'y')
# X2 = list(df.X.values)
# y2 = list(df.y.values)
# del df
# gc.collect()

68

In [0]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)
# X_train, X_valid, y_train, y_valid = train_test_split(X2, y2, test_size=0.33, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, test_size=0.5, random_state=42)
X_train = torch.tensor(X_train)
X_valid = torch.tensor(X_valid)
X_test = torch.tensor(X_test)
y_train = torch.tensor(y_train)
y_valid = torch.tensor(y_valid)
y_test = torch.tensor(y_test)
train_data = torch.utils.data.TensorDataset(X_train, y_train)
valid_data = torch.utils.data.TensorDataset(X_valid, y_valid)
test_data = torch.utils.data.TensorDataset(X_test, y_test)

In [0]:
batch_size = 256

train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = torch.utils.data.DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [0]:
class RNN(nn.Module):
    def __init__(self,
                 emb_size,
                 hidden_size,
                 vocab_size,
                 num_layers,
                 output_size,
                 nonlinearity = 'relu',
                 bias = True,
                 dropout = 0,
                 bidirectional = True
                ):
        super(RNN, self).__init__()
        #hyper-parameters
        self.emb_size      = emb_size
        self.hidden_size   = hidden_size
        self.vocab_size    = vocab_size
        self.num_layers    = num_layers
        self.output_size   = output_size
        self.nonlinearity  = nonlinearity
        self.bias          = bias
        self.dropout       = dropout
        self.bidirectional = bidirectional
        #layers
        self.embedding = nn.Embedding(vocab_size, emb_size).to(device)
        self.rnn = nn.GRU(input_size = emb_size,
                          hidden_size = hidden_size,
                          num_layers = num_layers,
                          bias = bias,
                          dropout = dropout,
                          bidirectional = bidirectional
                         ).to(device)
        self.linear = nn.Linear((bidirectional+1) *hidden_size, output_size).to(device)
        
    def forward(self, X):
        out = self.embedding(X.t())
        out, _ = self.rnn(out)
        return self.linear(out[-1])

In [0]:
def compute_accuracy(model, data_loader):
  correct = 0
  for i, (X_batch, y_batch) in enumerate(data_loader):
    torch.cuda.empty_cache()
    X_batch = X_batch.to(device)
    y_batch = y_batch.to(device)
    out = model.forward(X_batch.to(device))
    correct += (torch.max(out, 1)[1] == y_batch).float().sum()
  return((100*correct/len(test_loader.dataset)).item())

In [0]:
def run_epoch(n_epochs, model, criterion, optimizer, train_loader, valid_loader):
  start_time = time.time()
  last_time = time.time()
  prev_valid_acc = 0
  prev_state_dict = None
  for epoch in range(1, n_epochs+1):
      print("Epoch: {}/{}".format(epoch, n_epochs))
      running_loss = 0
      correct = 0
      for i, (X_batch, y_batch) in enumerate(train_loader):
          torch.cuda.empty_cache()
          X_batch = X_batch.to(device)
          y_batch = y_batch.to(device)
          optimizer.zero_grad()
          out = model.forward(X_batch.to(device))
          loss = criterion(out, y_batch)
          loss.backward()
          optimizer.step()
          running_loss += loss.item()
          correct += (torch.max(out, 1)[1] == y_batch).float().sum()
          if time.time() - last_time > 30:
              print("Samples:{}/{}\tloss: {:.4f}\tacc: {:2.3f}\telapsed_time: {:.1f}s"\
                    .format( (i+1)*batch_size, len(train_loader.dataset),\
                            running_loss/((i+1)*batch_size), 100*correct/((i+1)*batch_size), time.time()-start_time), end='\r')
              last_time = time.time()
      valid_acc = compute_accuracy(model,valid_loader)
      print("loss: {:.6f}\tacc: {:2.3f}\telapsed_time: {:.1f}s\tvalid_acc: {:2.3f}".format(
          running_loss/((i+1)*batch_size), 100*correct/((i+1)*batch_size), time.time()-start_time, valid_acc))
      if prev_valid_acc > valid_acc:
        print("Stopping criteria met. Returning the model state of the previous epoch.")
        model.load_state_dict(prev_state_dict)
        return
      prev_valid_acc  = valid_acc
      prev_state_dict = copy.deepcopy(model.state_dict())

In [0]:
model = 'GRU'
emb_size = 64
hidden_size = 64
num_layers = 6
output_size = 3
bidirectional=True
filename = '{}{}_emb{}_hid{}_lay{}_vocab{}.pt'.format('bi' if bidirectional else '', model, emb_size, hidden_size, num_layers, vocab_max )


rnn = RNN(emb_size = emb_size, hidden_size = hidden_size, vocab_size = vocab_max,
          num_layers = num_layers, output_size = output_size, bidirectional=bidirectional)

In [0]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), weight_decay=10e-5)

In [0]:
run_epoch(n_epochs=20, model=rnn, criterion=criterion, optimizer=optimizer, train_loader=train_loader, valid_loader=valid_loader)

Epoch: 1/20
loss: 0.003339	acc: 60.185	elapsed_time: 509.7s	valid_acc: 63.449
Epoch: 2/20
loss: 0.002988	acc: 65.017	elapsed_time: 1015.8s	valid_acc: 66.054
Epoch: 3/20
loss: 0.002789	acc: 67.718	elapsed_time: 1521.1s	valid_acc: 69.156
Epoch: 4/20
loss: 0.002627	acc: 69.896	elapsed_time: 2025.7s	valid_acc: 70.402
Epoch: 5/20
loss: 0.002525	acc: 71.327	elapsed_time: 2536.7s	valid_acc: 70.956
Epoch: 6/20


In [0]:
compute_accuracy(rnn, valid_loader)

In [30]:
path = '/content/drive/My Drive/Colab Notebooks/IFT6285/'
torch.save(rnn, path+filename)

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
