In [1]:
import numpy as np 
import pandas as pd 
# list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/rotten-tomatoes-reviews-dataset/rt-polarity-no-header.csv
/kaggle/input/rotten-tomatoes-reviews-dataset/data_rt.csv


In [2]:
# read the data with shuffling
root = "/kaggle/input/rotten-tomatoes-reviews-dataset/"
filename = "rt-polarity-no-header.csv"
data = pd.read_csv(root+filename).sample(
    frac = 1, random_state = 42)
data.head()

Unnamed: 0,0,"simplistic , silly and tedious ."
6830,1,another one of those estrogen overdose movies ...
8600,1,scott delivers a terrific performance in this ...
4080,0,i didn't find much fascination in the swinging...
3079,0,if you're not the target demographic . . . thi...
582,0,"simply put , there should have been a more com..."


In [3]:
# with specifications customize rt-polarity data into a torchtext dataset

from torchtext.utils import download_from_url, extract_archive
from torchtext.data.datasets_utils import _RawTextIterableDataset
from torchtext.data.datasets_utils import _wrap_split_argument
from torchtext.data.datasets_utils import _add_docstring_header
from torchtext.data.datasets_utils import _find_match
from torchtext.data.datasets_utils import _create_dataset_directory
from torchtext.data.datasets_utils import _create_data_from_csv
import os

NUM_LINES = {
    'train': 8662,
    'test': 2000,
}

DATASET_NAME = "RTPolarity"


@_add_docstring_header(num_lines=NUM_LINES, num_classes=2)
# @_create_dataset_directory(dataset_name=DATASET_NAME) # uncomment when working on your own directories
@_wrap_split_argument(('train', 'test'))
def RTPolarity(root, split):
    path = root # bad function def. but pytorch function does not accept 3 inputs
    return _RawTextIterableDataset(DATASET_NAME, NUM_LINES[split],
                                   _create_data_from_csv(root+filename))

In [4]:
train_iter = RTPolarity(root = root, split = "train")
print(len(train_iter))

8662


In [5]:
next(train_iter)

(0, 'simplistic , silly and tedious . ')

In [6]:
# import tokenizer and pretrained word embeddings
from torchtext.data.utils import get_tokenizer
from torch.utils.data import DataLoader, Dataset

tokenizer = get_tokenizer("basic_english")
from torchtext.vocab import GloVe
glove = GloVe(name='6B', dim=300)

.vector_cache/glove.6B.zip: 862MB [02:42, 5.29MB/s]                           
100%|█████████▉| 399999/400000 [00:53<00:00, 7413.37it/s]


In [7]:
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

In [8]:
# create vocabulary for the dataset 
from torchtext.vocab import build_vocab_from_iterator
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

In [9]:
vocab(["<unk>", "<pad>", "silly"])

[0, 1, 264]

In [10]:
def getMxLen(data_iter):
    MAXLEN = 0
    for _, text in data_iter:
        tmp = tokenizer(text)
        if(len(tmp) > MAXLEN):
            MAXLEN = len(tmp)
    return MAXLEN

In [11]:
train_iter = RTPolarity(root = root, split = "train")
MAXLEN = getMxLen(train_iter)
print("Maximum sentence length is ", MAXLEN)

Maximum sentence length is  62


In [12]:
# define the pipelines
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x)

In [13]:
from torch.utils.data import DataLoader
device = "cpu" #torch.device("cuda" if torch.cuda.is_available() else "cpu")

# batch collator for torch dataloaders
def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        #print(processed_text.shape)
        # add padding here 
        pp = torch.ones([MAXLEN - processed_text.shape[0]])
        text_list.append(torch.cat((processed_text, pp)))
        # offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    # offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.stack(text_list).int()
    return label_list.to(device), text_list.to(device)
train_iter = RTPolarity(split="train")
train_dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

In [14]:
# get samples from a dataloder
import torch
train_iter = RTPolarity(root = root, split="train")
train_dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)
ty = None
tx = None
shape = None
cnt = 0
for y, X in train_dataloader:
    if(cnt == 0):
      ty, tx = y, X
      print(ty.shape, tx.shape)
      shapex = tx.shape
      shapey = ty.shape
    assert(shapex == tx.shape)
    assert(shapey == ty.shape)
    cnt += 1

torch.Size([8]) torch.Size([8, 62])


In [15]:
# create embedding matrix 
matrix_len = len(vocab)
weights_matrix = np.zeros((matrix_len, 300))
words_found = 0

for i, word in enumerate(vocab.get_itos()):
    try: 
        weights_matrix[i] = glove.get_vecs_by_tokens(word)
        words_found += 1
    except KeyError:
        weights_matrix[i] = glove.get_vecs_by_tokens("<unk>")
print(weights_matrix.shape, words_found)

(18339, 300) 18339


In [16]:
# defining the embedding layer
import torch.nn as nn
def create_emb_layer(weights_matrix, freeze = True):
    num_embeddings, embedding_dim = weights_matrix.shape[0], weights_matrix.shape[1]
    emb_layer = nn.Embedding.from_pretrained(torch.Tensor(weights_matrix), freeze = freeze, sparse = True)
    #emb_layer.load_state_dict({'weight': weights_matrix})

    return emb_layer, num_embeddings, embedding_dim

In [17]:
# Define model
class NeuralNetwork(nn.Module):
    def __init__(self, weights_matrix):
        # define the attributes (class members)
        super(NeuralNetwork, self).__init__()
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
        print(num_embeddings, embedding_dim)
        trans_enc_layer = nn.TransformerEncoderLayer(d_model = embedding_dim, nhead = 3 ,batch_first=True)
        self.transformer = nn.TransformerEncoder(trans_enc_layer, num_layers=2)
        self.fc1 = nn.Linear(18600, 1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.flatten = nn.Flatten()
    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        x = self.sigmoid(x)
        return x

In [18]:
model = NeuralNetwork(weights_matrix)
print(model)
print("Number of trainable parameters is ", 
      sum(p.numel() for p in model.parameters() if p.requires_grad))

18339 300
NeuralNetwork(
  (embedding): Embedding(18339, 300, sparse=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
        )
        (linear1): Linear(in_features=300, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=300, bias=True)
        (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
        )
        (linear1): Linear(in_features=300, 

In [19]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 100
    start_time = time.time()

    for idx, (label, text) in enumerate(dataloader):
        label, text = label.to(device), text.to(device)
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label.reshape(-1,1).float())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            label, text = label.to(device), text.to(device)
            predicted_label = model(text)
            loss = criterion(predicted_label, label.reshape(-1,1).float())
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [20]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
# Hyperparameters
EPOCHS = 1 # epoch
LR = 0.001  # learning rate
BATCH_SIZE = 16 # batch size for training

criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None
train_iter, test_iter = RTPolarity(root = root)
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |   100/  515 batches | accuracy    0.637
| epoch   1 |   200/  515 batches | accuracy    0.608
| epoch   1 |   300/  515 batches | accuracy    0.618
| epoch   1 |   400/  515 batches | accuracy    0.603
| epoch   1 |   500/  515 batches | accuracy    0.621
-----------------------------------------------------------
| end of epoch   1 | time: 116.26s | valid accuracy    0.601 
-----------------------------------------------------------
