<a href="https://colab.research.google.com/github/myomyint-maung/nlp-assignments/blob/main/05-Sentiment-Analysis/05-Sentiment-Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import torch, torchdata, torchtext
from torch import nn
import time

In [2]:
# Choose computing device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
# Set SEED for reproducibility
SEED = 786
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

### 1. ETL

In [4]:
# Load AG_NEWS dataset from torchtext
from torchtext.datasets import  AG_NEWS
train_set, test_set = AG_NEWS()

### 2. EDA

In [5]:
# Check a sample from the training dataset
next(iter(train_set))

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [6]:
# Check the classes of labels in the training dataset.
# Iterate the datapipe directly and collect labels into a set, so the
# full dataset is never materialised as intermediate lists.
{label for label, _ in train_set}

{1, 2, 3, 4}

In [7]:
# Check the size of the datasets.
# Count samples lazily instead of building a list of every sample just to
# take its length.
train_size = sum(1 for _ in train_set)
test_size = sum(1 for _ in test_set)

train_size, test_size

(120000, 7600)

In [8]:
# Split the training dataset into training and validation data
train_data, val_data = train_set.random_split(total_length = train_size,
                                  weights = {"train_data": 0.8,
                                             "val_data": 0.2},
                                  seed = SEED)

In [9]:
# Check the sizes of the training and validation data.
# Count samples lazily instead of building a list of every sample.
# NOTE: this rebinds train_size from the full-set size to the split size.
# The random_split cell above passes train_size as total_length, so it must
# not be re-run after this cell without first re-running the size cell.
train_size = sum(1 for _ in train_data)
val_size = sum(1 for _ in val_data)

train_size, val_size

(96000, 24000)

### 3. Preprocessing

#### 3.1. Tokenization

In [10]:
# Import the get_tokenizer helper and create a spaCy tokenizer
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [11]:
# Create a function to get tokens out of datapipe objects
def yield_tokens(data_iter):
  """Lazily yield the tokenized text of each (label, text) pair in data_iter.

  The label is ignored; the text is passed through the notebook-level
  `tokenizer` and its result is yielded as-is.
  """
  yield from (tokenizer(text) for _label, text in data_iter)

#### 3.2. Numericalization

In [12]:
# Import vocab builder module
from torchtext.vocab import build_vocab_from_iterator

In [13]:
# Create vocab out of the training set
vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=['<unk>',
                                                                 '<pad>',
                                                                 '<bos>',
                                                                 '<eos>'])

In [14]:
# Set <unk> as the default index of the vocab
vocab.set_default_index(vocab['<unk>'])

In [15]:
# Build the index2word list (position = token index, value = token)
index2word = vocab.get_itos()

In [16]:
vocab(['<unk>', '<pad>', '<bos>', '<eos>'])

[0, 1, 2, 3]

In [17]:
index2word[0:4]

['<unk>', '<pad>', '<bos>', '<eos>']

In [18]:
len(vocab), len(index2word)

(100157, 100157)

### 4. FastText Embedding

In [19]:
# Import FastText module
from torchtext.vocab import FastText

In [20]:
# Load FastText embeddings
fast_vectors = FastText(language='simple')

In [21]:
# Select FastText embeddings for the vocab
fast_embeddings = fast_vectors.get_vecs_by_tokens(index2word).to(device)

In [22]:
fast_embeddings.shape

torch.Size([100157, 300])

In [23]:
index2word[100]

'could'

In [24]:
fast_embeddings[100]

tensor([ 0.0037,  0.1285,  0.0932, -0.2673,  0.1011,  0.1499,  0.1140,  0.1750,
        -0.0178,  0.4580, -0.0826,  0.0063, -0.0187, -0.2067,  0.0396, -0.2027,
         0.0928,  0.1267, -0.1514, -0.2401,  0.0364,  0.4860, -0.1342, -0.1955,
         0.1271,  0.0131,  0.0889, -0.1578,  0.1628,  0.0940,  0.0730,  0.0811,
         0.2384,  0.2504,  0.1369,  0.0018,  0.0270, -0.2334,  0.0319, -0.0582,
         0.0489, -0.0808,  0.4670, -0.1315, -0.0889,  0.0124, -0.1562, -0.1325,
         0.1588,  0.2278,  0.0296, -0.1468, -0.0101,  0.1050,  0.1034, -0.1700,
        -0.0410,  0.2820,  0.0088,  0.2146,  0.0196, -0.0028,  0.0834, -0.2325,
        -0.1137, -0.2882, -0.2333, -0.0011,  0.2132, -0.1755,  0.0518,  0.2206,
         0.0532,  0.2875,  0.0120, -0.1471, -0.0881,  0.1321,  0.0479,  0.0872,
        -0.0632,  0.0183, -0.2970, -0.1001,  0.1149, -0.2211, -0.1006,  0.1413,
         0.1636, -0.1715,  0.1063,  0.0223,  0.1345,  0.0137, -0.1597,  0.2564,
        -0.2729,  0.2309,  0.0853,  0.23

### 5. Preparing Dataloader

In [25]:
# Import DataLoader and pad_sequence from PyTorch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [26]:
# Create a function to collate batches of data
def collate_batch(batch):
  """Turn a list of (label, text) samples into padded batch tensors.

  Returns a 3-tuple of int64 tensors:
    * labels, each shifted down by one (int(label) - 1),
    * token ids padded to the longest text in the batch with the index of
      '<pad>' (batch-first layout),
    * the unpadded token count of every text.
  """
  labels, sequences, seq_lengths = [], [], []

  for raw_label, raw_text in batch:
    # Shift labels so they start at 0
    labels.append(int(raw_label) - 1)

    # Tokenize, then map tokens to vocab indices
    token_ids = torch.tensor(vocab(tokenizer(raw_text)), dtype=torch.int64)
    sequences.append(token_ids)
    seq_lengths.append(token_ids.size(0))

  label_tensor = torch.tensor(labels, dtype=torch.int64)
  padded_text = pad_sequence(sequences,
                             padding_value=vocab['<pad>'],
                             batch_first=True)
  length_tensor = torch.tensor(seq_lengths, dtype=torch.int64)

  return label_tensor, padded_text, length_tensor

In [27]:
batch_size = 64

# Only the training data is shuffled: shuffling gives no benefit for
# evaluation and makes validation/test batch order non-deterministic.
train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=collate_batch)

val_loader = DataLoader(val_data, batch_size=batch_size,
                        shuffle=False, collate_fn=collate_batch)

test_loader = DataLoader(test_set, batch_size=batch_size,
                         shuffle=False, collate_fn=collate_batch)

In [28]:
for label, text, length in train_loader:
  break

label, text, length

(tensor([2, 2, 0, 2, 0, 2, 0, 3, 2, 2, 2, 0, 3, 3, 2, 1, 0, 2, 0, 1, 1, 0, 2, 2,
         0, 1, 2, 1, 0, 3, 0, 2, 3, 0, 3, 3, 1, 0, 1, 0, 2, 3, 3, 0, 3, 1, 3, 2,
         3, 1, 1, 1, 3, 1, 3, 2, 3, 1, 2, 3, 2, 1, 1, 0]),
 tensor([[  973,   605,    32,  ...,     1,     1,     1],
         [12029,  5046, 12726,  ...,     1,     1,     1],
         [19742,    77,  7856,  ...,     1,     1,     1],
         ...,
         [73383, 15790,    16,  ...,     1,     1,     1],
         [  240,     5,    67,  ...,     1,     1,     1],
         [ 1779,     7,  7799,  ...,     1,     1,     1]]),
 tensor([46, 49, 25, 44, 53, 48, 82, 57, 40, 46, 45, 50, 21, 48, 44, 56, 46, 52,
         38, 37, 49, 29, 67, 50, 42, 42, 45, 45, 45, 45, 49, 35, 36, 40, 28, 34,
         77, 42, 45, 42, 51, 34, 91, 50, 43, 37, 49, 31, 29, 38, 46, 38, 87, 59,
         57, 38, 42, 48, 61, 18, 57, 42, 34, 56]))

In [29]:
label.shape, text.shape, length.shape

(torch.Size([64]), torch.Size([64, 91]), torch.Size([64]))

### 6. Designing the Model

In [30]:
# Create the LSTM model
class LSTM(nn.Module):
  """LSTM text classifier: embedding -> (bi)LSTM -> linear layer.

  Args:
    input_dim: number of rows in the embedding table (vocabulary size).
    emb_dim: embedding dimension.
    hid_dim: LSTM hidden size per direction.
    output_dim: number of output logits.
    num_layers: number of stacked LSTM layers.
    bidirectional: whether the LSTM runs in both directions.
    dropout: dropout between stacked LSTM layers (only used when
      num_layers > 1).
    pad_idx: index of the padding token in the embedding table. When
      omitted, it is looked up as vocab['<pad>'] from the notebook-level
      vocab, which matches the previous behaviour.
  """

  def __init__(self, input_dim, emb_dim, hid_dim, output_dim, num_layers, bidirectional, dropout, pad_idx=None):
    super().__init__()

    # Fall back to the notebook-level vocab only when no index is supplied
    if pad_idx is None:
      pad_idx = vocab['<pad>']

    # Remembered so forward() knows how to assemble the final hidden state
    self.bidirectional = bidirectional

    self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)

    self.lstm = nn.LSTM(emb_dim, hid_dim,
                        num_layers = num_layers,
                        bidirectional = bidirectional,
                        # Inter-layer dropout has no effect with one layer;
                        # passing 0 avoids nn.LSTM's warning in that case
                        dropout = dropout if num_layers > 1 else 0.0,
                        batch_first = True)

    # The classifier input is one hidden vector per direction
    num_directions = 2 if bidirectional else 1
    self.fc = nn.Linear(hid_dim * num_directions, output_dim)

  def forward(self, x, lengths):
    """Return logits for a padded batch of token ids.

    Args:
      x: batch-first tensor of token indices, padded per batch.
      lengths: unpadded length of each sequence in x.

    Returns:
      Tensor with one row of output_dim logits per sequence.
    """
    embedded_x = self.embedding(x)

    # Pack so the LSTM skips padded positions; lengths must live on the CPU
    pack_embedded = nn.utils.rnn.pack_padded_sequence(embedded_x,
                                                      lengths.to('cpu'),
                                                      enforce_sorted=False,
                                                      batch_first=True)

    # Only the final hidden state is needed, so the per-step outputs are
    # neither unpacked nor kept
    _, (h, _) = self.lstm(pack_embedded)

    if self.bidirectional:
      # The last two entries of h are the two directions of the top layer
      last_hidden_state = torch.cat((h[-1,:,:], h[-2,:,:]), dim=1)
    else:
      # Unidirectional: the last entry of h is the top layer
      last_hidden_state = h[-1,:,:]

    return self.fc(last_hidden_state)