## Clone the GitHub Repository

In [1]:
!git clone https://github.com/mehedihasanbijoy/PyTorch-NLP-Tutorial.git

fatal: destination path 'PyTorch-NLP-Tutorial' already exists and is not an empty directory.


## Load the dataset

In [2]:
import pandas as pd 

# Read the tweet corpus, keep only the text and label columns, drop rows with
# missing values and renumber the index from 0.
corpus_path = '/content/PyTorch-NLP-Tutorial/1. Text Classification/corpus/TweetSentiment.csv'
df = (
    pd.read_csv(corpus_path)
    .loc[:, ['preprocessed_text', 'label']]
    .dropna()
    .reset_index(drop=True)
)
# Show five random rows as a sanity check.
df.sample(5)

Unnamed: 0,preprocessed_text,label
23080,happy mother day to all mom out there,2
7837,ehi might drive through chitown on my way to c...,1
25464,awake sadly seeing leon today,0
29699,good morning been here since am just quiet how...,2
15838,game who want itu already know who the league ...,1


## Split the dataset into train and test sets

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the labels keeps the
# class proportions equal in both splits; the fixed seed makes the split repeatable.
all_texts = df['preprocessed_text'].tolist()
all_labels = df['label'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    stratify=all_labels,
    random_state=64,
)

In [4]:
def find_len(X):
    """Return the number of whitespace-separated tokens in the string ``X``."""
    tokens = X.split()
    return len(tokens)

def sort_by_length(X, y):
    """Sort texts and their labels together by ascending token count.

    Args:
        X: sequence of text strings.
        y: sequence of labels, aligned with ``X``.

    Returns:
        A ``(texts, labels)`` tuple of lists, shortest text first. Texts with
        the same token count keep their incoming relative order.
    """
    frame = pd.DataFrame({'X': X, 'y': y})
    frame['len'] = frame['X'].apply(find_len)
    # The default sort kind (quicksort) is not stable, so the order of
    # equal-length texts would be implementation-defined; mergesort is stable.
    frame = frame.sort_values(by='len', ascending=True, kind='mergesort')
    return list(frame['X']), list(frame['y'])

In [5]:
# Reorder each split so the shortest texts (fewest whitespace tokens) come first.
# NOTE(review): the training DataLoader is created with shuffle=True, so batches
# are not drawn in this order — confirm the sort is still needed.
X_train, y_train = sort_by_length(X_train, y_train)
X_test, y_test = sort_by_length(X_test, y_test)

In [6]:
from collections import Counter

# Report the number of samples and the per-class label counts of each split.
# The split was stratified on the labels, so both distributions should have
# the same class proportions.
print(f'Train data instances: {len(X_train)}\nClass distribution: {Counter(y_train)}')
print(f'\nTest data instances: {len(X_test)}\nClass distribution: {Counter(y_test)}')

Train data instances: 24808
Class distribution: Counter({1: 10035, 2: 7748, 0: 7025})

Test data instances: 6202
Class distribution: Counter({1: 2509, 2: 1937, 0: 1756})


In [7]:
import random

# Pair every label with its text; downstream code consumes (label, text) tuples.
train_data = [(label, text) for label, text in zip(y_train, X_train)]
test_data = [(label, text) for label, text in zip(y_test, X_test)]

# Peek at five training pairs (drawn with replacement, so repeats are possible).
random.choices(train_data, k=5)

[(0,
  'darn it im craving wedding cake a craving thats very hard to satisfy with anything else'),
 (1, 'tonight is the last jay leno late nigt show'),
 (0,
  'drat all my land boot died at once i think of boot like parsley one going to seed the other first season what to do now'),
 (1, 'bandoni ok see you at da climbing'),
 (0, 'i wa scared by the daleks in the dw exhibition in cardiff')]

## DataLoader

In [8]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [9]:
# from torchtext.data.utils import get_tokenizer
# tokenizer = get_tokenizer('basic_english')

def tokenizer(x):
    """Lower-case ``x`` and split it on whitespace into a list of tokens."""
    lowered = x.lower()
    return lowered.split()

def yield_tokens(data_iterator):
    """Lazily yield the token list of every text in ``data_iterator``.

    ``data_iterator`` yields ``(label, text)`` pairs; the label is ignored.
    """
    yield from (tokenizer(text) for _label, text in data_iterator)

In [10]:
from torchtext.vocab import build_vocab_from_iterator

# Build the vocabulary from the training pairs only. A single named special
# token stands in for every word that is not in the vocabulary.
UNK_TOKEN = '<unk>'
VOCAB = build_vocab_from_iterator(yield_tokens(train_data), specials=[UNK_TOKEN])
# Unknown tokens resolve to the index of UNK_TOKEN instead of raising on lookup.
VOCAB.set_default_index(VOCAB[UNK_TOKEN])

In [11]:
# create pipelines
def TEXT_PIPELINE(x):
    """Tokenize the string ``x`` and look every token up in ``VOCAB``."""
    return VOCAB(tokenizer(x))


def LABEL_PIPELINE(x):
    """Convert the label ``x`` to ``int``."""
    return int(x)


# pipelines in action
print(TEXT_PIPELINE('This is an example'))
print(LABEL_PIPELINE('2'))

[29, 9, 87, 5001]
2


In [12]:
# batch collate function
def collate_batch(batch):
    """Collate ``(label, text)`` pairs into flat tensors on ``DEVICE``.

    Returns a ``(labels, texts, offsets)`` tuple:
      * ``labels``  - int64 tensor with one label per sample,
      * ``texts``   - int64 tensor with the token ids of all samples concatenated,
      * ``offsets`` - start position of every sample inside ``texts``.
    """
    encoded = [
        (LABEL_PIPELINE(label), torch.tensor(TEXT_PIPELINE(text), dtype=torch.int64))
        for label, text in batch
    ]
    label_ids = [label_id for label_id, _ in encoded]
    token_ids = [ids for _, ids in encoded]
    # Sample i starts where sample i-1 ended: a leading 0 followed by the lengths
    # of all samples but the last, turned into a running sum below.
    lengths_shifted = [0] + [ids.size(0) for ids in token_ids[:-1]]

    labels = torch.tensor(label_ids, dtype=torch.int64)
    offsets = torch.tensor(lengths_shifted).cumsum(dim=0)
    texts = torch.cat(token_ids)
    return labels.to(DEVICE), texts.to(DEVICE), offsets.to(DEVICE)

In [13]:
from torch.utils.data import DataLoader

# hyperparameters
EPOCHS = 25
LEARNING_RATE = 0.5
BATCH_SIZE = 64

# dataloaders: both turn lists of (label, text) pairs into batches via collate_batch
# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
# Evaluation only aggregates accuracy over the whole split, so shuffling adds
# nothing; a fixed order keeps every evaluation pass repeatable.
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

## Text Classification Model
A feed-forward neural network

In [14]:
from torch import nn
import torch.nn.functional as F

class FeedForwardNN(nn.Module):
    """Text classifier: an ``EmbeddingBag`` followed by four linear layers.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: width of each embedding vector.
        num_class: number of output scores per sample.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc1 = nn.Linear(embed_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw all weights uniformly from [-0.68, 0.68) and zero the linear biases."""
        initrange = 0.68
        self.embedding.weight.data.uniform_(-initrange, initrange)
        # Same order as the attribute definitions: fc1 -> fc4.
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            layer.weight.data.uniform_(-initrange, initrange)
            layer.bias.data.zero_()

    def forward(self, text, offsets):
        """Return one row of ``num_class`` raw scores per bag described by ``offsets``."""
        hidden = self.embedding(text, offsets)
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        # No activation on the last layer: the outputs are raw scores.
        return self.fc4(hidden)

In [15]:
# Model dimensions: one output per distinct training label, one embedding row
# per vocabulary entry.
NUM_CLASSES = len({label for label, _ in train_data})
VOCAB_SIZE = len(VOCAB)
EMBED_SIZE = 128

# initialize the model
model = FeedForwardNN(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_SIZE,
    num_class=NUM_CLASSES,
).to(DEVICE)

In [16]:
# Loss function, optimizer and learning-rate scheduler.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=LEARNING_RATE)
# Every scheduler.step() call multiplies the learning rate by gamma.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1.0, gamma=0.1)

## Train and Evaluate the Model

In [17]:
import time

def train(dataloader):
    """Run one training pass over ``dataloader`` and print running accuracy.

    Relies on the notebook-level ``model``, ``optimizer`` and ``criterion``, and
    reads the global loop variable ``epoch`` for the progress line, so it must be
    called from inside the epoch loop.

    Args:
        dataloader: iterable yielding ``(label, text, offsets)`` batches.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 100

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        # Clip the total gradient norm to 0.1 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            # Reset so each progress line covers only the batches since the last one.
            total_acc, total_count = 0, 0

In [18]:
def evaluate(dataloader):
    """Return the accuracy of the notebook-level ``model`` on ``dataloader``.

    Args:
        dataloader: iterable yielding ``(label, text, offsets)`` batches.

    Returns:
        Fraction of samples whose highest-scoring class equals the label.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            # Only the predictions are needed here; no loss is computed.
            predicted_label = model(text, offsets)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [19]:
# Reference accuracy that later epochs are compared against; None until the
# first epoch has been evaluated.
total_accu = None

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_loader)
    accu_val = evaluate(test_loader)
    accuracy_dropped = total_accu is not None and total_accu > accu_val
    if not accuracy_dropped:
        total_accu = accu_val
    else:
        # Accuracy fell below the reference value: step the LR scheduler.
        scheduler.step()
    separator = '-' * 59
    print(separator)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'test accuracy {:8.3f} '.format(epoch, time.time() - epoch_start_time, accu_val))
    print(separator)

| epoch   1 |   100/  388 batches | accuracy    0.359
| epoch   1 |   200/  388 batches | accuracy    0.370
| epoch   1 |   300/  388 batches | accuracy    0.391
-----------------------------------------------------------
| end of epoch   1 | time:  2.26s | test accuracy    0.424 
-----------------------------------------------------------
| epoch   2 |   100/  388 batches | accuracy    0.414
| epoch   2 |   200/  388 batches | accuracy    0.428
| epoch   2 |   300/  388 batches | accuracy    0.426
-----------------------------------------------------------
| end of epoch   2 | time:  3.86s | test accuracy    0.454 
-----------------------------------------------------------
| epoch   3 |   100/  388 batches | accuracy    0.438
| epoch   3 |   200/  388 batches | accuracy    0.444
| epoch   3 |   300/  388 batches | accuracy    0.465
-----------------------------------------------------------
| end of epoch   3 | time:  3.96s | test accuracy    0.471 
----------------------------------

## Test the Model on Input Text

In [20]:
# Human-readable names for the class indices returned by predict().
sentiment_label = {2: "Positive", 0: "Negative", 1: "Neutral"}

def predict(text, text_pipeline):
    """Return the predicted class index for a single ``text``.

    Uses the notebook-level ``model``.

    Args:
        text: raw input string.
        text_pipeline: callable mapping a string to a list of token ids.

    Returns:
        The index (``int``) of the highest-scoring class.
    """
    # Build the inputs on the model's own device; CPU tensors fed to a model
    # that was moved to CUDA raise a device-mismatch error.
    device = next(model.parameters()).device
    with torch.no_grad():
        # Explicit int64 so an empty token list does not become a float tensor.
        token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64, device=device)
        # A single sample: its tokens start at position 0.
        offsets = torch.tensor([0], dtype=torch.int64, device=device)
        output = model(token_ids, offsets)
        return output.argmax(1).item()

In [21]:
# Classify one raw sentence end to end. The commented-out lines are alternative
# inputs; uncomment one (and comment out the active line) to try it.
# inp_text = "Soooooo wish I could, but im in school and myspace is completely blocked"
# inp_text = "The product is not good"
inp_text = "It's super fun"

# predict() returns a class index; sentiment_label turns it into a readable name.
print(f"This is a {sentiment_label[predict(inp_text, TEXT_PIPELINE)]} tweet")

This is a Positive tweet


## References

In [22]:
# https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html