## Clone the GitHub Repository

In [None]:
!git clone https://github.com/mehedihasanbijoy/PyTorch-BanglaNLP-Tutorial.git

Cloning into 'PyTorch-BanglaNLP-Tutorial'...
remote: Enumerating objects: 188, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 188 (delta 5), reused 0 (delta 0), pack-reused 176[K
Receiving objects: 100% (188/188), 3.02 MiB | 16.38 MiB/s, done.
Resolving deltas: 100% (94/94), done.


## Load the dataset

In [None]:
import pandas as pd 

# Read the BanglaEmotion corpus from the cloned repository.
raw_df = pd.read_csv('/content/PyTorch-BanglaNLP-Tutorial/0A. Corpus/BanglaEmotion/BanglaEmotion.csv')

# Keep only the text and label columns, drop rows with a missing value and
# renumber the rows. Every step returns a new frame: calling dropna/reset_index
# with inplace=True on a column selection is the pattern that triggers pandas'
# SettingWithCopyWarning.
df = raw_df[['cleaned_text', 'label']].dropna().reset_index(drop=True)

# Show five random rows as a sanity check.
df.sample(5)

Unnamed: 0,cleaned_text,label
2939,যাই হোক আগে আপনার পক্ষেই ছিলাম যেখানে আপনি অপর...,3
3917,বাংলাদেশ সকল বাঙ্গালির দেশ ধন্যবাদ,3
3692,নাহ। আপনার মধ্যে আমি ইসলামের প্রতি শুদ্ধশীলতা ...,4
4579,ভাল করেছে,3
4587,সব সম্ভবের দেশ বাংলাদেশ,3


## Split the dataset into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. Stratifying on the labels and fixing
# the random state are both passed straight to train_test_split.
all_texts = df['cleaned_text'].tolist()
all_labels = df['label'].tolist()

X_train, X_test, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    stratify=all_labels,
    random_state=64,
)

In [None]:
def find_len(X):
    """Return the number of whitespace-separated tokens in the string ``X``."""
    tokens = X.split()
    return len(tokens)


def sort_by_length(X, y):
    """Reorder texts and their labels from the shortest text to the longest.

    Args:
        X: sequence of text strings.
        y: sequence of labels, aligned with ``X``.

    Returns:
        A tuple ``(texts, labels)`` of two lists, ordered by ascending token
        count of the text (as measured by ``find_len``).
    """
    frame = pd.DataFrame({'X': X, 'y': y})
    token_counts = frame['X'].apply(find_len)
    frame['len'] = token_counts
    ordered = frame.sort_values(by='len', ascending=True)
    sorted_texts = list(ordered['X'])
    sorted_labels = list(ordered['y'])
    return sorted_texts, sorted_labels

In [None]:
# Order each split from its shortest text to its longest (train first, then test).
(X_train, y_train), (X_test, y_test) = (
    sort_by_length(X_train, y_train),
    sort_by_length(X_test, y_test),
)

In [None]:
from collections import Counter

# Report the number of samples and the label distribution of each split
train_report = f'Train data instances: {len(X_train)}\nClass distribution: {Counter(y_train)}'
test_report = f'\nTest data instances: {len(X_test)}\nClass distribution: {Counter(y_test)}'
print(train_report, test_report, sep='\n')

Train data instances: 4511
Class distribution: Counter({3: 1440, 0: 960, 4: 959, 1: 480, 5: 384, 2: 288})

Test data instances: 1128
Class distribution: Counter({3: 360, 4: 240, 0: 240, 1: 120, 5: 96, 2: 72})


In [None]:
import random

# Pair every label with its text as (label, text) tuples; the results are
# plain lists, so they can be traversed any number of times.
train_data = [(label, text) for label, text in zip(y_train, X_train)]
test_data = [(label, text) for label, text in zip(y_test, X_test)]

# Display five training samples picked at random (random.choices draws with replacement)
random.choices(train_data, k=5)

[(4,
  'অনেক দিন পর আপনার লেখা ভালো ভাবে নিতে পারিনি। একজন ভালো মানুষের জন্য লিখলে ভালো হত।'),
 (1, 'তোমাকে কত বার তারা ব্যাবহার করেছে তা আমরা ঠিক ই বুজে গেছি,'),
 (3,
  'ইমরান এইচ সরকার এর ঊন্নতি হইছে দেখতাছি।যাক দেরিতে হলেও বোধোদয় হইছে ।সরকারের দালালি থেকে সরে এসে হক কথা বলার জন্য থেংকু।'),
 (4, 'নিশেদাজ্ঞা তো ভাল ছিল জাত্রি হয়রানি আবার বারবে,'),
 (5,
  'আপনার মতে কি দেশের সব ম্যাজিস্ট্রেট সাধু? খবর নিয়ে দেখেন তারা টাকাকড়ি খায় কিনা')]

## DataLoader

In [None]:
import torch

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)

In [None]:
# from torchtext.data.utils import get_tokenizer
# tokenizer = get_tokenizer('basic_english')

def tokenizer(x):
    """Lower-case the string ``x`` and split it on whitespace into a list of tokens."""
    lowered = x.lower()
    return lowered.split()


def yield_tokens(data_iterator):
    """Yield the token list of each ``(label, text)`` pair in ``data_iterator``.

    The label is ignored; only the text is tokenised.
    """
    yield from (tokenizer(text) for _label, text in data_iterator)

In [None]:
from torchtext.vocab import build_vocab_from_iterator

# Build the vocabulary from the training texts only. An explicit unknown-word
# token is registered as a special and used as the default index, so a word
# that never occurred in the training data is looked up as that token.
UNK_TOKEN = '<unk>'
VOCAB = build_vocab_from_iterator(yield_tokens(train_data), specials=[UNK_TOKEN])
VOCAB.set_default_index(VOCAB[UNK_TOKEN])

In [None]:
# create pipelines
def TEXT_PIPELINE(x):
    """Tokenise the raw string ``x`` and look up the vocabulary index of every token."""
    return VOCAB(tokenizer(x))


def LABEL_PIPELINE(x):
    """Convert the label ``x`` to an ``int``."""
    return int(x)


# pipelines in action: one sample sentence and one sample label
print(TEXT_PIPELINE('একজন ভালো মানুষের জন্য লিখলে ভালো হত।'))
print(LABEL_PIPELINE('2'))

[92, 67, 94, 9, 3604, 67, 1513]
2


In [None]:
# batch collate function
def collate_batch(batch):
    """Turn a list of ``(label, text)`` samples into tensors on ``DEVICE``.

    All token ids of the batch are concatenated into one 1-D tensor; the
    offsets tensor records where each sample starts inside it. This is the
    (text, offsets) pair the model's ``EmbeddingBag`` layer is called with.

    Returns:
        ``(labels, texts, offsets)``: int64 label tensor, concatenated int64
        token-id tensor and the start offset of every sample.
    """
    # Encode sample by sample so the label and text pipelines run in the
    # same order as the samples appear in the batch.
    encoded = [
        (LABEL_PIPELINE(label), torch.tensor(TEXT_PIPELINE(text), dtype=torch.int64))
        for label, text in batch
    ]
    label_ids = [label_id for label_id, _ in encoded]
    token_tensors = [token_ids for _, token_ids in encoded]

    # Start offsets: a leading 0 followed by the running total of the sample
    # lengths; the last length is dropped because nothing starts after it.
    bag_sizes = [0, *(token_ids.size(0) for token_ids in token_tensors)]

    labels = torch.tensor(label_ids, dtype=torch.int64)
    offsets = torch.tensor(bag_sizes[:-1]).cumsum(dim=0)
    texts = torch.cat(token_tensors)
    return labels.to(DEVICE), texts.to(DEVICE), offsets.to(DEVICE)

In [None]:
from torch.utils.data import DataLoader

# hyperparameters
EPOCHS = 25
LEARNING_RATE = 0.5
BATCH_SIZE = 64

# dataloaders: both wrap the in-memory lists of (label, text) tuples and use
# collate_batch to turn every batch into tensors.
# Only the training batches are reshuffled each epoch. The test loader keeps a
# fixed order: shuffling adds nothing to evaluation and only makes runs harder
# to compare.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

## Text Classification Model
A feed-forward neural network

In [None]:
from torch import nn
import torch.nn.functional as F

class FeedForwardNN(nn.Module):
    """Text classifier: an ``EmbeddingBag`` followed by four linear layers.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector.
        num_class: number of output scores produced per sample.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        # Fully connected stack: embed_dim -> 64 -> 32 -> 16 -> num_class
        self.fc1 = nn.Linear(embed_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw all weights uniformly from [-0.68, 0.68] and zero the linear biases."""
        initrange = 0.68
        self.embedding.weight.data.uniform_(-initrange, initrange)
        # Same order as the layers are applied, so the random draws happen
        # in a fixed sequence.
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            layer.weight.data.uniform_(-initrange, initrange)
            layer.bias.data.zero_()

    def forward(self, text, offsets):
        """Return one row of ``num_class`` raw scores per bag described by ``offsets``.

        ``text`` holds the token ids of all samples back to back and
        ``offsets`` the start position of each sample inside ``text``.
        """
        hidden = self.embedding(text, offsets)
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        # No activation on the last layer: raw scores are returned.
        return self.fc4(hidden)

In [None]:
# Model dimensions: one output per distinct training label and one embedding
# row per vocabulary entry.
NUM_CLASSES = len({label for label, _ in train_data})
VOCAB_SIZE = len(VOCAB)
EMBED_SIZE = 128

# initialize the model and move it to the selected device
model = FeedForwardNN(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_SIZE,
    num_class=NUM_CLASSES,
).to(DEVICE)

In [None]:
# Cross-entropy loss, plain SGD, and a StepLR scheduler that multiplies the
# learning rate by 0.1 each time scheduler.step() is called.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1.0, gamma=0.1)

## Train and Evaluate the Model

In [None]:
import time

def train(dataloader, epoch=None, log_interval=100):
    """Run one training pass of the module-level ``model`` over ``dataloader``.

    Relies on the module-level ``model``, ``criterion`` and ``optimizer``.

    Args:
        dataloader: iterable yielding ``(label, text, offsets)`` batches.
        epoch: epoch number shown in the progress line. When omitted, the
            module-level ``epoch`` variable set by the training loop is used
            (0 if it does not exist), so existing ``train(loader)`` calls keep
            working.
        log_interval: print the running accuracy every this many batches
            (must be a positive integer). The running counters are reset after
            each report.
    """
    if epoch is None:
        # Backward compatibility: earlier versions read the loop variable of
        # the training cell directly from the module namespace.
        epoch = globals().get('epoch', 0)

    model.train()
    total_acc, total_count = 0, 0

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        # Cap the total gradient norm at 0.1 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0

In [None]:
def evaluate(dataloader):
    """Return the accuracy of the module-level ``model`` on ``dataloader``.

    The model is switched to evaluation mode and gradients are disabled
    while the batches are scored.

    Args:
        dataloader: iterable yielding ``(label, text, offsets)`` batches.

    Returns:
        Fraction of samples whose highest-scoring class equals the label,
        or ``0.0`` when the dataloader yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            predicted_label = model(text, offsets)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)

    # Nothing to score: report 0.0 instead of dividing by zero.
    if total_count == 0:
        return 0.0
    return total_acc/total_count

In [None]:
# Best test accuracy seen so far (None until the first epoch has finished)
total_accu = None
rule = '-' * 59

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_loader)
    accu_val = evaluate(test_loader)

    if total_accu is None or total_accu <= accu_val:
        # Accuracy did not drop: remember it as the new reference value.
        total_accu = accu_val
    else:
        # Accuracy fell below the reference value: advance the scheduler,
        # which lowers the learning rate.
        scheduler.step()

    epoch_summary = '| end of epoch {:3d} | time: {:5.2f}s | test accuracy {:8.3f} '.format(
        epoch, time.time() - epoch_start_time, accu_val)
    print(rule, epoch_summary, rule, sep='\n')

-----------------------------------------------------------
| end of epoch   1 | time:  0.62s | test accuracy    0.275 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time:  0.46s | test accuracy    0.298 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time:  0.40s | test accuracy    0.317 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time:  0.41s | test accuracy    0.313 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time:  0.40s | test accuracy    0.316 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   6 | time:  0.45s | test

## Test the Model on Input Text

In [None]:
# Class index -> emotion name
sentiment_label = {0: "Angry", 1: "Disgust", 2: "Fear", 3: "Happy", 4: "Sad", 5: "Surprise"}

def predict(text, text_pipeline):
    """Return the class index the module-level ``model`` predicts for ``text``.

    Args:
        text: raw input string.
        text_pipeline: callable mapping the string to a list of token ids.

    Returns:
        The index of the highest-scoring class as a Python ``int``.
    """
    # Build the inputs on the device the model lives on; tensors left on the
    # CPU make the forward pass fail when the model was moved to CUDA.
    device = next(model.parameters()).device
    with torch.no_grad():
        # An explicit int64 dtype keeps an empty token list usable as indices
        # (torch.tensor([]) would otherwise default to a float tensor).
        token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64, device=device)
        # A single sample: its tokens start at position 0.
        offsets = torch.tensor([0], dtype=torch.int64, device=device)
        output = model(token_ids, offsets)
        return output.argmax(1).item()

In [None]:
# Classify one sample sentence and print the name of the predicted class
inp_text = "অত্যন্ত মর্মাহত হলাম। "
predicted_class = predict(inp_text, TEXT_PIPELINE)
predicted_name = sentiment_label[predicted_class]

print(f"This is a {predicted_name} tweet")

This is a Angry tweet


## References

In [None]:
# https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html