In [1]:
# place necessary imports here
import random
import pandas as pd
import wandb
import torch
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from pymagnitude import Magnitude
import os
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from tqdm import tqdm
import numpy as np
from nltk import word_tokenize
from sentence_transformers import SentenceTransformer
from datasets import list_datasets, load_dataset

# Evaluation of Language Models : Hands-on
In this hands-on you will learn to perform sentence classification/sentiment analysis on a dataset using different Language models and compare their performance.  

# Dataset description
The datasets we will be using are multilingual, or code-mixed, datasets: each one mixes English with a Dravidian language. That is, the sample sentences are written either entirely in a Dravidian-language script or in the English alphabet, and they may contain English words. We will be using 5 datasets: 2 datasets in Tamil-English, 2 datasets in Kannada-English and 1 dataset in Malayalam-English. The datasets are class-imbalanced, so we will compare the performance of the language models using F1 scores. The samples are mostly taken from YouTube comment sections and hence correspond to real-world conversations. More information about the datasets can be found in the sources listed below. <br>
Sources: <br>
<ol>
<li>https://huggingface.co/datasets/tamilmixsentiment
<li>https://huggingface.co/datasets/offenseval_dravidian
<li>https://huggingface.co/datasets/kan_hope
</ol>

In [2]:
def get_data(tag):
    """Load one dataset and return its train and evaluation splits as DataFrames.

    Parameters
    ----------
    tag : tuple
        ``(dataset_name, config_name, language_code)``. The first two entries
        are forwarded to ``load_dataset``; the third is only consulted for the
        label remapping below.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df)``, each with a ``text`` and a ``label`` column.
    """
    dataset = load_dataset(tag[0], tag[1])

    # kan_hope is evaluated on its 'test' split; every other dataset on 'validation'.
    eval_split = 'test' if tag[0] == 'kan_hope' else 'validation'

    train_df = pd.DataFrame()
    train_df['text'] = dataset['train']['text']
    train_df['label'] = dataset['train']['label']

    val_df = pd.DataFrame()
    val_df['text'] = dataset[eval_split]['text']
    val_df['label'] = dataset[eval_split]['label']

    if tag[0] == 'offenseval_dravidian' and tag[2] == 'ml':
        # Label 5 is folded into label 4 for the Malayalam configuration
        # (presumably to keep the label ids contiguous -- TODO confirm).
        # The result is assigned back: calling replace(..., inplace=True) on a
        # column selection acts on a temporary under pandas Copy-on-Write and
        # would leave the frame unchanged.
        train_df['label'] = train_df['label'].replace(5, 4)
        val_df['label'] = val_df['label'].replace(5, 4)

    return train_df, val_df

In [3]:
# One entry per dataset: (dataset name, dataset configuration or None, language code).
tags = [
    ('tamilmixsentiment', None, 'ta'),
    ('offenseval_dravidian', 'tamil', 'ta'),
    ('offenseval_dravidian', 'malayalam', 'ml'),
    ('offenseval_dravidian', 'kannada', 'kn'),
    ('kan_hope', None, 'kn'),
]

# Experiment name -> tag tuple, in the same order as `tags`.
# Datasets without a configuration are keyed by their bare name; the others
# get the language code appended so the three offenseval variants stay distinct.
tag_dict = {
    (entry[0] if entry[1] is None else entry[0] + '_' + entry[2]): entry
    for entry in tags
}

### Samples from datasets

In [15]:
# ANSI escape sequences that switch bold text on and off in the cell output.
BOLD, END = '\033[1m', '\033[0m'

# Show the first rows of every training split, followed by a bold caption
# naming the dataset and its language code.
for tag in tags:
    train_df, val_df = get_data(tag)
    print(train_df.head())
    print(f'{BOLD}Dataset: {tag[0]}, Language: {tag[2]}{END}')


Using custom data configuration default
Reusing dataset tamilmixsentiment (/users/PAS0536/deepaksuresh94/.cache/huggingface/datasets/tamilmixsentiment/default/0.0.0/887420eecaf868ac6c10990649e49d10467e4cd4dffb98a6f20e4fe7c58df390)


  0%|          | 0/3 [00:00<?, ?it/s]

                                               text  label
0          Trailer late ah parthavanga like podunga      0
1        Move pathutu vanthu trailer pakurvnga yaru      0
2          Puthupetai dhanush  ah yarellam pathinga      0
3  Dhanush oda character ,puthu sa erukay , mass ta      0
4  vera level ippa pesungada mokka nu thalaivaaaaaa      0
[1mDataset: tamilmixsentiment, Language: ta[0m


Reusing dataset offenseval_dravidian (/users/PAS0536/deepaksuresh94/.cache/huggingface/datasets/offenseval_dravidian/tamil/1.0.0/caf62757ff7f5922e043f21abf68745096b24007c4b79d5b2344ea3a7238563f)


  0%|          | 0/2 [00:00<?, ?it/s]

                                                text  label
0                  movie vara level la Erika poguthu      0
1  I love Ajith Kumar Vivegam movie inki mjy bht ...      5
2          Padam nalla comedy padama irukum polaye..      0
3  karthick subburaj anne .... intha padam vetri ...      0
4  கவுண்டர் தேவர்.சார்பாக வெற்றி பெற வாழ்த்துக்கள் 🦁      0
[1mDataset: offenseval_dravidian, Language: ta[0m


Reusing dataset offenseval_dravidian (/users/PAS0536/deepaksuresh94/.cache/huggingface/datasets/offenseval_dravidian/malayalam/1.0.0/caf62757ff7f5922e043f21abf68745096b24007c4b79d5b2344ea3a7238563f)


  0%|          | 0/2 [00:00<?, ?it/s]

                                                text  label
0  പലദേശം. പല ഭാഷ ഒരേ ഒരു രാജാവ്  അല്ലാതെ  സ്വന്ത...      0
1  ഈ ഓണം ഏട്ടനും പിള്ളേർക്ക് ഉള്ളതാണ് എന്ന് ഉള്ളവ...      0
2  ആരണ്ട ആരണ്ട തലുണ്ടാകാണാ ആരണ്ട ഞാൻ ആണ്ട ഞാൻ ആണ്...      0
3          Sushin syam  Shaiju khalid  Midhun manual      0
4                          J A K E S.   B EJ O Y !!!      0
[1mDataset: offenseval_dravidian, Language: ml[0m


Reusing dataset offenseval_dravidian (/users/PAS0536/deepaksuresh94/.cache/huggingface/datasets/offenseval_dravidian/kannada/1.0.0/caf62757ff7f5922e043f21abf68745096b24007c4b79d5b2344ea3a7238563f)


  0%|          | 0/2 [00:00<?, ?it/s]

                                                text  label
0  Tik tok alli jagala madtidralla adra baggenu o...      0
1                            Anyone from kerala here      5
2                          Movie rerelease madi plss      0
3  Amazon prime alli bittidira....yella manele no...      0
4  Guru sure news nanu tik tok dawn lod madeda ya...      0
[1mDataset: offenseval_dravidian, Language: kn[0m


Using custom data configuration default
Reusing dataset kan_hope (/users/PAS0536/deepaksuresh94/.cache/huggingface/datasets/kan_hope/default/0.0.0/3ded8b2dea549473aa58db03694800c10b5b5e29c3206385f5044dcb6338ebc3)


  0%|          | 0/2 [00:00<?, ?it/s]

                                                text  label
0                         Valle story iratte maathra      0
1                             @10 R report madi avna      0
2              ಕಿಚ್ಚನ ಹುಡುಗ್ರು ವತಿಯಿಂದ  all the best      0
3  Diya thumba chennagide ondu olle prayathna mov...      1
4                         ಇದು ಚರಿತ್ರೆ ಸೃಷ್ಟಿಸೋ ಅವತಾರ      1
[1mDataset: kan_hope, Language: kn[0m


# Feed-Forward Network - Fine Tuning 
Training these language models from scratch on the given datasets takes a considerable amount of time. To save time, one can instead fine-tune a pre-trained language model on new, unseen data and unseen labels. One method of fine-tuning uses a feed-forward network that takes the embedding as its input and outputs the target labels. The feed-forward network is essentially a multi-layer perceptron (MLP). We will implement this using PyTorch.

More information about the concept and merits of Fine Tuning can be found here: https://www.analyticsvidhya.com/blog/2020/07/transfer-learning-for-nlp-fine-tuning-bert-for-text-classification/

In [16]:
class LinearNetwork_2Layer(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    Computes ``fc2(dropout(relu(fc1(t))))`` and returns the raw class scores.

    Args:
        input_size: number of input features.
        output_size: number of output scores.
        hn: width of the hidden layer.
        p: dropout probability applied after the hidden activation.
    """

    def __init__(self, input_size, output_size,hn,p):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hn)
        self.fc2 = nn.Linear(hn, output_size)
        self.dropout = nn.Dropout(p)

    def forward(self, t):
        """Map a batch of feature vectors to unnormalised class scores."""
        hidden = self.dropout(F.relu(self.fc1(t)))
        # No softmax here: the raw scores are returned, and the cross-entropy
        # loss used for training is applied to them directly.
        return self.fc2(hidden)

class LinearNetwork_4Layer(nn.Module):
    """Feed-forward classifier with three hidden layers.

    Every hidden layer is followed by ReLU and the shared dropout module; the
    final layer returns raw class scores.

    Args:
        input_size: number of input features.
        output_size: number of output scores.
        hn: sequence of exactly three hidden-layer widths.
        p: dropout probability applied after each hidden activation.
    """

    def __init__(self, input_size, output_size,hn,p):
        super().__init__()
        first_width, second_width, third_width = hn
        self.fc1 = nn.Linear(input_size, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, third_width)
        self.fc4 = nn.Linear(third_width, output_size)
        self.dropout = nn.Dropout(p)

    def forward(self, t):
        """Map a batch of feature vectors to unnormalised class scores."""
        # The three hidden stages share one shape: linear -> ReLU -> dropout.
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            t = self.dropout(F.relu(hidden_layer(t)))
        # No softmax here: the raw scores are returned, and the cross-entropy
        # loss used for training is applied to them directly.
        return self.fc4(t)

# Language models: Data embedding and Data loading
The Language models that we will evaluate are the following:
<ol>
<li>TF-IDF Vectorizer
<li>Count Vectorizer
<li>BERT - https://arxiv.org/pdf/1810.04805.pdf
<li>Word2vec - https://arxiv.org/pdf/1301.3781.pdf
<li>GloVe - https://nlp.stanford.edu/pubs/glove.pdf
<li>Fasttext - https://arxiv.org/pdf/1607.04606.pdf
</ol>
More information about the models can be found in the referenced papers given next to each of them.

The function get_embedding() outputs the embedding of the sentences in our datasets using the pretrained models. We will make use of pre-trained models followed by a fine-tuning layer. The pymagnitude library is used to load the pretrained weights and get the embeddings for Word2vec, GloVe and Fasttext. Because computing the embeddings takes a considerable amount of time and the result stays the same for a given language model, we have saved the embeddings for the train and test data of each dataset; they can be loaded directly using load_embedding(). The get_embedding() provided here serves as a reference.  

CodemixDataset is a custom Dataset class implementing 3 functions: init, len and getitem. The init function, which runs when the Dataset object is instantiated, calls get_embedding() or load_embedding() to prepare the embeddings for the dataset we will be using. The len function returns the number of samples in our dataset. The getitem function loads and returns the sample at the given index idx. CodemixDataset thus retrieves our dataset's features and labels one sample at a time. We will pass the CodemixDataset to a DataLoader to iterate through our dataset. 


In [None]:
def get_embedding(text, model, model_type):
    """Embed a list of sentences with the given encoder.

    Parameters
    ----------
    text : list of str
        Sentences to embed.
    model : object
        Encoder matching ``model_type``: a fitted vectorizer exposing
        ``transform`` for 'TfidfVectorizer'/'CountVectorizer', an object
        exposing ``encode`` for 'BERT', or an object exposing ``query`` for
        'magnitude'.
    model_type : str
        One of 'TfidfVectorizer', 'CountVectorizer', 'BERT', 'magnitude'.

    Returns
    -------
    The dense array from ``transform(...).toarray()``, the result of
    ``model.encode``, or (for 'magnitude') a list with one averaged word
    vector per sentence.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported values. Previously this
        case fell through and returned ``None``, which only failed later and
        far from the cause.
    """
    if model_type in ['TfidfVectorizer', 'CountVectorizer']:
        return model.transform(text).toarray()
    if model_type == 'BERT':
        print("BERT model embedding...")
        return model.encode(text, batch_size=8, show_progress_bar=True)
    if model_type == 'magnitude':
        # A sentence embedding is the unweighted mean of its token vectors.
        # NOTE(review): behaviour for a sentence with no tokens depends on
        # model.query([]) -- confirm against the data.
        return [
            np.average(model.query(word_tokenize(sentence)), axis=0)
            for sentence in tqdm(text)
        ]
    raise ValueError(
        "Unsupported model_type {!r}; expected 'TfidfVectorizer', "
        "'CountVectorizer', 'BERT' or 'magnitude'".format(model_type)
    )

def load_embedding(dataset_name, model_name, mode):
    """Load a precomputed embedding from the ``embeddings/`` directory.

    The file is ``embeddings/<dataset_name>/<model_name>/<mode>Embedding_<dataset_name>_<suffix>.pt``
    and is read with ``torch.load``.

    Parameters
    ----------
    dataset_name : str
        Dataset folder name; also part of the file name.
    model_name : str
        Model folder name.
    mode : str
        File-name prefix selecting the split.

    Returns
    -------
    Whatever object ``torch.load`` deserialises from that file.
    """
    # TF-IDF, count and BERT files end in the model name; every other model
    # shares the 'magnitude' suffix.
    if model_name in ('TfidfVectorizer', 'CountVectorizer', 'BERT'):
        suffix = model_name
    else:
        suffix = 'magnitude'
    basename = mode + 'Embedding_' + dataset_name + '_' + suffix + '.pt'
    filename = '/'.join(['embeddings', dataset_name, model_name, basename])
    return torch.load(filename)
    
class CodemixDataset(Dataset):
    """Dataset pairing each row's label with its precomputed sentence embedding.

    Args:
        df: DataFrame with a 'text' and a 'label' column.
        encoder_model: encoder handed to get_embedding(); only used when
            ``load`` is false.
        encoding_type: model type handed to get_embedding(); only used when
            ``load`` is false.
        dataset_name, model_name, mode: handed to load_embedding(); only used
            when ``load`` is true.
        load: read the embedding from disk (default) instead of computing it.
    """

    def __init__(self, df, encoder_model, encoding_type,dataset_name,model_name,mode,load=True):
        self.df = df
        print("Creating Embedding... Will take some time...")
        if not load:
            sentences = list(self.df['text'])
            self.embedding = get_embedding(sentences, encoder_model, encoding_type)
        else:
            self.embedding = load_embedding(dataset_name,model_name,mode)

    def __len__(self):
        # The length comes from the DataFrame, not from the embedding container.
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{'text': embedding, 'label': label}`` for position ``idx``."""
        # Tensor indices are converted to plain Python values first.
        position = idx.tolist() if torch.is_tensor(idx) else idx
        vector = self.embedding[position]
        target = self.df.iloc[position]['label']
        return {'text': vector, 'label': target}


# Training
Training here primarily means training the fine-tuning layer, which is the MLP implemented using PyTorch. The final run consists of 
<ol>
<li>Loading the dataset 
<li>Getting the embeddings
<li>Train the MLP using Train data
<li>Predict the labels of Test data
<li>Measure F1 score on Test data
</ol>

In [None]:
def _run_epoch(model, batches, criterion, device, optimizer=None):
    """Run one pass over ``batches``; update the weights only if ``optimizer`` is given.

    Returns ``(mean_loss, true_labels, predicted_labels)`` where ``mean_loss``
    is the per-sample average and both label tensors live on the CPU.
    """
    training = optimizer is not None
    # Dropout must be active while training and disabled while evaluating.
    model.train(training)

    predicted, expected = [], []
    loss_sum, sample_count = 0.0, 0
    with torch.set_grad_enabled(training):
        for data in batches:
            inputs = data['text'].to(device)
            labels = data['label'].to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(inputs.type(torch.float))
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            # criterion returns the batch mean; weight it by the batch length
            # so that a short final batch does not skew the average.
            batch_len = labels.size(0)
            loss_sum += loss.item() * batch_len
            sample_count += batch_len
            predicted.append(torch.argmax(outputs, dim=1).detach().cpu())
            expected.append(labels.detach().cpu())

    return loss_sum / sample_count, torch.cat(expected), torch.cat(predicted)


def train(model_architecture, encoder_model, encoding_type, train_df, test_df, epochs, \
          learning_rate, batch_size, log, device, hiddenNodes, dropout, train_loader, test_loader):
    """Train a feed-forward classifier and evaluate it after every epoch.

    Parameters
    ----------
    model_architecture : callable
        Called as ``model_architecture(input_size, n_classes, hiddenNodes, dropout)``
        and expected to return an ``nn.Module``.
    encoder_model, encoding_type
        Encoder and its type. When ``encoder_model`` is not None it is used to
        embed the first training sentence and read the input width from it;
        when it is None the width is read from the first sample of
        ``train_loader.dataset`` instead.
    train_df : pandas.DataFrame
        Training frame; its 'label' column determines the number of classes.
    test_df
        Not used by this function; kept for interface compatibility.
    epochs : int
        Number of passes over ``train_loader``.
    learning_rate : float
        Learning rate of the Adam optimiser.
    batch_size : int
        Not used by this function; kept for interface compatibility. Losses
        are averaged over the samples actually seen.
    log : callable or None
        If given, called once per epoch with a dict of metrics.
    device : torch.device
        Device the model and the batches are moved to.
    hiddenNodes, dropout
        Forwarded to ``model_architecture``.
    train_loader, test_loader
        Iterables of batches; each batch is a dict with 'text' and 'label'.

    Returns
    -------
    None
    """
    if encoder_model is not None:
        probe = get_embedding([train_df.iloc[0]['text']], encoder_model, encoding_type)[0]
        input_size = probe.shape[0]
    else:
        # No encoder available (embeddings were loaded from disk): take the
        # width from an embedding that is already in the dataset.
        input_size = len(train_loader.dataset[0]['text'])
    model = model_architecture(input_size, len(train_df['label'].unique()),hiddenNodes,dropout).to(device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss()

    print("model architecture:")
    print(model)

    print("train_df classes")
    print(train_df['label'].value_counts())

    for epoch in range(epochs):
        net_train_loss, true_labels, output_labels = _run_epoch(
            model, tqdm(train_loader), criterion, device, optimizer)
        tr_accuracy, tr_f1, tr_recall, tr_precision = get_scores(true_labels, output_labels)
        print('Epoch: {}\t Train Loss: {:.4f} \t Train F1:{:.2f}'.format(epoch, net_train_loss, tr_f1), end='\t')

        # Evaluation pass: no optimiser, so dropout is off and no gradients are kept.
        net_test_loss, true_labels, output_labels = _run_epoch(
            model, test_loader, criterion, device)
        ts_accuracy, ts_f1, ts_recall, ts_precision = get_scores(true_labels, output_labels)
        print('Test Loss: {:.4f} \t Test F1:{:.2f}'.format(net_test_loss, ts_f1))

        if log is not None:
            log({
                    "train accuracy": tr_accuracy,
                    "train f1": tr_f1,
                    "train recall": tr_recall,
                    "train precision": tr_precision,

                    # Key spelling kept as-is so the logged metric name does not change.
                    "test accuarcy": ts_accuracy,
                    "test f1": ts_f1,
                    "test recall": ts_recall,
                    "test precision": ts_precision,

                    "train loss": net_train_loss,
                    "test loss": net_test_loss
                    })


In [None]:
def _labels_to_numpy(values):
    """Return ``values`` as a host NumPy array, detaching tensors from any device."""
    if torch.is_tensor(values):
        return values.detach().cpu().numpy()
    return np.asarray(values)


def get_scores(y_true, y_pred):
    """Compute accuracy and weighted F1, recall and precision.

    Parameters
    ----------
    y_true, y_pred
        Ground-truth and predicted labels, as tensors (on any device) or as
        array-likes.

    Returns
    -------
    tuple
        ``(accuracy, f1, recall, precision)``; the last three use
        ``average='weighted'``.
    """
    # Decide per input whether a device-to-host copy is needed. Keying this off
    # torch.cuda.is_available() called .cpu() on non-tensor inputs whenever a
    # GPU happened to be present.
    y_true = _labels_to_numpy(y_true)
    y_pred = _labels_to_numpy(y_pred)

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    precision = precision_score(y_true, y_pred, average='weighted')
    return accuracy, f1, recall, precision

In [27]:
def _vectorizer_config(max_features, analyzer, ngram_range, mlp_layers, mlp_dropout):
    """Build the settings dict for a CountVectorizer/TfidfVectorizer entry."""
    return {
        'max_features': max_features,
        'analyzer': analyzer,
        'range': ngram_range,
        'mlp_layers': mlp_layers,
        'mlp_dropout': mlp_dropout,
    }


def _mlp_config(mlp_layers, mlp_dropout):
    """Build the settings dict for an entry that only configures the MLP."""
    return {'mlp_layers': mlp_layers, 'mlp_dropout': mlp_dropout}


# Settings per dataset and per embedding model.
# 'mlp_layers' is either a single int or a tuple of three ints; 'mlp_dropout'
# is 0 or 0.25. Vectorizer entries additionally carry 'max_features',
# 'analyzer' and 'range'.
config_dict = {
    'tamilmixsentiment': {
        'CountVectorizer': _vectorizer_config(5000, 'word', (2, 5), (512, 256, 128), 0.25),
        'TfidfVectorizer': _vectorizer_config(2000, 'char', (2, 5), (512, 256, 128), 0.25),
        'BERT': _mlp_config(1024, 0),
        'glove': _mlp_config((1024, 512, 256), 0),
        'word2vec': _mlp_config(2048, 0),
        'fasttext': _mlp_config(2048, 0),
    },
    'offenseval_dravidian_ta': {
        'CountVectorizer': _vectorizer_config(10000, 'char', (1, 3), 2048, 0),
        'TfidfVectorizer': _vectorizer_config(10000, 'char', (1, 3), 2048, 0),
        'BERT': _mlp_config(1024, 0),
        'glove': _mlp_config(1024, 0),
        'word2vec': _mlp_config(1024, 0),
        'fasttext': _mlp_config(1024, 0),
    },
    'offenseval_dravidian_ml': {
        'CountVectorizer': _vectorizer_config(5000, 'word', (2, 5), (1024, 512, 256), 0.25),
        'TfidfVectorizer': _vectorizer_config(10000, 'word', (2, 5), 2048, 0),
        'BERT': _mlp_config((1024, 512, 256), 0),
        'glove': _mlp_config((512, 256, 128), 0),
        'word2vec': _mlp_config(2048, 0),
        'fasttext': _mlp_config(1024, 0),
    },
    'offenseval_dravidian_kn': {
        'CountVectorizer': _vectorizer_config(2000, 'char', (2, 5), 2048, 0),
        'TfidfVectorizer': _vectorizer_config(5000, 'word', (2, 5), (1024, 512, 256), 0.25),
        'BERT': _mlp_config(2048, 0.25),
        'glove': _mlp_config(1024, 0.25),
        'word2vec': _mlp_config((1024, 512, 256), 0.25),
        'fasttext': _mlp_config((1024, 512, 256), 0.25),
    },
    'kan_hope': {
        'CountVectorizer': _vectorizer_config(1000, 'char', (1, 3), 2048, 0.25),
        'TfidfVectorizer': _vectorizer_config(10000, 'word', (2, 5), (1024, 512, 256), 0),
        'BERT': _mlp_config((1024, 512, 256), 0),
        'glove': _mlp_config(2048, 0),
        'word2vec': _mlp_config(1024, 0.25),
        'fasttext': _mlp_config(1024, 0.25),
    },
}

In [None]:
EPOCHS = 50
ETA = 0.001
BATCH_SIZE = 64
tdevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Root folder of the pretrained .magnitude weights; only read when the
# embeddings are recomputed instead of loaded.
weight_path=''

# True: read the cached embeddings from disk. False: build the encoder and
# embed every sentence. The flag deliberately has its own name -- assigning to
# `load_embedding` would rebind the load_embedding() function that
# CodemixDataset calls.
LOAD_EMBEDDING = True

for dataset_name, tag in tag_dict.items():
    train_df, val_df = get_data(tag)

    for model_name, config in config_dict[dataset_name].items():
        # The vectorizers and BERT are their own encoding type; every other
        # model name is served through pymagnitude (same split as in
        # load_embedding()).
        if model_name in ('TfidfVectorizer', 'CountVectorizer', 'BERT'):
            model_type = model_name
        else:
            model_type = 'magnitude'

        # The encoder is only needed when the embeddings are recomputed.
        encoder_model = None
        if not LOAD_EMBEDDING:
            if model_type == 'magnitude':
                english_path = os.path.join(weight_path, 'english', model_name+'.magnitude')
                dravidian_path = os.path.join(weight_path, 'dravidian', tag[2], model_name+'.magnitude')

                english_model = Magnitude(english_path)
                dravidian_model = Magnitude(dravidian_path)
                encoder_model = Magnitude(english_model, dravidian_model, devices=[0,1])
            elif model_type == 'BERT':
                encoder_model = SentenceTransformer('distiluse-base-multilingual-cased')
            elif model_type == 'TfidfVectorizer':
                encoder_model = TfidfVectorizer(analyzer=config['analyzer'], ngram_range=config['range'], max_features=config['max_features'])
                encoder_model.fit(train_df['text'])
            elif model_type == 'CountVectorizer':
                encoder_model = CountVectorizer(analyzer=config['analyzer'], ngram_range=config['range'], max_features=config['max_features'])
                encoder_model.fit(train_df['text'])

        train_set = CodemixDataset(train_df, encoder_model, model_type, dataset_name, model_name, mode='train', load=LOAD_EMBEDDING)
        val_set = CodemixDataset(val_df, encoder_model, model_type, dataset_name, model_name, mode='val', load=LOAD_EMBEDDING)
        train_loader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
        test_loader = torch.utils.data.DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)

        # Each (dataset, model) pair carries its own MLP settings: a single
        # hidden width selects the 2-layer network, a tuple of three widths
        # the 4-layer network.
        hn = config['mlp_layers']
        dp = config['mlp_dropout']
        if isinstance(hn, (tuple, list)):
            num_layers, LinearNetwork = 4, LinearNetwork_4Layer
        else:
            num_layers, LinearNetwork = 2, LinearNetwork_2Layer

        fname = model_name+"_"+tag[0]+"_"+tag[2]+"_"+str(num_layers)+"L_"+str(dp)+"D_"+str(hn)
        print(fname)

        #change project name    #entity - nnproj
        run = wandb.init(project=dataset_name, entity="nnproj",reinit=True,name=fname)
        try:
            # NOTE(review): with cached embeddings train() receives
            # encoder_model=None -- confirm it can determine the input width
            # without an encoder.
            train(LinearNetwork, encoder_model, model_type, train_df, val_df, epochs=EPOCHS, \
                learning_rate=ETA, batch_size=BATCH_SIZE, log=wandb.log, device=tdevice,hiddenNodes=hn,dropout=dp, train_loader=train_loader, test_loader=test_loader)
        finally:
            # Close the run even if training fails so the next run starts clean.
            run.finish()

# Observation and results