In [1]:
from datasets import list_datasets, load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm as tqdm
#from pymagnitude import *
from nltk import word_tokenize
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import wandb

In [None]:
from pymagnitude import *

In [2]:
tags = [('tamilmixsentiment', None, 'tamil'), 
        ('offenseval_dravidian', 'tamil', 'tamil'), 
        ('offenseval_dravidian', 'malayalam', 'malayalam'),
        ('offenseval_dravidian', 'kannada', 'kannada'),
        ('kan_hope', None, 'kannada'),
        #('Shushant/NepaliSentiment', None, 'hindi')
        ]

In [3]:
def get_data(tag):
    dataset = load_dataset(tag[0], tag[1])
    train_df = pd.DataFrame()
    val_df = pd.DataFrame()

    train_df['text'] = dataset['train']['text']
    train_df['label'] = dataset['train']['label']
    
    val_df['text'] = dataset['validation']['text']
    val_df['label'] = dataset['validation']['label']
    
#     print("TRAIN DESCRIPTION:")
#     print("Value Counts:")
#     print(train_df['label'].value_counts())
          
#     print("Sample text and label:")
#     for i in range(5):
#         idx = random.randint(0, len(train_df))
#         sample = train_df.iloc[idx]
#         print("Text: {}, Label:{}".format(sample['text'], sample['label']))
    return train_df, val_df

In [4]:
class LinearNetwork(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        # define layers
        self.fc1 = nn.Linear(in_features=input_size,out_features=1024)
        self.fc2 = nn.Linear(in_features=1024,out_features=output_size)
    # define forward function
    def forward(self, t):
        # fc 1
        t=self.fc1(t)
        t=F.relu(t)
        # fc 2
        t=self.fc2(t)
        # don't need softmax here since we'll use cross-entropy as activation.
        return t

In [5]:
def get_embedding(text, model, model_type):
    if model_type in ['TfidfVectorizer', 'CountVectorizer']:
        return model.transform(text).toarray()
    elif model_type == 'BERT':
        print("BERT model embedding...")
        return model.encode(text, batch_size=8, show_progress_bar=True)
    elif model_type == 'magnitude':
        vectors = []
        for sentence in tqdm(text):
            vectors.append(np.average(model.query(word_tokenize(sentence)), axis=0))
        return vectors    

In [6]:
class CodemixDataset(Dataset):
    def __init__(self, df, encoder_model, encoding_type):
        self.df = df
        self.embedding = get_embedding(list(self.df['text']), \
                                      encoder_model, encoding_type)
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        text_embedding = self.embedding[idx]
        label = self.df.iloc[idx]['label']
        sample = {
            'text': text_embedding,
            'label': label
        }
        return sample

In [7]:
def get_scores(y_true, y_pred):
    if torch.cuda.is_available():
        accuracy = accuracy_score(y_true.cpu(), y_pred.cpu())
        f1 = f1_score(y_true.cpu(), y_pred.cpu(), average='weighted')
        recall = recall_score(y_true.cpu(), y_pred.cpu(), average='weighted')
        precision = precision_score(y_true.cpu(), y_pred.cpu(), average='weighted')
    else:
        accuracy = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, average='weighted')
        recall = recall_score(y_true, y_pred, average='weighted')
        precision = precision_score(y_true, y_pred, average='weighted')
    return accuracy, f1, recall, precision

In [8]:
def train(model_architecture, encoder_model, encoding_type, train_df, test_df, epochs, \
          learning_rate, batch_size, log, device):
    input_size = get_embedding([train_df.iloc[0]['text']], encoder_model, encoding_type).shape
    model = model_architecture(input_size[1], len(train_df['label'].unique())).to(device)
    
    train_set = CodemixDataset(train_df, encoder_model, encoding_type)
    val_set = CodemixDataset(val_df, encoder_model, encoding_type)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size, shuffle=False)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss()

    model.train()
    for epoch in range(epochs):
        output_labels = torch.Tensor().to(device)
        true_labels = torch.Tensor().to(device)
        total_loss = 0
        for data in tqdm(train_loader):
            text, labels = data['text'], data['label']
            inputs = text.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs.type(torch.float))
            predicted_labels = torch.argmax(outputs, dim=1)
            #check this
            output_labels = torch.cat((output_labels, predicted_labels), 0)
            true_labels = torch.cat((true_labels, labels), 0)
            loss = criterion(outputs, labels.to(device))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        tr_accuracy, tr_f1, tr_recall, tr_precision = get_scores(true_labels, output_labels)
        net_train_loss = total_loss/(len(train_loader)*batch_size)
        print('Epoch: {}\t Train Loss: {:.4f} \t Train F1:{:.2f}'.format(epoch, net_train_loss, tr_f1), end='\t')
        
        output_labels = torch.Tensor().to(device)
        true_labels = torch.Tensor().to(device)
        total_loss = 0
        for i, data in enumerate(test_loader, 0):
            text, labels = data['text'], data['label']
            
            inputs = text.to(device)
            labels = labels.to(device)
            
            outputs = model(inputs.type(torch.float))
            predicted_labels = torch.argmax(outputs, dim=1)
            output_labels = torch.cat((output_labels, predicted_labels), 0)
            true_labels = torch.cat((true_labels, labels), 0)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
        ts_accuracy, ts_f1, ts_recall, ts_precision = get_scores(true_labels, output_labels)
        net_test_loss = total_loss/(len(train_loader)*batch_size)
        print('Test Loss: {:.4f} \t Test F1:{:.2f}'.format(net_test_loss, ts_f1))

        if log != None:
            log({
                    "train accuracy": tr_accuracy,
                    "train f1": tr_f1,
                    "train recall": tr_recall,
                    "train precision": tr_precision,
                    
                    "test accuarcy": ts_accuracy,
                    "test f1": ts_f1,
                    "test recall": ts_recall,
                    "test precision": ts_precision,
                    
                    "train loss": net_train_loss,
                    "test loss": net_test_loss
                    })

In [11]:
EPOCHS = 10
ETA = 0.001
BATCH_SIZE = 32
tdevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")
run = wandb.init(project="nn-project2", entity="nnproj",reinit=True)

train_df, val_df = get_data(tags[1])
# train_df = pd.read_csv('../dataset/dravidian-codemix/tamil_train.tsv', sep='\t')
# val_df = pd.read_csv('../dataset/dravidian-codemix/tamil_dev.tsv', sep='\t')

# tfidf vec
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,3), max_features=2048)
vectorizer.fit(train_df['text'])
train(LinearNetwork, vectorizer, 'CountVectorizer', train_df, val_df, epochs=EPOCHS, \
      learning_rate=ETA, batch_size=BATCH_SIZE, log=wandb.log, device=tdevice)

# # tfidf vec
# vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,3), max_features=2048)
# vectorizer.fit(train_df['text'])
# train(LinearNetwork, vectorizer, 'TfidfVectorizer', train_df, val_df, epochs=EPOCHS, \
#       learning_rate=ETA, batch_size=BATCH_SIZE, log=None, device=tdevice)

# #fasttext - specific language
# model = Magnitude("../weights/fasttext/{}/{}.magnitude".format(tags[0][2], tags[0][2]))
# train(LinearNetwork, model, 'magnitude', train_df, val_df, epochs=EPOCHS, \
#       learning_rate=ETA, batch_size=BATCH_SIZE, log=None, device=tdevice)

# bert - multilingual
# model = SentenceTransformer('distiluse-base-multilingual-cased')
# train(LinearNetwork, model, 'BERT', train_df, val_df, epochs=EPOCHS, \
#       learning_rate=ETA, batch_size=BATCH_SIZE, log=None, device=tdevice)

run.finish()




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
test accuarcy,▇██▅▃▂▄▁▄▂
test f1,▁█▆█▇▆▆▇▇▆
test loss,▁▁▂▂▃▅▅▆▇█
test precision,▁█▄█▅▃▄▇▅▃
test recall,▇██▅▃▂▄▁▄▂
train accuracy,▁▂▃▄▅▇▇███
train f1,▁▂▃▄▆▇▇███
train loss,█▇▆▅▄▂▂▁▁▁
train precision,▁▂▃▄▆▇▇███
train recall,▁▂▃▄▅▇▇███

0,1
test accuarcy,0.74521
test f1,0.72798
test loss,0.00598
test precision,0.71691
test recall,0.74521
train accuracy,0.98782
train f1,0.9878
train loss,0.0015
train precision,0.98779
train recall,0.98782


Reusing dataset offenseval_dravidian (C:\Users\deepa\.cache\huggingface\datasets\offenseval_dravidian\tamil\1.0.0\caf62757ff7f5922e043f21abf68745096b24007c4b79d5b2344ea3a7238563f)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1099 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 0	 Train Loss: 0.0243 	 Train F1:0.69	Test Loss: 0.0028 	 Test F1:0.72


  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1099 [00:00<?, ?it/s]

Epoch: 1	 Train Loss: 0.0201 	 Train F1:0.74	Test Loss: 0.0029 	 Test F1:0.73


  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1099 [00:00<?, ?it/s]

Epoch: 2	 Train Loss: 0.0173 	 Train F1:0.78	Test Loss: 0.0030 	 Test F1:0.73


  0%|          | 0/1099 [00:00<?, ?it/s]

Epoch: 3	 Train Loss: 0.0135 	 Train F1:0.84	Test Loss: 0.0034 	 Test F1:0.73


  0%|          | 0/1099 [00:00<?, ?it/s]

Epoch: 4	 Train Loss: 0.0095 	 Train F1:0.90	Test Loss: 0.0039 	 Test F1:0.72


  0%|          | 0/1099 [00:00<?, ?it/s]

Epoch: 5	 Train Loss: 0.0064 	 Train F1:0.93	Test Loss: 0.0045 	 Test F1:0.72


  0%|          | 0/1099 [00:00<?, ?it/s]

Epoch: 6	 Train Loss: 0.0044 	 Train F1:0.96	Test Loss: 0.0050 	 Test F1:0.73


  0%|          | 0/1099 [00:00<?, ?it/s]

Epoch: 7	 Train Loss: 0.0032 	 Train F1:0.97	Test Loss: 0.0055 	 Test F1:0.72


  0%|          | 0/1099 [00:00<?, ?it/s]

Epoch: 8	 Train Loss: 0.0027 	 Train F1:0.98	Test Loss: 0.0060 	 Test F1:0.72


  0%|          | 0/1099 [00:00<?, ?it/s]

Epoch: 9	 Train Loss: 0.0023 	 Train F1:0.98	Test Loss: 0.0063 	 Test F1:0.72



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
test accuarcy,█▆▅▅▁▂▃▃▂▂
test f1,▁▆█▅▅▄▆▄▂▂
test loss,▁▁▁▂▃▄▅▆▇█
test precision,▇█▇▃▄▃▅▃▁▂
test recall,█▆▅▅▁▂▃▃▂▂
train accuracy,▁▂▃▄▆▇▇███
train f1,▁▂▃▅▆▇████
train loss,█▇▆▅▃▂▂▁▁▁
train precision,▁▂▃▅▆▇████
train recall,▁▂▃▄▆▇▇███

0,1
test accuarcy,0.74567
test f1,0.71907
test loss,0.00634
test precision,0.70559
test recall,0.74567
train accuracy,0.97817
train f1,0.97801
train loss,0.00229
train precision,0.97803
train recall,0.97817


In [None]:
tdevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tdevice

In [None]:
torch.cuda.is_available()

In [None]:
import torch
print(torch.__version__)

In [20]:
ngram_analyzer = ["char","word"]
ngram_ranges = [(1,3),(2,5)]
ngram_maxfeatures = [1000, 2000, 5000, 10000]
# mlp_layers = [2,4]
mlp_dropout=[0,0.25]
mlp_hiddenNodes = [1024, 2048]

In [None]:
# mlp_config = [
#     {
#         'layers': 2,
#         'dropout':[0,0.25]
#         'hiddenNodes':[1024, 2048]
#     }
    
# ]

In [21]:
for na in ngram_analyzer:
    for nr in ngram_ranges:
        for nmf in ngram_maxfeatures:
            for dp in mlp_dropout:
                for hn in mlp_hiddenNodes:
                    name = na+"_"+str(nr[0])+"-"+str(nr[1])+"_"+str(nmf)+"_2L_"+str(dp)+"D_"+str(hn)
                    print(name)

char_1-3_1000_2L_0D_1024
char_1-3_1000_2L_0D_2048
char_1-3_1000_2L_0.25D_1024
char_1-3_1000_2L_0.25D_2048
char_1-3_2000_2L_0D_1024
char_1-3_2000_2L_0D_2048
char_1-3_2000_2L_0.25D_1024
char_1-3_2000_2L_0.25D_2048
char_1-3_5000_2L_0D_1024
char_1-3_5000_2L_0D_2048
char_1-3_5000_2L_0.25D_1024
char_1-3_5000_2L_0.25D_2048
char_1-3_10000_2L_0D_1024
char_1-3_10000_2L_0D_2048
char_1-3_10000_2L_0.25D_1024
char_1-3_10000_2L_0.25D_2048
char_2-5_1000_2L_0D_1024
char_2-5_1000_2L_0D_2048
char_2-5_1000_2L_0.25D_1024
char_2-5_1000_2L_0.25D_2048
char_2-5_2000_2L_0D_1024
char_2-5_2000_2L_0D_2048
char_2-5_2000_2L_0.25D_1024
char_2-5_2000_2L_0.25D_2048
char_2-5_5000_2L_0D_1024
char_2-5_5000_2L_0D_2048
char_2-5_5000_2L_0.25D_1024
char_2-5_5000_2L_0.25D_2048
char_2-5_10000_2L_0D_1024
char_2-5_10000_2L_0D_2048
char_2-5_10000_2L_0.25D_1024
char_2-5_10000_2L_0.25D_2048
word_1-3_1000_2L_0D_1024
word_1-3_1000_2L_0D_2048
word_1-3_1000_2L_0.25D_1024
word_1-3_1000_2L_0.25D_2048
word_1-3_2000_2L_0D_1024
word_1-3_2000