In [1]:
import pickle
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

Install the necessary Libraries before running the code

In [2]:
import torch
if torch.cuda.is_available():       
    device = torch.device("cuda:0")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name())
else:
    
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 3 GPU(s) available.
We will use the GPU: NVIDIA RTX A5000


In [3]:
import torch
torch.cuda.set_device(2)
import gc 
gc.collect()
torch.cuda.empty_cache()

### Dataset
The dataset can be downloaded from [SemEval-2020 Task 9: Overview of Sentiment Analysis of Code-Mixed Tweets](https://aclanthology.org/2020.semeval-1.100/) (Patwa et al., SemEval 2020)
<br>
Replace the train, validation and test file paths with actual path.

In [4]:
# from datasets import load_dataset

# data_files = {
#     'train': 'ENHINDATA/train.csv',
#     'validation': 'ENHINDATA/validation.csv',
#     'test': 'ENHINDATA/test.csv'
# }
# dataset = load_dataset('csv', data_files=data_files)
# print(dataset)


In [5]:
X_train = []
y_train = []
X_eval = []
y_eval = []

In [6]:
with open('engspa/train.txt',encoding='UTF-8') as rf:
    lines = rf.readlines()
for lin in lines:
    da = lin.split('\t')
    lab = da[1].strip()
    X_train.append(da[0])
    if lab == "neutral":
        y_train.append(0)
    if lab == "positive":
        y_train.append(1)
    if lab == "negative":
        y_train.append(2)

In [7]:
len(X_train)

12194

In [8]:
with open('engspa/validation.txt',encoding='UTF-8') as rf:
    lines = rf.readlines()
for lin in lines:
    da = lin.split('\t')
    lab = da[1].strip()
    X_eval.append(da[0])
    if lab == "neutral":
        y_eval.append(0)
    if lab == "positive":
        y_eval.append(1)
    if lab == "negative":
        y_eval.append(2)

In [9]:
with open('engspa/test.txt',encoding='UTF-8') as rf:
    lines = rf.readlines()
X_test=[]
y_test = []
for lin in lines:
    da = lin.split('\t')
    # lab = da[1].strip()
    X_test.append(da[0])
    # if lab == "neutral":
    #     y_test.append(0)
    # if lab == "positive":
    #     y_test.append(1)
    # if lab == "negative":
    #     y_test.append(2)

In [10]:
len(X_test)

4736

Replace the llm model and llm tokenizer with the desired decoder only llm model.

In [11]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoTokenizer
from peft import get_peft_model, LoraConfig

# Model and tokenizer setup
llm_model_name = 'google/gemma-2-2b'
gpttokenizer = AutoTokenizer.from_pretrained(llm_model_name)
gpttokenizer.add_special_tokens({'pad_token': '[PAD]'})
llmmodel = AutoModel.from_pretrained(llm_model_name, trust_remote_code=True)

# LoRA configuration
lora_config = LoraConfig(
    r=64,  # Rank of the low-rank adaptation
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.05,  # Dropout rate
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "down_proj", "gate_proj", "up_proj"] # Target modules for LoRA
)

# Apply LoRA to the model
gptmodel = get_peft_model(llmmodel, lora_config)

# Move the model to the desired GPU
gptmodel.to('cuda:0')

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

PeftModel(
  (base_model): LoraModel(
    (model): Gemma2Model(
      (embed_tokens): Embedding(256000, 2304, padding_idx=0)
      (layers): ModuleList(
        (0-25): 26 x Gemma2DecoderLayer(
          (self_attn): Gemma2SdpaAttention(
            (q_proj): lora.Linear(
              (base_layer): Linear(in_features=2304, out_features=2048, bias=False)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=2304, out_features=64, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=64, out_features=2048, bias=False)
              )
              (lora_embedding_A): ParameterDict()
              (lora_embedding_B): ParameterDict()
            )
            (k_proj): lora.Linear(
              (base_layer): Linear(in_features=2304, out_features=1024, bias=False)
              (lora_dropout)

Replace the bertmodel and tokenizer with the desired encoder model.

In [12]:
from transformers import BertTokenizer, BertModel

berttokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
bertmodel = BertModel.from_pretrained('bert-base-multilingual-uncased')
bertmodel.to('cuda:0')

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(105879, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [13]:
class Classifier(nn.Module):
    def __init__(self, hidden_size, num_classes):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(hidden_size, 768)  # Combine BERT + Gemma embeddings
        self.fc2 = nn.Linear(768, 256)
        self.fc3 = nn.Linear(256, 64)
        self.fc4 = nn.Linear(64, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.relu(x)
        x = self.fc4(x)
        return torch.softmax(x, dim=1)

In [14]:
num_classes = 3  # For example
classifier = Classifier(bertmodel.config.hidden_size + gptmodel.config.hidden_size, num_classes)
classifier.to('cuda:0')

Classifier(
  (fc1): Linear(in_features=3072, out_features=768, bias=True)
  (fc2): Linear(in_features=768, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=3, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
)

In [15]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(list(bertmodel.parameters()) + list(gptmodel.parameters()) + list(classifier.parameters()), lr=5e-4)

## Test & Validation

In [16]:
from sklearn.metrics import f1_score
def predict(bertmodel, gptmodel):
    n = len(X_eval)
    bertmodel.eval()
    gptmodel.eval()
    correct = 0
    y_pred = []
    y_act = []
    for j in tqdm(range(n)):
        inputsgpt = gpttokenizer(X_eval[j], return_tensors="pt")
        inputsbert = berttokenizer(X_eval[j], return_tensors="pt")
        
        with torch.no_grad():
            outputsbert = bertmodel(**(inputsbert.to('cuda:0')))
            outputsgpt = gptmodel(**(inputsgpt.to('cuda:0')))
        bert_hidden_states = outputsbert[0].mean(dim = 1)
        gpt_hidden_states = outputsgpt[0].mean(dim = 1)
        
        
        representation = torch.cat([bert_hidden_states, gpt_hidden_states], dim=1)
        # representation = representation.unsqueeze(1)    
        logits = classifier(representation)
        print(logits)
        y_pred.append(torch.argmax(logits, dim=1).item())
        if torch.argmax(logits, dim=1)==y_eval[j]:
            correct += 1
        y_act.append(y_eval[j])
    print('Validation Accuracy: ', correct/n)
    print('Validation F1 Score: ', f1_score(y_pred,y_act, average='weighted'))

In [17]:

def test(bertmodel, gptmodel):
    n = len(X_test)
    bertmodel.eval()
    gptmodel.eval()
    correct = 0
    y_pred = []
    for j in tqdm(range(n)):
        inputsgpt = gpttokenizer(X_test[j], return_tensors="pt")
        inputsbert = berttokenizer(X_test[j], return_tensors="pt")
        
        with torch.no_grad():
            outputsbert = bertmodel(**(inputsbert.to('cuda:0')))
            outputsgpt = gptmodel(**(inputsgpt.to('cuda:0')))
        bert_hidden_states = outputsbert[0].mean(dim = 1)
        gpt_hidden_states = outputsgpt[0].mean(dim = 1)
        representation = torch.cat([bert_hidden_states, gpt_hidden_states], dim=1)
            
        logits = classifier(representation)
        k = torch.argmax(logits, dim=1)
        y_pred.append(k)
    y_pred = [t.item() for t in y_pred]
    return y_pred
            #correct += 1
    #print('Validation Accuracy: ', correct/n)

## Training stochastic


In [35]:
from tqdm import tqdm
def train(bertmodel, gptmodel):
    for i in range(3):
        running_loss = 0
        n = len(X_train)
        for j in tqdm(range(n)):
            inputsgpt = gpttokenizer(X_train[j], return_tensors="pt")
            
            inputsbert = berttokenizer(X_train[j], return_tensors="pt")

            outputsbert = bertmodel(**(inputsbert.to('cuda:0')))
            bert_hidden_states = outputsbert[0].mean(dim=1)
            outputsgpt = gptmodel(**(inputsgpt.to('cuda:0')))
            
            gpt_hidden_states = outputsgpt[0].mean(dim=1)
           
            representation = torch.cat([bert_hidden_states, gpt_hidden_states], dim=1)
            
            logits = classifier(representation)
            target = torch.tensor(y_train[j]).to('cuda:0')
            target = target.reshape(-1)
            loss = criterion(logits, target)

            running_loss += loss.item()

            # Backpropagate the gradients
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print("------------\nEPOCH :", i )
        predict(bertmodel, gptmodel)
        y_pred = test(bertmodel,gptmodel)
        with open('results_{}.txt'.format(i), 'w') as file:
            for x in y_pred:
                if x == 0:
                    file.write('neutral')
                elif x == 1:
                    file.write('positive')
                else:
                    file.write('negative')
            

## batch gradient descent

In [21]:
import torch
from tqdm import tqdm
from sklearn.metrics import f1_score

batch_size = 2  

def batch_tokenize(tokenizer, texts):
    # Tokenize a list of texts and return tensors on CUDA device
    # print(tokenizer(texts, return_tensors='pt', padding=True, truncation=True))
    return tokenizer(texts, return_tensors='pt', padding=True, truncation=True)

def predict(bertmodel, gptmodel):
    bertmodel.eval()
    gptmodel.eval()

    y_pred = []
    y_act = []

    n = len(X_eval)
    for i in tqdm(range(0, n, batch_size)):
        batch_texts = X_eval[i:i+batch_size]
        batch_labels = y_eval[i:i+batch_size]

        inputsbert = batch_tokenize(berttokenizer, batch_texts)
        inputsgpt = batch_tokenize(gpttokenizer, batch_texts)

        with torch.no_grad():
            outputsbert = bertmodel(**inputsbert)
            outputsgpt = gptmodel(**inputsgpt)

        bert_hidden_states = outputsbert.last_hidden_state.mean(dim=1)  # (batch_size, hidden_dim)
        gpt_hidden_states = outputsgpt.last_hidden_state.mean(dim=1)    # (batch_size, hidden_dim)

        representation = torch.cat([bert_hidden_states, gpt_hidden_states], dim=1)  # (batch_size, 2*hidden_dim)

        logits = classifier(representation)  # (batch_size, num_classes)
        preds = torch.argmax(logits, dim=1).cpu().tolist()

        y_pred.extend(preds)
        y_act.extend(batch_labels)

    correct = sum([p == a for p, a in zip(y_pred, y_act)])
    accuracy = correct / n
    f1 = f1_score(y_act, y_pred, average='weighted')

    print(f'Validation Accuracy: {accuracy:.4f}')
    print(f'Validation F1 Score: {f1:.4f}')


def train(bertmodel, gptmodel):
    bertmodel.train()
    gptmodel.train()

    n = len(X_train)
    for epoch in range(3):
        running_loss = 0.0

        for i in tqdm(range(0, n, batch_size)):
            batch_texts = X_train[i:i+batch_size]
            batch_labels = y_train[i:i+batch_size]
            
            inputsbert = batch_tokenize(berttokenizer, batch_texts)
            inputsgpt = batch_tokenize(gpttokenizer, batch_texts)
           
            outputsbert = bertmodel(**(inputsbert.to('cuda:0')))
            outputsgpt = gptmodel(**(inputsgpt.to('cuda:0')))

            bert_hidden_states = outputsbert.last_hidden_state.mean(dim=1)
            gpt_hidden_states = outputsgpt.last_hidden_state.mean(dim=1)

            representation = torch.cat([bert_hidden_states, gpt_hidden_states], dim=1)

            logits = classifier(representation)

            target = torch.tensor(batch_labels).to('cuda:0')
            loss = criterion(logits, target)

            running_loss += loss.item() * len(batch_labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        avg_loss = running_loss / n
        print(f"------------\nEPOCH : {epoch} | Average Loss: {avg_loss:.4f}")

        predict(bertmodel, gptmodel)

        y_pred = test(bertmodel, gptmodel)
        with open(f'results_{epoch}.txt', 'w') as file:
            for x in y_pred:
                if x == 0:
                    file.write('neutral\n')
                elif x == 1:
                    file.write('positive\n')
                else:
                    file.write('negative\n')


In [22]:
train(bertmodel, gptmodel)

  0%|                                                  | 0/6097 [00:00<?, ?it/s]/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [418,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [418,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [418,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [418,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [418,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [418,0,0], thread: [37,0,0] Assertion `src

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


predict function prints the test F1 score 

In [32]:
 predict(bertmodel, gptmodel)

100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 16.38it/s]

tensor([[1.0000e+00, 6.3382e-16, 1.9429e-16]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
tensor([[1.0000e+00, 1.9566e-18, 3.5129e-19]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
tensor([[1.0000e+00, 3.0373e-16, 4.8170e-16]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
Validation Accuracy:  0.0
Validation F1 Score:  0.0





In [40]:
y_pred = test(bertmodel,gptmodel)

  4%|█▋                                      | 201/4736 [00:10<03:51, 19.57it/s]


KeyboardInterrupt: 

In [None]:
with open("sa_spa_eng", "w") as writer:
        writer.write('\n'.join(y_pred))