In [1]:
import os
import json
import re
import torch
import numpy as np
import pandas as pd
from datetime import datetime
from itertools import chain
from tqdm import tqdm
from tqdm.auto import tqdm as auto_tqdm
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import (
    DebertaV2Tokenizer,
    DebertaV2Model,
    AutoTokenizer,
    AutoModel,
    AutoModelForCausalLM,
    AdamW,
    get_scheduler,
    BitsAndBytesConfig
)
from datasets import load_dataset, DatasetDict
import sentencepiece
import pickle
import random
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from transformers import DebertaV2Tokenizer, DebertaV2Model
from torch import nn
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AdamW
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import logging
import sys

In [2]:
# Fetch the arena preference data, keep the leading 70% of its train split,
# and re-wrap that subset so later cells can keep indexing dataset['train'].
dataset = load_dataset("lmsys/lmsys-arena-human-preference-55k")
subset_size = int(0.7 * len(dataset['train']))
dataset_sub = dataset['train'].select(range(subset_size))
dataset = DatasetDict({"train": dataset_sub})

# Class index <-> outcome column name, in both directions.
label2name = dict(enumerate(['winner_model_a', 'winner_model_b', 'winner_tie']))
name2label = dict(zip(label2name.values(), label2name.keys()))

def clean_text(text):
    """Reduce a raw string to compact ASCII text.

    Three substitutions run in order: literal backslash-u escape sequences
    (backslash, ``u``, four hex digits) are dropped, runs of non-ASCII
    characters (emojis included) are dropped, and every whitespace run is
    collapsed to one space. The result is returned stripped.
    """
    substitutions = (
        (r'\\u[0-9a-fA-F]{4}', ''),  # escape sequences left in the text as characters
        (r'[^*\x00-\x7F]+', ''),     # anything outside the ASCII range
        (r'\s+', ' '),               # whitespace runs -> single space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [3]:
# Checkpoint name; the same id is passed to DebertaClassifier further down,
# so the tokenizer and the encoder come from one checkpoint.
model_id = 'microsoft/deberta-v3-base'

# Tokenizer for that checkpoint (also the source of the separator token used
# when prompt and response strings are joined).
tokenizer = DebertaV2Tokenizer.from_pretrained(model_id)

In [4]:
# Parallel lists, one entry per example. They are re-created on every run of
# this cell, so re-running it does not duplicate rows.
prompts = []
responses_a_arr = []
responses_b_arr = []
data_text_a = []
data_text_b = []
labels = []
sep_token = tokenizer.sep_token


def _last_turn(raw_json):
    """Return the cleaned final turn of a JSON-encoded list of turns.

    Only the last turn of the conversation is used. A final turn that is
    ``None`` (JSON ``null``) or a list with no turns at all yields ``''``
    instead of raising.
    """
    turns = json.loads(raw_json)
    final_turn = turns[-1] if turns else None
    return clean_text(final_turn) if final_turn is not None else ''


for example in tqdm(dataset['train']):
    clean_prompt = _last_turn(example['prompt'])
    clean_a = _last_turn(example['response_a'])
    clean_b = _last_turn(example['response_b'])

    prompts.append(clean_prompt)
    responses_a_arr.append(clean_a)
    responses_b_arr.append(clean_b)

    # Each model input is "<prompt><sep><response>"; the two candidates share
    # the prompt and differ only in the response.
    data_text_a.append(clean_prompt + sep_token + clean_a)
    data_text_b.append(clean_prompt + sep_token + clean_b)

    # Exactly one label per example; anything that is not an explicit win for
    # A or B is treated as a tie.
    if example['winner_model_a'] == 1:
        labels.append(name2label['winner_model_a'])
    elif example['winner_model_b'] == 1:
        labels.append(name2label['winner_model_b'])
    else:
        labels.append(name2label['winner_tie'])

# print(data_text_a[0])

100%|███████████████████████████████████████████████████████████████████| 40233/40233 [00:09<00:00, 4216.73it/s]


In [5]:
# Spot-check: first "<prompt><sep><response A>" string produced by the loop above.
data_text_a[0]

'OK, does pineapple belong on a pizza? Relax and give me fun answer.[SEP]Ah, the age-old culinary conundrum that has divided nations and dinner tables: does pineapple belong on a pizza? The tropical twist of pineapple on pizza, known as Hawaiian pizza, is a hotly debated topic where taste buds battle and pizza purists protest. Let\'s slice into the debate with a zest of fun: **Team Pineapple:** "Absolutely, yes! Pineapple on pizza is like a beach party in your mouth. The sweet juiciness of pineapple chunks frolicking with savory ham or bacon creates a flavor wave that surfs across the cheesy ocean of deliciousness. It\'s the Mardi Gras of pizzas, where sweet meets savory in a jubilant jamboree!" **Team No-Pineapple:** "No way, not in a million pizzas! Pineapple is a fruit that should be sunbathing on a tropical fruit platter, not freeloading on a sacred slice of pizza. The thought of warm, melty cheese conspiring with pineapple\'s sugary mischief is enough to make Italian ancestors tur

In [6]:
class TextPairDataset(Dataset):
    """Dataset over two parallel text collections plus one label per pair.

    Tokenization happens on access: both texts of an item are padded or
    truncated to ``max_length`` and returned as flattened tensors together
    with the label as a ``torch.long`` tensor.
    """

    def __init__(self, texts1, texts2, labels, tokenizer, max_length):
        self.texts1, self.texts2 = texts1, texts2
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # The label collection defines the dataset size.
        return len(self.labels)

    def _encode(self, text):
        """Tokenize one text to fixed length; return (input_ids, attention_mask), both flattened."""
        encoded = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        return encoded['input_ids'].flatten(), encoded['attention_mask'].flatten()

    def __getitem__(self, idx):
        ids_first, mask_first = self._encode(self.texts1[idx])
        ids_second, mask_second = self._encode(self.texts2[idx])
        return {
            'input_ids1': ids_first,
            'attention_mask1': mask_first,
            'input_ids2': ids_second,
            'attention_mask2': mask_second,
            'label': torch.tensor(self.labels[idx], dtype=torch.long),
        }


In [7]:
class DebertaClassifier(nn.Module):
    """Pair classifier built on a single shared DeBERTa encoder.

    Each text of a pair goes through the same encoder in its own forward
    pass. The hidden states at sequence position 0 of both passes are
    concatenated and fed to a two-layer head that emits ``num_classes``
    logits.
    """

    def __init__(self, model_name, hidden_size, num_classes, drop_out):
        super().__init__()
        self.deberta = DebertaV2Model.from_pretrained(model_name)

        # The head sees two concatenated embeddings, hence 2 * hidden_size inputs.
        head_modules = [
            nn.Dropout(drop_out),
            nn.Linear(2 * hidden_size, hidden_size),
            nn.Tanh(),
            nn.Dropout(drop_out),
            nn.Linear(hidden_size, num_classes),
        ]
        self.head_layer = nn.Sequential(*head_modules)

    def _first_position_state(self, input_ids, attention_mask):
        """Run the encoder and return the hidden state at sequence position 0."""
        encoded = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        return encoded.last_hidden_state[:, 0, :]

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        state_first = self._first_position_state(input_ids1, attention_mask1)
        state_second = self._first_position_state(input_ids2, attention_mask2)

        # Side-by-side embeddings of the two texts -> class logits.
        joined = torch.cat((state_first, state_second), dim=1)
        return self.head_layer(joined)

In [8]:
def train(model, data_loader, criterion, optimizer, device, scheduler,
          accumulation_steps, epochs, epoch, writer, best_accuracy,
          eval_loader=None, log=None):
    """Train ``model`` for one epoch with gradient accumulation.

    Validation runs roughly five times per epoch; whenever it beats
    ``best_accuracy`` the weights are written to ``deberta_base_<acc>.pth``.

    Args:
        model: Module called with ``input_ids1``, ``attention_mask1``,
            ``input_ids2`` and ``attention_mask2`` keyword arguments.
        data_loader: Sized iterable of batches (dicts holding those four keys
            plus ``'label'``).
        criterion: Loss applied to ``(outputs, labels)``.
        optimizer: Optimizer stepped once per accumulation window.
        device: Device the batch tensors are moved to.
        scheduler: Not stepped here (the caller steps it once per epoch); kept
            so existing calls stay valid. The logged learning rate is read
            from the optimizer's first parameter group.
        accumulation_steps: Number of batches whose gradients are summed
            before one optimizer step.
        epochs: Total number of epochs (progress text and logs only).
        epoch: Zero-based index of the current epoch.
        writer: Object exposing ``add_scalar(tag, value, step)``.
        best_accuracy: Best validation accuracy observed so far.
        eval_loader: Loader for intermediate validation. Defaults to the
            notebook-level ``val_loader``.
        log: Logger for progress messages. Defaults to the notebook-level
            ``logger``.

    Returns:
        ``(losses, best_accuracy, global_step)``: the per-batch losses of this
        epoch, the possibly improved best accuracy, and the global step of the
        last processed batch.
    """
    # Fall back to the notebook globals so the original positional call keeps working.
    if eval_loader is None:
        eval_loader = val_loader
    if log is None:
        log = logger

    total_batches = len(data_loader)
    # At least 1, so loaders shorter than five batches do not cause a modulo by zero.
    eval_frequency = max(1, total_batches // 5)

    model.train()
    optimizer.zero_grad()

    loop = tqdm(data_loader, leave=True)
    loop.set_description(f'Training Epoch [{epoch + 1}/{epochs}]')

    losses = []
    cumulative_loss = 0.0   # sum of batch losses since the start of this epoch
    running_loss = 0.0      # sum of batch losses in the current accumulation window
    window_batches = 0      # number of batches in the current accumulation window
    global_step = epoch * total_batches

    for i, batch in enumerate(loop):
        input_ids1 = batch['input_ids1'].to(device)
        attention_mask1 = batch['attention_mask1'].to(device)
        input_ids2 = batch['input_ids2'].to(device)
        attention_mask2 = batch['attention_mask2'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids1=input_ids1, attention_mask1=attention_mask1,
                        input_ids2=input_ids2, attention_mask2=attention_mask2)
        loss = criterion(outputs, labels)
        batch_loss = loss.item()
        losses.append(batch_loss)
        cumulative_loss += batch_loss
        running_loss += batch_loss
        window_batches += 1

        # Scale so the summed gradients of a full window match one large batch.
        (loss / accumulation_steps).backward()

        global_step = epoch * total_batches + i
        is_last_batch = (i + 1) == total_batches
        # Also step on the last batch so gradients of a trailing partial window
        # are applied instead of being discarded (they keep the
        # 1/accumulation_steps scaling, i.e. that step is slightly smaller).
        if (i + 1) % accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()
            writer.add_scalar('Loss/BatchSum', running_loss / window_batches, global_step)
            # Per-epoch running mean: divide by the batches seen this epoch.
            writer.add_scalar('Loss/Cumulative_Avg', cumulative_loss / (i + 1), global_step)
            writer.add_scalar('Learning_Rate', optimizer.param_groups[0]['lr'], global_step)
            running_loss = 0.0
            window_batches = 0

        loop.set_postfix(loss=batch_loss)
        writer.add_scalar('Loss/Train', batch_loss, global_step)

        if (i + 1) % eval_frequency == 0:
            acc, val_loss = eval_model(model, eval_loader, criterion, device, epoch, writer, global_step)
            # eval_model switches the model to eval mode; re-enable dropout for training.
            model.train()
            log.info(f'Epoch {epoch + 1}/{epochs}')
            log.info(f'Intermediate Validation Accuracy: {acc:.4f}, Validation Loss: {val_loss:.4f}')

            if acc > best_accuracy:
                best_accuracy = acc
                model_save_path = f'deberta_base_{acc:.4f}.pth'
                torch.save(model.state_dict(), model_save_path)
                log.info(f'Saved best model with accuracy: {acc:.4f}')

    return losses, best_accuracy, global_step

In [9]:
def eval_model(model, data_loader, criterion, device, epoch, writer, global_step):
    """Evaluate ``model`` on ``data_loader`` and log accuracy and loss.

    The model is switched to eval mode and is left in eval mode on return, so
    a caller that continues training must call ``model.train()`` afterwards.

    Args:
        model: Module called with ``input_ids1``, ``attention_mask1``,
            ``input_ids2`` and ``attention_mask2`` keyword arguments.
        data_loader: Iterable of batches (dicts holding those four keys plus
            ``'label'``).
        criterion: Loss applied to ``(outputs, labels)``.
        device: Device the batch tensors are moved to.
        epoch: Unused; kept so existing calls stay valid.
        writer: Object exposing ``add_scalar(tag, value, step)``.
        global_step: Step under which the two validation scalars are logged.

    Returns:
        ``(accuracy, avg_loss)`` as Python floats: the fraction of evaluated
        samples predicted correctly, and the mean of the per-batch losses.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model = model.eval()
    losses = []
    correct_predictions = 0
    samples_seen = 0

    loop = tqdm(data_loader, leave=True)
    loop.set_description('Evaluating')
    with torch.no_grad():
        for batch in loop:
            input_ids1 = batch['input_ids1'].to(device)
            attention_mask1 = batch['attention_mask1'].to(device)
            input_ids2 = batch['input_ids2'].to(device)
            attention_mask2 = batch['attention_mask2'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids1=input_ids1, attention_mask1=attention_mask1,
                            input_ids2=input_ids2, attention_mask2=attention_mask2)
            loss = criterion(outputs, labels)
            losses.append(loss.item())

            preds = outputs.argmax(dim=1)
            correct_predictions += (preds == labels).sum().item()
            # Count what was actually evaluated rather than trusting
            # len(data_loader.dataset), which differs when batches are dropped
            # or a sampler selects a subset.
            samples_seen += labels.size(0)

            loop.set_postfix(loss=loss.item())

    if samples_seen == 0:
        raise ValueError("eval_model received a data_loader that yielded no samples")

    accuracy = correct_predictions / samples_seen
    avg_loss = sum(losses) / len(losses)

    writer.add_scalar('Accuracy/Validation', accuracy, global_step)
    writer.add_scalar('Loss/Validation', avg_loss, global_step)

    return accuracy, avg_loss


In [10]:
# Positional hold-out split: the first `train_size` examples are used for
# training and the last `test_size` examples for validation.
train_size = 30000
test_size = 6000

# Both slices come from the same lists, so they are only disjoint while the
# data is long enough to hold both; otherwise validation rows leak into training.
assert train_size + test_size <= len(labels), (
    f"train/validation slices overlap: {train_size} + {test_size} > {len(labels)} examples"
)

# Explicit start index instead of a negative slice: `[-0:]` would return the
# whole list when test_size is 0.
val_start = len(labels) - test_size

train_df = pd.DataFrame.from_dict({'text1': data_text_a[:train_size], 'text2': data_text_b[:train_size], 'label': labels[:train_size]})
val_df = pd.DataFrame.from_dict({'text1': data_text_a[val_start:], 'text2': data_text_b[val_start:], 'label': labels[val_start:]})

In [11]:
# Preview the first rows of the training frame (columns: text1, text2, label).
train_df.head()

Unnamed: 0,text1,text2,label
0,"OK, does pineapple belong on a pizza? Relax an...","OK, does pineapple belong on a pizza? Relax an...",0
1,What is the minimal time to get them? 1 day or...,What is the minimal time to get them? 1 day or...,1
2,explain function calling. how would you call a...,explain function calling. how would you call a...,2
3,How can I create a test set for a very rare ca...,How can I create a test set for a very rare ca...,0
4,What is the best way to travel from Tel-Aviv t...,What is the best way to travel from Tel-Aviv t...,1


In [12]:
max_length = 512  # token budget per text; both sides of a pair are padded/truncated to it

# The train and validation datasets are built identically from their frames.
train_dataset, val_dataset = (
    TextPairDataset(
        texts1=frame['text1'].to_numpy(),
        texts2=frame['text2'].to_numpy(),
        labels=frame['label'].to_numpy(),
        tokenizer=tokenizer,
        max_length=max_length,
    )
    for frame in (train_df, val_df)
)

# Only the training loader shuffles; validation keeps the frame order.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# hidden_size is passed explicitly and sizes the classification head
# (a 1024-wide variant was tried earlier in place of 768).
model = DebertaClassifier(model_name=model_id, hidden_size=768, num_classes=3, drop_out=0.05).to(device)

# Hyperparameters: defined once and used for the optimizer, the run name and
# the log line, so what is logged is what is actually trained with.
lr = 2e-5
wd = 0.01
batch_size = train_loader.batch_size  # read back from the loader instead of repeating the literal
accumulation_steps = 8
epochs = 5

criterion = nn.CrossEntropyLoss().to(device)
# weight_decay is passed explicitly; previously `wd` was only logged and never
# reached the optimizer.
optimizer = AdamW(model.parameters(), lr=lr, weight_decay=wd)
# Halves the learning rate when the validation loss has not improved for
# `patience` consecutive scheduler steps (stepped once per epoch by the training loop).
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, min_lr=2e-7)

best_accuracy = 0.0
global_step = 0

# TensorBoard directory and text log share the same hyperparameter-derived name.
log_dir = os.path.join("runs", f"deberta_base_v5_lr_{lr}_wd_{wd}_bs_{batch_size}_accum_{accumulation_steps}")
log_file_path_template = "deberta_base_v5_lr_{}_wd_{}_bs_{}_accum_{}.txt"
writer = SummaryWriter(log_dir)

log_file_path = log_file_path_template.format(lr, wd, batch_size, accumulation_steps)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s', handlers=[
                logging.FileHandler(log_file_path),
                logging.StreamHandler(sys.stdout)
            ])
logger = logging.getLogger()
logger.info(f"Running experiment with learning rate: {lr}, weight decay: {wd}, batch size: {batch_size}, and accumulation steps: {accumulation_steps}")

# Epoch loop: train, validate, let the plateau scheduler react to the
# validation loss, and checkpoint whenever accuracy improves.
# try/finally guarantees the TensorBoard writer is flushed and closed even if
# training aborts (for example on a CUDA out-of-memory error).
try:
    for epoch in range(epochs):
        losses, best_accuracy, global_step = train(model, train_loader, criterion, optimizer, device, scheduler, accumulation_steps, epochs, epoch, writer, best_accuracy)
        acc, val_loss = eval_model(model, val_loader, criterion, device, epoch, writer, global_step)
        scheduler.step(val_loss)
        logger.info(f'Epoch {epoch + 1}/{epochs}')
        logger.info(f'Validation Accuracy: {acc:.4f}, Validation Loss: {val_loss:.4f}')

        if acc > best_accuracy:
            best_accuracy = acc
            model_save_path = f'deberta_base_{acc:.4f}.pth'
            torch.save(model.state_dict(), model_save_path)
            logger.info(f'Saved best model with accuracy: {acc:.4f}')

    # Save the final model (last-epoch weights, which need not be the best ones).
    torch.save(model.state_dict(), 'deberta_base_classifier')
finally:
    writer.close()

2024-07-10 05:14:06,345 - Running experiment with learning rate: 2e-05, weight decay: 0.01, batch size: 4, and accumulation steps: 8


  0%|                                                                                  | 0/7500 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 