This notebook demonstrates the steps for building a model training and evaluation pipeline.

<!-- %pip install -q transformers datasets evaluate -->

In [None]:
!git clone https://github.com/oopscompiled/nlp-project.git

In [None]:
%cd nlp-project/

In [None]:
%pip install -q transformers wandb

In [None]:
import random
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import wandb
import re
import ast
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import copy
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts
from transformers import AdamW
import warnings
warnings.filterwarnings('ignore')
# from datasets import Dataset

from src.models import * # main models

# Single seed shared by every random number generator used in the notebook.
seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(seed)

# Ask cuDNN to pick deterministic kernels.
torch.backends.cudnn.deterministic = True

# Show up to 150 characters per cell when a DataFrame is displayed.
pd.set_option('display.max_colwidth', 150)

In [None]:
# Number of examples per DataLoader batch.
BATCH_SIZE = 64
# Sequence length handed to the tokenizer as max_length.
# NOTE(review): the original note read "+2 for ([CLS], [SEP])" — presumably the
# special tokens count toward this budget; confirm.
MAX_LEN = 64

# Prefer Apple MPS, then CUDA, and fall back to the CPU.
if torch.backends.mps.is_available():
    _device_name = 'mps'
elif torch.cuda.is_available():
    _device_name = 'cuda'
else:
    _device_name = 'cpu'
DEVICE = torch.device(_device_name)
# VOCAB_SIZE = len(tokenizer)

In [None]:
# The CSV splits live in the repository's data/ folder. Paths are relative to
# the current working directory, which the earlier `%cd nlp-project/` cell sets
# to the repository root, so the notebook is not tied to a Kaggle-only
# absolute path.
DATA_DIR = 'data'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
validation = pd.read_csv(f'{DATA_DIR}/valid.csv')

In [None]:
def clean_paraphrase_tokens(text):
    """Parse a stringified token list and strip paraphrase artefacts from it.

    Args:
        text: string holding a Python literal list of tokens,
            e.g. "['i', 'feel', 'fine']".

    Returns:
        A list of tokens. Tokens whose lowercase form starts with "para" or
        "phrase" are removed, and a leading run of tokens that is immediately
        repeated is collapsed to a single occurrence. A blank ``text`` yields
        an empty list.
    """
    if text.strip() == "":
        return []

    raw_tokens = ast.literal_eval(text)

    marker = re.compile(r'^(para|phrase).*')
    kept = []
    for word in raw_tokens:
        if marker.match(word.lower()) is None:
            kept.append(word)

    joined = " ".join(kept)

    # Try prefixes from half the sentence down to two tokens; a prefix that
    # occurs more than once has its back-to-back duplicate collapsed.
    size = len(kept) // 2
    while size > 1:
        prefix = " ".join(kept[:size])
        if joined.count(prefix) > 1:
            joined = joined.replace(f"{prefix} {prefix}", prefix)
        size -= 1

    return joined.split()

In [None]:
# Replace the raw stringified token lists with cleaned token lists.
# NOTE: this overwrites the column in place, so the cell is not re-runnable
# without reloading `train` (the cleaner expects strings, not lists).
train['text'] = train['text'].map(clean_paraphrase_tokens)
# Number of tokens left in each cleaned sentence.
train['token_count'] = train['text'].map(len)

In [None]:
# Preview the first rows of the training frame after cleaning.
train.head()

In [None]:
def crossover(words1, words2):
    """Create two synthetic sentences by swapping the tails of two token lists.

    Each list is cut at its midpoint (``len // 2``). The first child is the
    head of ``words1`` followed by the tail of ``words2``; the second child is
    the head of ``words2`` followed by the tail of ``words1``. Every half of
    both parents is therefore used exactly once.

    Args:
        words1: list of tokens of the first sentence.
        words2: list of tokens of the second sentence.

    Returns:
        A list with two space-joined strings. If either input has fewer than
        three tokens no crossover is performed and the two inputs are returned
        joined as-is.
    """
    if len(words1) < 3 or len(words2) < 3:
        return [" ".join(words1), " ".join(words2)]

    split1 = len(words1) // 2
    split2 = len(words2) // 2

    # Head of one parent + tail of the other. The previous version started the
    # first child from words1[split1:], which discarded the head of words1 and
    # reused its tail in both children.
    new_text1 = " ".join(words1[:split1] + words2[split2:])
    new_text2 = " ".join(words2[:split2] + words1[split1:])

    return [new_text1, new_text2]

# Classes that receive synthetic examples built with crossover().
labels_to_augment = ['sadness', 'fear', 'surprise']

augmented_data = []

# Size of the largest class; used to measure how far each class is from it.
max_rows = train['label'].value_counts(normalize=False).max()

for label in labels_to_augment:

    augment_df = train[train['label'] == label]
    num_rows = len(augment_df)
    if num_rows < 2:
        # A crossover needs two distinct parent rows.
        continue

    rows_to_equality = max_rows - num_rows
    # NOTE(review): every pair yields two new rows, so this produces
    # 2 * num_pairs rows for the class — confirm that overshoot is intended.
    num_pairs = max(int(num_rows * 0.14), rows_to_equality)

    label_texts = augment_df['text'].tolist()

    for row in range(num_pairs):
        # num_pairs can exceed the number of available rows, so indices wrap
        # around instead of running past the end of the frame. On every full
        # pass the partner offset grows by one so the same pair is not simply
        # repeated; the offset stays in [1, num_rows - 1], so a row is never
        # paired with itself.
        first = row % num_rows
        shift = 1 + (row // num_rows) % (num_rows - 1)
        second = (first + shift) % num_rows

        for new_text in crossover(label_texts[first], label_texts[second]):
            # Store tokens as a list, like the rest of the 'text' column, and
            # record token_count so the later `token_count > 1` filter does
            # not discard the synthetic rows as NaN.
            new_tokens = new_text.split()
            augmented_data.append({
                'label': label,
                'text': new_tokens,
                'token_count': len(new_tokens),
            })

augmented_words = pd.DataFrame(augmented_data, columns=['label', 'text', 'token_count'])

In [None]:
# Display the synthetic rows produced by the augmentation step.
augmented_words

In [None]:
# Append the synthetic rows to the training frame; ignore_index renumbers the result.
# NOTE: re-running this cell appends the same synthetic rows again.
train = pd.concat([train, augmented_words], ignore_index=True)

In [None]:
# Keep only rows with more than one token. Rows whose token_count is missing
# (NaN) fail the comparison and are dropped as well.
train = train[train['token_count'] > 1]

In [None]:
# Relative frequency of each label in the training frame, rounded to 4 decimals.
train['label'].value_counts(normalize=True).round(4)

In [None]:
# Label names observed in the training split.
emotion_types = train['label'].unique().tolist()

# The encoder is fitted on the training labels only and then reused for every
# split, so all three frames share one label -> integer mapping.
label_encoder = LabelEncoder()
label_encoder.fit(emotion_types)

# NOTE: labels are overwritten in place, so this cell cannot be re-run without
# reloading the frames (the encoder would then see integers, not names).
for split in (train, validation, test):
    split['label'] = label_encoder.transform(split['label'])

In [None]:
class EmotionDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: indexable collection of texts; each item is converted with ``str``.
        labels: optional indexable collection of integer labels aligned with
            ``texts``. When omitted, items carry no ``labels`` entry.
        tokenizer: callable invoked with the text and truncation/padding
            options; must return a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: length every sequence is truncated or padded to.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=64):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        encoded = self.tokenizer(
            str(self.texts[index]),
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )

        # squeeze(0) removes the leading axis of the returned tensors
        # (presumably the size-1 batch axis added for a single text).
        item = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}

        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[index], dtype=torch.long)

        return item

In [None]:
# (rows, columns) of the training frame after augmentation and filtering.
train.shape

In [None]:
# Maximum, minimum and mean token_count over the training frame.
print(f"Max len = {np.max(train['token_count'])}\nMin len = {np.min(train['token_count'])}\nAvg len = {np.round(np.mean(train['token_count']), 2)}")

## Tokenization

The encoder loaded below is DeBERTa (`microsoft/deberta-base`). Like RoBERTa, it uses a BPE tokenizer, which handles rare words and complex language constructions better — useful for emotional text.


In [None]:
from transformers import AutoModel, AutoTokenizer

# Hugging Face checkpoint used for both the encoder and its tokenizer.
model_name = "microsoft/deberta-base"

# NOTE: the variable is named `bert`, but it holds whichever encoder
# `model_name` selects. The same instance is passed to several optimizers
# and to the training loop further down.
bert = AutoModel.from_pretrained(model_name).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def _as_dataset(frame):
    """Wrap a frame's 'text' and 'label' columns in an EmotionDataset padded to MAX_LEN."""
    return EmotionDataset(
        texts=frame['text'].tolist(),
        labels=frame['label'].tolist(),
        tokenizer=tokenizer,
        max_len=MAX_LEN,
    )


train_dataset = _as_dataset(train)
test_dataset = _as_dataset(test)
validation_dataset = _as_dataset(validation)

# Only the training split is shuffled; the evaluation splits keep their order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False)

> For regularization, we employ two commonly used techniques: dropout (Hinton et al., 2012) and L2 weight regularization. We apply dropout to prevent co-adaptation. In our model, we either apply dropout to word vectors before feeding the sequence of words into the convolutional layer or to the output of LSTM before the softmax layer. The L2 regularization is applied to the weight of the softmax layer.
>
> — quoted from https://arxiv.org/pdf/1511.08630

In [None]:
# 'balanced' class weights computed from the encoded training labels, then
# converted to a float32 tensor and moved to the compute device.
_label_ids = np.unique(train.label)
_balanced = compute_class_weight('balanced', classes=_label_ids, y=train.label)
class_weights = torch.tensor(_balanced, dtype=torch.float).to(DEVICE)

In [None]:
# Manual boost on top of the balanced weights.
# NOTE(review): index 5 is presumably the encoded id of one emotion class —
# verify against label_encoder.classes_.
# NOTE: the multiplication is in place, so re-running this cell compounds the factor.
# class_weights[3] *= 1.3
class_weights[5] *= 1.5

In [None]:
# Inspect the final per-class loss weights.
class_weights

In [None]:
# Weighted cross-entropy shared by every model trained in this notebook.
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

# GRU head; it is moved to the same device as the encoder.
gru_model = MyGRU(
    embedding_dim=128,
    hidden_dim=128,
    output_dim=6,
    num_layers=2,
    dropout=0.4,
    fc_dropout=0.3,
    bidirectional=True,
    input_dropout=0.3,
).to(DEVICE)

# Two parameter groups: a small learning rate for the encoder and a larger
# one for the head.
_gru_param_groups = [
    {'params': bert.parameters(), 'lr': 2e-5},  # 2e-5
    {'params': gru_model.parameters(), 'lr': 0.0002},
]
gru_optimizer = optim.Adam(_gru_param_groups, weight_decay=0.02)

# Halve the learning rate after 5 scheduler steps without improvement.
gru_scheduler = ReduceLROnPlateau(gru_optimizer, patience=5, factor=0.5, verbose=True)
# gru_scheduler = CosineAnnealingWarmRestarts(gru_optimizer, T_0=5, T_mult=2)

In [None]:
# LSTM head with its own optimizer; only the head's parameters are optimized here.
lstm_model = MyLSTM(
    embedding_dim=128,
    hidden_dim=128,
    output_dim=6,
    num_layers=2,
    dropout=0.5,
    bidirectional=True,
    fc_dropout=0.3,
    input_dropout=0.2,
).to(DEVICE)

# lr=2e-5 may be optimal for bert
lstm_optimizer = optim.Adam(
    lstm_model.parameters(),
    lr=2e-4,
    weight_decay=0.0001,
)

# Halve the learning rate after 3 scheduler steps without improvement.
lstm_scheduler = ReduceLROnPlateau(lstm_optimizer, patience=3, factor=0.5)

In [None]:
# Hybrid head, moved to the same device as the encoder.
hybrid_model = HybridNN().to(DEVICE)

# Use PyTorch's AdamW rather than the deprecated `transformers.AdamW`.
# eps=1e-6 keeps the default of the transformers implementation this replaces
# (torch defaults to 1e-8), so the update rule stays numerically the same.
hybrid_optimizer = optim.AdamW(
    [
        # Small learning rate for the pretrained encoder...
        {'params': bert.parameters(), 'lr': 1e-5},
        # ...and a larger one for the freshly initialised head.
        {'params': hybrid_model.parameters(), 'lr': 0.00005},
    ],
    weight_decay=0.05,
    eps=1e-6,
)

# hybrid_scheduler = ReduceLROnPlateau(hybrid_optimizer, patience=3, factor=1e-2)
# Cosine schedule with warm restarts: first cycle lasts 3 steps, each later
# cycle is twice as long as the previous one.
hybrid_scheduler = CosineAnnealingWarmRestarts(hybrid_optimizer, T_0=3, T_mult=2)

In [None]:
# NOTE(review): this sets an attribute on the config of an already
# instantiated model. Whether it changes the dropout modules actually used in
# the forward pass is not visible here — confirm it has the intended effect.
bert.config.dropout = 0.3

# for m in bert.modules():
#   for name, params in m.named_parameters():
#     print(name, params.requires_grad)

# freeze layers
# for param in bert.parameters():
#     param.requires_grad = False

# unfreeze last 2 layers
# for layer in bert.encoder.layer[-2:]:
#     for param in layer.parameters():
#         param.requires_grad = True

In [None]:
# --- Training configuration -------------------------------------------------
# Select which head / optimizer / scheduler triple the loop below trains.
EPOCHS = 10
MODEL = hybrid_model
OPTIMIZER = hybrid_optimizer
SCHEDULER = hybrid_scheduler

lr_history = []    # learning rate of the first param group after each epoch
loss_history = []  # mean validation loss of each epoch
# NOTE(review): save_weights=False presumably means no best weights are kept
# for a later restore — confirm against EarlyStopper.
early_stopper = EarlyStopper(patience=3, models=[bert, MODEL], min_delta=0.001, save_weights=False)

for epoch in range(EPOCHS):

    # ---- training pass: encoder and head are updated jointly ----
    bert.train()
    MODEL.train()
    running_loss = 0.0

    for batch in train_loader:
        OPTIMIZER.zero_grad()

        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        bert_output = bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = bert_output.last_hidden_state  # [batch_size, seq_len, 768]

        outputs = MODEL(embeddings)
        loss = loss_fn(outputs, labels)

        loss.backward()
        OPTIMIZER.step()

        running_loss += loss.item()

    # Mean of the per-batch losses.
    avg_train_loss = running_loss / len(train_loader)

    # ---- validation pass: no gradients, dropout disabled ----
    bert.eval()
    MODEL.eval()
    val_preds = []
    val_labels = []
    val_loss_total = 0.0

    with torch.no_grad():
        for batch in validation_loader:

            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            bert_output = bert(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = bert_output.last_hidden_state

            outputs = MODEL(embeddings)
            loss = loss_fn(outputs, labels)

            val_loss_total += loss.item()

            preds = torch.argmax(outputs, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss_total / len(validation_loader)
    loss_history.append(avg_val_loss)
    val_acc = accuracy_score(val_labels, val_preds)

    # ReduceLROnPlateau consumes the monitored metric. Epoch-based schedulers
    # such as CosineAnnealingWarmRestarts interpret a positional argument as
    # the epoch index, so passing the loss would set the LR from the loss
    # value; they must be stepped without an argument.
    if isinstance(SCHEDULER, ReduceLROnPlateau):
        SCHEDULER.step(avg_val_loss)
    else:
        SCHEDULER.step()
    current_lr = OPTIMIZER.param_groups[0]['lr']
    lr_history.append(current_lr)

    if early_stopper.early_stop(avg_val_loss):
        print(f"Early stopping at epoch {epoch + 1}")
        break

    print(f"Epoch {epoch + 1}/{EPOCHS} — Train Loss: {avg_train_loss:.4f} | "
          f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")


# Per-class metrics for the validation predictions of the last executed epoch.
class_report = classification_report(val_labels, val_preds, target_names=[str(i) for i in range(6)])
print(f"Classification Report:\n{class_report}")

print("Training finished")

In [None]:
# Restore the best checkpoint tracked by the early stopper, if one was kept.
# best_weights is indexed in the order [encoder, head].
if early_stopper.best_weights is not None:
    bert.load_state_dict(early_stopper.best_weights[0])
    MODEL.load_state_dict(early_stopper.best_weights[1])
    print("Best weights loaded after training")
else:
    print("No best weights were saved")

MODEL.eval()
bert.eval()

test_preds = []
test_labels = []
test_loss_total = 0.0

# Evaluate on the held-out test split without tracking gradients.
with torch.no_grad():

    for batch in test_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        bert_output = bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = bert_output.last_hidden_state

        outputs = MODEL(embeddings)
        loss = loss_fn(outputs, labels)
        test_loss_total += loss.item()

        preds = torch.argmax(outputs, dim=1)

        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())


# Mean of the per-batch losses and overall accuracy on the test split.
avg_test_loss = test_loss_total / len(test_loader)
test_acc = accuracy_score(test_labels, test_preds)

# These numbers come from test_loader, so they are reported as test metrics
# (the message previously labelled them "Validation").
print(f"Test Loss: {avg_test_loss:.4f} | Test Accuracy: {test_acc:.4f}")

class_report = classification_report(test_labels, test_preds, target_names=[str(i) for i in range(6)])

print(f"Classification Report:\n{class_report}")

In [None]:
# Learning rate recorded after each epoch, plotted against the 1-based epoch number.
fig, ax = plt.subplots(figsize=(8, 5))

epoch_numbers = range(1, len(lr_history) + 1)
ax.plot(epoch_numbers, lr_history)
ax.set_xlabel("Epoch")
ax.set_ylabel("Learning Rate")
ax.set_title("Learning Rate Schedule")
ax.grid(True)
plt.show()

In [None]:
# Validation loss of each epoch against the learning rate recorded for that epoch.
fig, ax = plt.subplots(figsize=(8, 5))

ax.plot(lr_history, loss_history, marker='o')
ax.set_xlabel("Learning Rate")
ax.set_ylabel("Validation Loss")
ax.set_title("Validation Loss vs Learning Rate")
ax.grid(True)

plt.show()