In [1]:
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from transformers import (
    DistilBertTokenizerFast, DistilBertForSequenceClassification,
    RobertaTokenizerFast, RobertaForSequenceClassification,
    XLNetTokenizer, XLNetForSequenceClassification,
    ElectraTokenizer, ElectraForSequenceClassification,
    DebertaTokenizer, DebertaForSequenceClassification
)
from collections import defaultdict
import numpy as np
import torch.nn.functional as F
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from scipy.stats import mode
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
from sklearn.exceptions import ConvergenceWarning
import os
warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [2]:
root = os.getcwd()
train_path = os.path.join(root, 'training.csv')
val_path = os.path.join(root, 'validation.csv')
test_path = os.path.join(root, 'test.csv')
unlabeled_path = os.path.join(root, 'chunked_unlabeled_texts.csv')
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)
unlabeled_df = pd.read_csv(unlabeled_path)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [28]:
distilbert_weight = os.path.join(root, 'weight', 'DistilBERT_best_model_state.bin')
roberta_weight = os.path.join(root, 'weight', 'RoBERTa_best_model_state.bin')
xlnet_weight = os.path.join(root, 'weight', 'XLNet_best_model_state.bin')
electra_weight = os.path.join(root, 'weight', 'ELECTRA_best_model_state.bin')
deberta_weight = os.path.join(root, 'weight', 'DeBERTa_best_model_state.bin')

In [7]:
distilbert_weight_finetune = os.path.join(root, 'finetune_unlabel_weight', 'DistilBERT_best_model_finetune.bin')
roberta_weight_finetune = os.path.join(root, 'finetune_unlabel_weight', 'RoBERTa_best_model_finetune.bin')
xlnet_weight_finetune = os.path.join(root, 'finetune_unlabel_weight', 'XLNet_best_model_finetune.bin')
electra_weight_finetune = os.path.join(root, 'finetune_unlabel_weight', 'ELECTRA_best_model_finetune.bin')
deberta_weight_finetune = os.path.join(root, 'finetune_unlabel_weight', 'DeBERTa_best_model_finetune.bin')

In [8]:
models_tokenizers = {
    "DistilBERT": (DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased'),
                   DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3),
                   distilbert_weight_finetune),
    "RoBERTa": (RobertaTokenizerFast.from_pretrained('roberta-base'),
                RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3),
                roberta_weight_finetune),
    "XLNet": (XLNetTokenizer.from_pretrained('xlnet-base-cased'),
              XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=3),
              xlnet_weight_finetune),
    "ELECTRA": (ElectraTokenizer.from_pretrained('google/electra-base-discriminator'),
                ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator', num_labels=3),
                electra_weight_finetune),
    "DeBERTa": (DebertaTokenizer.from_pretrained('microsoft/deberta-base'),
                DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=3),
                deberta_weight_finetune)
}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this mod

In [None]:
train_texts = train_df['texts']
train_labels = train_df['labels']
val_texts = val_df['texts']
val_labels = val_df['labels']


# Finetune LLM

In [8]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def train_epoch(model, data_loader, optimizer, device, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        loss = F.cross_entropy(outputs.logits, labels)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(model, data_loader, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            loss = F.cross_entropy(outputs.logits, labels)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)

def train_and_predict(model_name, tokenizer, model, train_texts, train_labels, val_texts, val_labels, test_texts):
    print(f"Training and predicting with {model_name}...")
    
    train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_len=32)
    val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_len=32)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    model = model.to(device) 
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    
    best_accuracy = 0
    for epoch in range(50):  
        train_acc, train_loss = train_epoch(model, train_loader, optimizer, device, len(train_dataset))
        val_acc, val_loss = eval_model(model, val_loader, device, len(val_dataset))
        print(f'Epoch {epoch + 1} - {model_name}: Train Acc {train_acc}, Val Acc {val_acc}')
        
        if val_acc > best_accuracy:
            torch.save(model.state_dict(), f'{model_name}_best_model_state.bin')
            best_accuracy = val_acc
    
    model.load_state_dict(torch.load(f"{model_name}_best_model_state.bin"))
    model.eval()
    
    test_dataset = CustomDataset(test_texts, labels=[0] * len(test_texts), tokenizer=tokenizer, max_len=32)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    
    test_preds = []
    with torch.no_grad():
        for d in test_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            test_preds.extend(preds.cpu().numpy())
    
    test_df[f"{model_name}_predictions"] = test_preds

In [None]:
test_texts = test_df["texts"].tolist()
for model_name, (tokenizer, model) in models_tokenizers.items():
    train_and_predict(model_name, tokenizer, model, train_texts, train_labels, val_texts, val_labels, test_texts)

test_df.to_csv("all_model_predictions.csv", index=False)
print("All model predictions have been saved to 'all_model_predictions.csv'")

# Inference finetuned models

In [12]:
test_df

Unnamed: 0,texts
0,4114 1298 322 76 395 4024 20 2598 3977 20 20 6...
1,4114 1298 1297 674 1838 4116 61 1289 84 1445 4...
2,96 1707 1295 3457 468 2348 804 1846 4134
3,4132 146 1839 3413 3977
4,3352 1001 32 1999 3 74 220 20 1309 390 36 4129
...,...
295,4114 1298 1846 4134 395 395 395
296,2583 3352 1001 20 51 1846 250 2718
297,4114 1298 4114 1298 3907 1101 3403 4119 4116 17
298,337 1838 4116 36 1935


In [None]:
import numpy as np

test_df['texts'] = test_df['texts'].fillna("<EMPTY>")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def get_model_predictions(model, tokenizer, texts, model_name):
    predictions = []
    model.to(device)
    model.eval()
    
    if isinstance(texts, pd.Series):
        texts = texts.astype(str).tolist()  
    
    with torch.no_grad():
        for text in texts:
            if not isinstance(text, str):
                text = str(text)  
            
            encoding = tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=128,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            predictions.append(preds.item())
    
    return predictions

for model_name, (tokenizer, model, weight_path) in models_tokenizers.items():
    model.load_state_dict(torch.load(weight_path, map_location=device))
    column_name = f"{model_name}_predictions"
    test_df[column_name] = get_model_predictions(model, tokenizer, test_df['texts'], model_name)

test_df['labels'] = test_df[
    [f"{model_name}_predictions" for model_name in models_tokenizers.keys()]
].mode(axis=1)[0]
test_df['labels'] = test_df['labels'].astype(int)


  model.load_state_dict(torch.load(weight_path, map_location=device))
  model.load_state_dict(torch.load(weight_path, map_location=device))
  model.load_state_dict(torch.load(weight_path, map_location=device))
  model.load_state_dict(torch.load(weight_path, map_location=device))
  model.load_state_dict(torch.load(weight_path, map_location=device))
