# Kaggle competition

New Data available, in different format so we need to repreprocess everything

# 0 - Imports

In [2]:
import pandas as pd
import numpy as np
import torch

# I - Data Preprocessing

In [19]:
df_train = pd.read_csv('train_submission.csv')
df_train.dropna(subset=['Label'], inplace=True)
labels_with_multiple_rows = df_train['Label'].value_counts()
labels_to_keep = labels_with_multiple_rows[labels_with_multiple_rows > 1].index
df_train = df_train[df_train['Label'].isin(labels_to_keep)]
df_train.head(2)

Unnamed: 0,ID,Usage,Text,Label
0,136,Public,Finalment Atena le recibe en l'acropoli d'Ate...,arg
1,62,Public,Jane Laffort fille de Joseph Laffort et d' Ang...,lat


In [20]:
df_test = pd.read_csv('test_without_labels.csv')
df_test.head(2)

Unnamed: 0,ID,Usage,Text
0,55,Private,Ponovo dobija riječni oblik do Drežnice.
1,71,Private,Se formaron aproximadamente hace apenas unos 1...


In [22]:
print("Train Shape = ",df_train.shape)
print("Test shape = ",df_test.shape)
print("List labels length = ", len(df_train['Label'].unique()))

Train Shape =  (38750, 4)
Test shape =  (38827, 3)
List labels length =  385


Ok - So we have 38K sentences in different languages, to classify in 390 categories. If the class is balanced, this would represent a 100:1 ratio, so ok to train without generating new sentences I assume. So first baseline algo will just train an NLP classifier on train dataset, and then use the test dataset to see how good it actually is.

# II - Bert

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Assuming df_train is your dataset

# Step 1: Preprocessing and Splitting the Data
class ProportionalSplitter:
    @staticmethod
    def stratified_split(df, test_size=0.2):
        train, test = train_test_split(
            df, 
            test_size=test_size, 
            stratify=df['Label'], 
            random_state=42
        )
        return train, test

data = df_train.copy()
train_df, test_df = ProportionalSplitter.stratified_split(data)

# Encode labels
label_encoder = LabelEncoder()
train_df['Label'] = label_encoder.fit_transform(train_df['Label'])
test_df['Label'] = label_encoder.transform(test_df['Label'])

num_labels = len(label_encoder.classes_)

# Step 2: Define Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Step 3: Load Pretrained BERT Model and Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Prepare datasets
train_dataset = TextDataset(
    texts=train_df['Text'].tolist(), 
    labels=train_df['Label'].tolist(), 
    tokenizer=tokenizer, 
    max_length=128
)

test_dataset = TextDataset(
    texts=test_df['Text'].tolist(), 
    labels=test_df['Label'].tolist(), 
    tokenizer=tokenizer, 
    max_length=128
)

# Step 4: Define DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Step 5: Define Training and Evaluation Loops
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    accuracy = correct / total
    return total_loss / len(dataloader), accuracy

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    return total_loss / len(dataloader), accuracy

# Step 6: Training Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Training Loop
num_epochs = 3
for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, test_loader, criterion, device)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f}")
    print(f"Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_acc:.4f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1/3
Train Loss: 4.5484 | Train Accuracy: 0.1628
Validation Loss: 3.2984 | Validation Accuracy: 0.3406
Epoch 2/3
Train Loss: 2.6793 | Train Accuracy: 0.4285
Validation Loss: 2.0860 | Validation Accuracy: 0.5261
Epoch 3/3
Train Loss: 1.7161 | Train Accuracy: 0.6006
Validation Loss: 1.4894 | Validation Accuracy: 0.6354
