# Kaggle competition

New Data available, in different format so we need to repreprocess everything

# 0 - Imports

In [1]:
import pandas as pd
import numpy as np
import torch

# I - Data Preprocessing

In [2]:
# df_train = pd.read_csv('Lexical_juggling_train.csv')
df_train = pd.read_csv('Lexical_juggling_train.csv')
df_train.dropna(subset=['Label'], inplace=True)
labels_with_multiple_rows = df_train['Label'].value_counts()
labels_to_keep = labels_with_multiple_rows[labels_with_multiple_rows > 1].index
df_train = df_train[df_train['Label'].isin(labels_to_keep)]
df_train['Text'] = df_train['Text'].astype(str)
df_train['Label'] = df_train['Label'].astype(str)
df_train.head(2)

Unnamed: 0,ID,Usage,Text,Label
0,136,Public,Finalment Atena le recibe en l'acropoli d'Ate...,arg
1,62,Public,Jane Laffort fille de Joseph Laffort et d' Ang...,lat


In [3]:
df_test = pd.read_csv('test_without_labels.csv')
df_test.head(2)

Unnamed: 0,ID,Usage,Text
0,55,Private,Ponovo dobija riječni oblik do Drežnice.
1,71,Private,Se formaron aproximadamente hace apenas unos 1...


In [4]:
print("Train Shape = ",df_train.shape)
print("Test shape = ",df_test.shape)
print("List labels length = ", len(df_train['Label'].unique()))

Train Shape =  (77900, 4)
Test shape =  (38827, 3)
List labels length =  389


Ok - So we have 38K sentences in different languages, to classify in 390 categories. If the class is balanced, this would represent a 100:1 ratio, so ok to train without generating new sentences I assume. So first baseline algo will just train an NLP classifier on train dataset, and then use the test dataset to see how good it actually is.

In [5]:
df_lab = pd.DataFrame(df_train)

label_counts = df_lab["Label"].value_counts().reset_index()
label_counts.columns = ["Label", "Count"]
print(label_counts)

    Label  Count
0     tgk    300
1     arg    200
2     san    200
3     kon    200
4     wal    200
..    ...    ...
384   hus    200
385   sun    200
386   mlg    200
387   kir    200
388   toi    200

[389 rows x 2 columns]


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from datasets import Dataset
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score

train_df, test_df = train_test_split(df_train, test_size=0.2, stratify=df_train["Label"], random_state=42)
labels = sorted(train_df["Label"].unique())
label2id = {label: i for i, label in enumerate(labels)}
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
def map_labels(example):
    example["labels"] = label2id[example["Label"]]
    return example
train_dataset = train_dataset.map(map_labels)
test_dataset = test_dataset.map(map_labels)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_function(example):
    return tokenizer(example["Text"], truncation=True, padding="max_length", max_length=128)
train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=1)
test_dataset = test_dataset.map(tokenize_function, batched=True, num_proc=1)
columns = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format(type="torch", columns=columns)
test_dataset.set_format(type="torch", columns=columns)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)
num_labels = len(label2id)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
for param in model.bert.parameters():
    param.requires_grad = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    step = 0
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        total_loss += loss.item()
        step += 1
        progress_bar.update(1)
        if step % 10 == 0:
            print(f"Epoch {epoch+1} Step {step} Loss {loss.item():.4f}")
    avg_loss = total_loss / step
    print(f"Epoch {epoch+1} Average Training Loss: {avg_loss:.4f}")
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())
    acc = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1} Test Accuracy: {acc:.4f}")


Map: 100%|██████████| 62320/62320 [00:05<00:00, 11201.24 examples/s]
Map: 100%|██████████| 15580/15580 [00:01<00:00, 12448.38 examples/s]
Map: 100%|██████████| 62320/62320 [00:07<00:00, 7931.05 examples/s]
Map: 100%|██████████| 15580/15580 [00:01<00:00, 8741.58 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 23%|██▎       | 2721/11685 [16:32<54:30,  2.74it/s]
  0%|          | 10/11685 [00:03<55:52,  3.48it/s] 

Epoch 1 Step 10 Loss 6.0131


  0%|          | 20/11685 [00:05<54:55,  3.54it/s]

Epoch 1 Step 20 Loss 5.9816


  0%|          | 30/11685 [00:08<54:45,  3.55it/s]

Epoch 1 Step 30 Loss 6.0841


  0%|          | 40/11685 [00:11<55:09,  3.52it/s]

Epoch 1 Step 40 Loss 6.2577


  0%|          | 50/11685 [00:14<54:48,  3.54it/s]

Epoch 1 Step 50 Loss 6.1158


  1%|          | 60/11685 [00:17<54:50,  3.53it/s]

Epoch 1 Step 60 Loss 6.0773


  1%|          | 70/11685 [00:20<54:58,  3.52it/s]

Epoch 1 Step 70 Loss 6.0502


  1%|          | 80/11685 [00:23<55:17,  3.50it/s]

Epoch 1 Step 80 Loss 6.1022


  1%|          | 90/11685 [00:25<55:14,  3.50it/s]

Epoch 1 Step 90 Loss 6.0455


  1%|          | 100/11685 [00:28<54:48,  3.52it/s]

Epoch 1 Step 100 Loss 6.0210


  1%|          | 110/11685 [00:31<54:54,  3.51it/s]

Epoch 1 Step 110 Loss 6.2500


  1%|          | 120/11685 [00:34<54:58,  3.51it/s]

Epoch 1 Step 120 Loss 5.9826


  1%|          | 130/11685 [00:37<54:45,  3.52it/s]

Epoch 1 Step 130 Loss 6.1270


  1%|          | 134/11685 [00:38<54:45,  3.52it/s]

KeyboardInterrupt: 

# II - Bert

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification,DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm import tqdm

# Assuming df_train is your dataset

# Step 1: Preprocessing and Splitting the Data
class ProportionalSplitter:
    @staticmethod
    def stratified_split(df, test_size=0.2):
        train, test = train_test_split(
            df, 
            test_size=test_size, 
            stratify=df['Label'], 
            random_state=42
        )
        return train, test

data = df_train.copy()
train_df, test_df = ProportionalSplitter.stratified_split(data)

# Encode labels
label_encoder = LabelEncoder()
train_df['Label'] = label_encoder.fit_transform(train_df['Label'])
test_df['Label'] = label_encoder.transform(test_df['Label'])

num_labels = len(label_encoder.classes_)

# Step 2: Define Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Step 3: Load Pretrained BERT Model and Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

# Prepare datasets
train_dataset = TextDataset(
    texts=train_df['Text'].tolist(), 
    labels=train_df['Label'].tolist(), 
    tokenizer=tokenizer, 
    max_length=80
)

test_dataset = TextDataset(
    texts=test_df['Text'].tolist(), 
    labels=test_df['Label'].tolist(), 
    tokenizer=tokenizer, 
    max_length=80
)

# Step 4: Define DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

for i, layer in enumerate(model.bert.encoder.layer):
    if i < 10:
        for param in layer.parameters():
            param.requires_grad = False

# Step 5: Define Training and Evaluation Loops
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    loop = tqdm(dataloader, desc="Training", leave=True)

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
        loop.set_postfix(loss=loss.item(), accuracy=correct/total)

    accuracy = correct / total
    return total_loss / len(dataloader), accuracy

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    return total_loss / len(dataloader), accuracy

# Step 6: Training Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Training Loop
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, test_loader, criterion, device)

    print(f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f}")
    print(f"Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_acc:.4f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

KeyboardInterrupt: 

In [6]:
torch.save(model.state_dict(), "bert.pth")
print("Model saved as bert.pth")

Model saved as bert.pth


# III - Predictions

In [7]:
model.load_state_dict(torch.load("bert.pth"))
model.to(device)
model.eval()

# Add predicted labels to df_test
def predict_label(texts, model, tokenizer, device):
    model.eval()
    predictions = []

    with torch.no_grad():
        for text in texts:
            encoding = tokenizer(
                text,
                max_length=128,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            pred = torch.argmax(logits, dim=1).item()
            predictions.append(pred)

    return predictions

  model.load_state_dict(torch.load("bert.pth"))


In [8]:
# Predict and add to df_test
df_test['Label'] = predict_label(df_test['Text'].tolist(), model, tokenizer, device)
df_test['Label'] = label_encoder.inverse_transform(df_test['Label'])
print("Predicted labels added to df_test.")
df_test.head()

Predicted labels added to df_test.


Unnamed: 0,ID,Usage,Text,Label
0,55,Private,Ponovo dobija riječni oblik do Drežnice.,hrv
1,71,Private,Se formaron aproximadamente hace apenas unos 1...,spa
2,67,Private,Data juga harus terbebas dari kepentingan-kepe...,mad
3,107,Private,ᐃᒃᓯᕙᐅᑕᖅ (ᑐᓵᔨᑎᒍᑦ): ᖁᔭᓐᓇᒦᒃ ᒥᔅ ᐅᐃᓐᒥᐅᓪ. ᒥᔅᑕ ᐃᓄᒃ.,iku
4,129,Private,Bei Gefor rullt de Kéiseker sech an riicht se...,ltz


In [9]:
df_test.to_csv("Submission_louis.csv")