# Kaggle competition

New data is available in a different format, so we need to re-preprocess everything.

# 0 - Imports

In [15]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# I - Data Preprocessing

In [16]:
# Other training files (disabled):
# df_train = pd.read_csv('Lexical_juggling_train.csv')
# df_train = pd.read_csv('train_submission.csv')

# Load the 'Data' sheet and discard rows that have no label.
df_train = (
    pd.read_excel('train_augmented.xlsx', sheet_name='Data')
    .dropna(subset=['Label'])
)

# Keep only labels that occur more than once: the stratified splits used
# later need at least two rows per class.
labels_with_multiple_rows = df_train['Label'].value_counts()
labels_to_keep = labels_with_multiple_rows[labels_with_multiple_rows > 1].index

# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below do not write into a slice (SettingWithCopyWarning).
df_train = df_train[df_train['Label'].isin(labels_to_keep)].copy()
df_train['Text'] = df_train['Text'].astype(str)
df_train['Label'] = df_train['Label'].astype(str)
df_train.head(2)

Unnamed: 0,Text,Label
0,Finalment Atena le recibe en l'acropoli d'Ate...,arg
1,Jane Laffort fille de Joseph Laffort et d' Ang...,lat


In [17]:
# Load the test file and preview its first two rows.
df_test = pd.read_csv(filepath_or_buffer='test_without_labels.csv')
df_test.head(n=2)

Unnamed: 0,ID,Usage,Text
0,55,Private,Ponovo dobija riječni oblik do Drežnice.
1,71,Private,Se formaron aproximadamente hace apenas unos 1...


In [18]:
# Quick size check: rows/columns of both frames and the number of distinct labels.
print("Train Shape = ", df_train.shape)
print("Test shape = ", df_test.shape)
print(
    "List labels length = ",
    df_train['Label'].nunique(dropna=False),  # same count as len(unique())
)
# print(df_train['Label'].unique())

Train Shape =  (41149, 2)
Test shape =  (38827, 3)
List labels length =  385


OK — so we have about 41K training sentences (and about 38K test sentences) in different languages, to classify into 385 categories. If the classes are balanced, this represents roughly a 100:1 ratio of sentences to classes, so I assume it is fine to train without generating new sentences. The first baseline will therefore just train an NLP classifier on the training dataset, and then use a held-out test split to see how good it actually is.

In [19]:
df_lab = pd.DataFrame(df_train)

# Number of rows per label as a two-column table ("Label", "Count").
label_counts = (
    df_lab["Label"]
    .value_counts()
    .rename_axis("Label")
    .reset_index(name="Count")
)
print(label_counts)

    Label  Count
0     tgk    300
1     teo    205
2     wbm    203
3     hin    200
4     tat    200
..    ...    ...
380   xho    100
381   yao    100
382   hus    100
383   kau    100
384   ceb     99

[385 rows x 2 columns]


# II - Baseline Model

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

class TqdmTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer that shows a tqdm bar while the input documents are collected.

    The bar advances as the documents are pulled into a list, i.e. before the
    parent class starts vectorising, so it reports iteration over the input
    rather than the TF-IDF computation itself.
    """

    @staticmethod
    def _collect(raw_documents, desc):
        """Return ``raw_documents`` as a list, iterating through a tqdm bar labelled ``desc``."""
        return [document for document in tqdm(raw_documents, desc=desc)]

    def fit_transform(self, raw_documents, y=None):
        """Collect the documents with a progress bar, then defer to the parent ``fit_transform``."""
        collected = self._collect(raw_documents, "TFIDF Fit Transform")
        return super().fit_transform(collected, y)

    def transform(self, raw_documents):
        """Collect the documents with a progress bar, then defer to the parent ``transform``."""
        collected = self._collect(raw_documents, "TFIDF Transform")
        return super().transform(collected)

# Assume df_train is your DataFrame with columns 'Text' and 'Label'
X = df_train['Text']
y = df_train['Label']

# Hold out 20% of the rows; stratify on the label so the class proportions
# are preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# TF-IDF features (vocabulary capped at 10,000 terms, terms must appear in at
# least 5 documents and in at most 80% of them) fed to a logistic regression.
pipeline = Pipeline([
    ('tfidf', TqdmTfidfVectorizer(max_features=10000, min_df=5, max_df=0.8)),
    ('clf', LogisticRegression(solver='saga', max_iter=100, verbose=1))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Some classes are never predicted, which makes their precision undefined.
# zero_division=0 reports those scores as 0 without emitting one
# UndefinedMetricWarning per averaged metric.
print(classification_report(y_test, y_pred, zero_division=0))


TFIDF Fit Transform: 100%|██████████| 32919/32919 [00:00<00:00, 1709810.08it/s]


convergence after 20 epochs took 18 seconds


TFIDF Transform: 100%|██████████| 8230/8230 [00:00<00:00, 525509.19it/s]


              precision    recall  f1-score   support

         abk       1.00      0.20      0.33        20
         ace       1.00      0.90      0.95        20
         ach       0.84      0.97      0.90        38
         acm       0.25      0.10      0.14        20
         acr       1.00      0.85      0.92        20
         ada       0.95      0.90      0.93        21
         afb       0.18      0.10      0.13        20
         afr       0.68      0.85      0.76        20
         ahk       0.95      1.00      0.98        20
         ajp       0.19      0.25      0.22        20
         aka       0.64      0.70      0.67        20
         aln       0.50      0.55      0.52        20
         als       0.33      0.25      0.29        20
         alt       1.00      0.70      0.82        20
         amh       1.00      0.50      0.67        20
         aoj       1.00      1.00      1.00        20
         apc       0.24      0.25      0.24        20
         ara       0.30    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# III - Bert

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Assuming df_train is your dataset

# Step 1: Preprocessing and Splitting the Data
class ProportionalSplitter:
    """Namespace for splitting a labelled DataFrame while keeping class proportions."""

    @staticmethod
    def stratified_split(df, test_size=0.2, label_col='Label', random_state=42):
        """Split ``df`` into train and test parts, stratified on a label column.

        Args:
            df: DataFrame containing the column named by ``label_col``.
            test_size: Passed through to ``train_test_split``.
            label_col: Column whose values drive the stratification
                (defaults to ``'Label'``, the previously hard-coded column).
            random_state: Seed passed through to ``train_test_split``
                (defaults to ``42``, the previously hard-coded seed).

        Returns:
            A ``(train, test)`` tuple as produced by ``train_test_split``.
        """
        train, test = train_test_split(
            df,
            test_size=test_size,
            stratify=df[label_col],
            random_state=random_state,
        )
        return train, test

# Work on a copy so the steps below never modify df_train itself.
data = df_train.copy()
train_df, test_df = ProportionalSplitter.stratified_split(data)

# Encode labels
# Fit on every label in the data (not only the training part) so that
# transforming the held-out part cannot fail on a label unseen in training.
label_encoder = LabelEncoder()
label_encoder.fit(data['Label'])

# .assign returns new frames instead of writing into the objects returned by
# the split, which can trigger pandas' SettingWithCopyWarning.
train_df = train_df.assign(Label=label_encoder.transform(train_df['Label']))
test_df = test_df.assign(Label=label_encoder.transform(test_df['Label']))

num_labels = len(label_encoder.classes_)

# Step 2: Define Dataset Class
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per access and attaches its label.

    Args:
        texts: Sequence of input texts.
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``;
            expected to return a mapping of tensors with a leading batch
            dimension of 1.
        max_length: Length forwarded to the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Materialise as plain lists so __getitem__ always indexes by position.
        # A pandas Series with a shuffled index (e.g. after a train/test split)
        # would otherwise be looked up by index label.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return it with its label.

        Returns:
            Dict holding every tokenizer output with its leading dimension
            squeezed away, plus ``'labels'`` as a ``torch.long`` tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer returns (1, max_length) tensors; drop the batch axis so
        # the DataLoader can stack items itself.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Step 3: Load Pretrained BERT Model and Tokenizer
# NOTE(review): 'bert-base-uncased' is presumably an English-only checkpoint;
# confirm it is the intended choice for this many-language dataset.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)

# Prepare datasets (every text is padded/truncated to 128 tokens)
train_dataset = TextDataset(
    train_df['Text'].tolist(),
    train_df['Label'].tolist(),
    tokenizer,
    128,
)
test_dataset = TextDataset(
    test_df['Text'].tolist(),
    test_df['Label'].tolist(),
    tokenizer,
    128,
)

# Step 4: Define DataLoaders
# Only the training batches are shuffled; the held-out loader keeps its order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

# Step 5: Define Training and Evaluation Loops
def train(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        dataloader: Sized iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        criterion: Accepted for call-site compatibility but not used; the loss
            comes from the model output.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean loss per batch, fraction of correctly predicted samples)``.
    """
    model.train()
    running_loss, n_correct, n_seen = 0, 0, 0

    for step_batch in dataloader:
        ids = step_batch['input_ids'].to(device)
        mask = step_batch['attention_mask'].to(device)
        targets = step_batch['labels'].to(device)

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        result = model(ids, attention_mask=mask, labels=targets)

        result.loss.backward()
        optimizer.step()

        running_loss += result.loss.item()
        n_correct += result.logits.argmax(dim=1).eq(targets).sum().item()
        n_seen += targets.size(0)

    accuracy = n_correct / n_seen
    return running_loss / len(dataloader), accuracy

def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        dataloader: Sized iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        criterion: Accepted for call-site compatibility but not used; the loss
            comes from the model output.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean loss per batch, fraction of correctly predicted samples)``.
    """
    model.eval()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    # Gradients are not needed for scoring, so skip tracking them.
    with torch.no_grad():
        for eval_batch in dataloader:
            ids = eval_batch['input_ids'].to(device)
            mask = eval_batch['attention_mask'].to(device)
            targets = eval_batch['labels'].to(device)

            result = model(ids, attention_mask=mask, labels=targets)

            running_loss += result.loss.item()
            n_correct += result.logits.argmax(dim=1).eq(targets).sum().item()
            n_seen += targets.size(0)

    accuracy = n_correct / n_seen
    return running_loss / len(dataloader), accuracy

# Step 6: Training Setup
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)
# Passed to train()/evaluate() below; the loss itself comes from the model output.
criterion = torch.nn.CrossEntropyLoss()

# Training Loop
num_epochs = 10
for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, test_loader, criterion, device)

    # One three-line report per epoch.
    print(
        f"Epoch {epoch + 1}/{num_epochs}",
        f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f}",
        f"Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_acc:.4f}",
        sep="\n",
    )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Train Loss: 4.5153 | Train Accuracy: 0.1794
Validation Loss: 3.1784 | Validation Accuracy: 0.3425


In [49]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm import tqdm

# Work on a copy so the split/encoding steps below never modify df_train itself.
data = df_train.copy()

def stratified_split(df, test_size=0.2, label_col='Label', random_state=42):
    """Split ``df`` into train and test parts, stratified on a label column.

    Args:
        df: DataFrame containing the column named by ``label_col``.
        test_size: Passed through to ``train_test_split``.
        label_col: Column whose values drive the stratification
            (defaults to ``'Label'``, the previously hard-coded column).
        random_state: Seed passed through to ``train_test_split``
            (defaults to ``42``, the previously hard-coded seed).

    Returns:
        The train/test pair exactly as returned by ``train_test_split``.
    """
    return train_test_split(
        df,
        test_size=test_size,
        stratify=df[label_col],
        random_state=random_state,
    )

train_df, test_df = stratified_split(data)

# Fit the encoder on every label in the data (not only the training part) so
# that transforming the held-out part cannot fail on a label unseen in training.
label_encoder = LabelEncoder()
label_encoder.fit(data['Label'])

# .assign returns new frames instead of writing into the objects returned by
# the split, which can trigger pandas' SettingWithCopyWarning.
train_df = train_df.assign(Label=label_encoder.transform(train_df['Label']))
test_df = test_df.assign(Label=label_encoder.transform(test_df['Label']))

num_labels = len(label_encoder.classes_)

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per access and attaches its label.

    Args:
        texts: Sequence of input texts.
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``;
            expected to return a mapping of tensors with a leading batch
            dimension of 1.
        max_length: Length forwarded to the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Materialise as plain lists so __getitem__ always indexes by position.
        # A pandas Series with a shuffled index (e.g. after a train/test split)
        # would otherwise be looked up by index label.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return it with its label.

        Returns:
            Dict holding every tokenizer output with its leading dimension
            squeezed away, plus ``'labels'`` as a ``torch.long`` tensor.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # The tokenizer returns (1, max_length) tensors; drop the batch axis so
        # the DataLoader can stack items itself.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# NOTE(review): 'bert-base-uncased' is presumably an English-only checkpoint;
# confirm it is the intended choice for this many-language dataset.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)

# Every text is padded/truncated to 128 tokens.
train_dataset = TextDataset(
    texts=train_df['Text'].tolist(),
    labels=train_df['Label'].tolist(),
    tokenizer=tokenizer,
    max_length=128,
)
test_dataset = TextDataset(
    texts=test_df['Text'].tolist(),
    labels=test_df['Label'].tolist(),
    tokenizer=tokenizer,
    max_length=128,
)

# Only the training batches are shuffled; the held-out loader keeps its order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

def train(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader`` with a tqdm progress bar.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        dataloader: Sized iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean loss per batch, fraction of correctly predicted samples)``.
    """
    model.train()
    loss_sum, hits, seen = 0, 0, 0
    progress = tqdm(dataloader, desc="Training", leave=True, position=0, ncols=100)

    for step_batch in progress:
        ids = step_batch['input_ids'].to(device)
        mask = step_batch['attention_mask'].to(device)
        targets = step_batch['labels'].to(device)

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        result = model(ids, attention_mask=mask, labels=targets)
        result.loss.backward()
        optimizer.step()

        batch_loss = result.loss.item()
        loss_sum += batch_loss
        hits += result.logits.argmax(dim=1).eq(targets).sum().item()
        seen += targets.size(0)

        # Show the latest batch loss and the running accuracy on the bar.
        progress.set_postfix(loss=batch_loss, accuracy=hits / seen)

    return loss_sum / len(dataloader), hits / seen

def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` with a tqdm progress bar, without updating weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        dataloader: Sized iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean loss per batch, fraction of correctly predicted samples)``.
    """
    model.eval()
    loss_sum, hits, seen = 0, 0, 0
    progress = tqdm(dataloader, desc="Evaluating", leave=True, position=0, ncols=100)

    # Gradients are not needed for scoring, so skip tracking them.
    with torch.no_grad():
        for eval_batch in progress:
            ids = eval_batch['input_ids'].to(device)
            mask = eval_batch['attention_mask'].to(device)
            targets = eval_batch['labels'].to(device)

            result = model(ids, attention_mask=mask, labels=targets)

            batch_loss = result.loss.item()
            loss_sum += batch_loss
            hits += result.logits.argmax(dim=1).eq(targets).sum().item()
            seen += targets.size(0)

            # Show the latest batch loss and the running accuracy on the bar.
            progress.set_postfix(loss=batch_loss, accuracy=hits / seen)

    return loss_sum / len(dataloader), hits / seen

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)
num_epochs = 10

for epoch in range(num_epochs):
    # The header is printed first so the progress bars appear under their epoch.
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss, train_acc = train(model, train_loader, optimizer, device)
    val_loss, val_acc = evaluate(model, test_loader, device)
    print(
        f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f}",
        f"Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_acc:.4f}",
        sep="\n",
    )


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10


Training:   0%|                           | 3/2058 [00:29<5:41:20,  9.97s/it, accuracy=0, loss=5.96]


KeyboardInterrupt: 

In [None]:
# Persist only the parameters (the state_dict), not the model object itself.
torch.save(obj=model.state_dict(), f="bert.pth")
print("Model saved as bert.pth")

Model saved as bert.pth


# IV - Predictions

In [7]:
# Restore the weights written to bert.pth.
# map_location=device lets a checkpoint saved on a GPU machine load on a
# CPU-only one. weights_only=True limits unpickling to tensors and plain
# containers, which is all a state_dict needs, and avoids running arbitrary
# pickled code from the file.
model.load_state_dict(
    torch.load("bert.pth", map_location=device, weights_only=True)
)
model.to(device)
model.eval()

# Add predicted labels to df_test
def predict_label(texts, model, tokenizer, device, max_length=128, batch_size=32):
    """Predict one integer class id per text.

    Args:
        texts: Iterable of texts; every element is converted with ``str()``.
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` with one row per input.
        tokenizer: Callable invoked on a list of strings with ``max_length``,
            ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors='pt'``; must return ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the encoded tensors are moved to.
        max_length: Length forwarded to the tokenizer (default 128, the
            previously hard-coded value).
        batch_size: Number of texts per forward pass (default 32).

    Returns:
        List of predicted class ids (Python ints), in the order of ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A raw CSV column can hold non-string cells (e.g. NaN as a float); convert
    # them so every element handed to the tokenizer is a string.
    texts = [str(text) for text in texts]

    model.eval()
    predictions = []

    with torch.no_grad():
        # Encode and score several texts per forward pass instead of one by one.
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            encoding = tokenizer(
                chunk,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())

    return predictions

  model.load_state_dict(torch.load("bert.pth"))


In [8]:
# Predict and add to df_test
# The model returns integer class ids; inverse_transform maps them back to
# the original label strings before they are stored in the 'Label' column.
df_test['Label'] = label_encoder.inverse_transform(
    predict_label(df_test['Text'].tolist(), model, tokenizer, device)
)
print("Predicted labels added to df_test.")
df_test.head()

Predicted labels added to df_test.


Unnamed: 0,ID,Usage,Text,Label
0,55,Private,Ponovo dobija riječni oblik do Drežnice.,hrv
1,71,Private,Se formaron aproximadamente hace apenas unos 1...,spa
2,67,Private,Data juga harus terbebas dari kepentingan-kepe...,mad
3,107,Private,ᐃᒃᓯᕙᐅᑕᖅ (ᑐᓵᔨᑎᒍᑦ): ᖁᔭᓐᓇᒦᒃ ᒥᔅ ᐅᐃᓐᒥᐅᓪ. ᒥᔅᑕ ᐃᓄᒃ.,iku
4,129,Private,Bei Gefor rullt de Kéiseker sech an riicht se...,ltz


In [9]:
# Write the frame, with its index and all of its columns, to the submission file.
# NOTE(review): the index is written as an extra unnamed column; confirm the
# competition's expected submission columns.
df_test.to_csv(path_or_buf="Submission_louis_v2.csv", index=True)