In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.model_selection import train_test_split


In [None]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import numpy as np
import pandas as pd

# Load the labelled training data.
# Rows with a missing 'Text' or 'Label' are dropped up front: a missing label would
# otherwise be picked up by unique() as an extra class, and a missing text would reach
# the tokenizer as None.
# The index is then reset so it is unique and contiguous, which the index-based
# train/validation/test split further down relies on.
data = (
    pd.read_csv('data/train_submission.csv')
    .dropna(subset=['Text', 'Label'])
    .reset_index(drop=True)
)

# Build the label <-> integer-id lookup tables; ids follow the order in which
# unique() returns the labels.
labels = data['Label'].unique()
label2id = dict(zip(labels, range(len(labels))))
id2label = {class_id: name for name, class_id in label2id.items()}
# Attach the integer id of each row's label as a new column.
data['LabelID'] = data['Label'].map(label2id)

# Partition the rows 80/10/10 into train / validation / test.
# Both draws use a fixed random_state so the partition is reproducible.
train_data = data.sample(frac=0.8, random_state=42)
# Whatever was not drawn for training (20%) is halved into validation and test.
remaining_data = data.drop(index=train_data.index)
val_data = remaining_data.sample(frac=0.5, random_state=42)
test_data = remaining_data.drop(index=val_data.index)

# Convert to Hugging Face Dataset format
def convert_to_dataset(df):
    """Convert one pandas split into a Hugging Face ``Dataset``.

    Only the 'Text' and 'LabelID' columns are kept. The DataFrame index is
    deliberately not carried over: the splits are produced by sampling, so
    their index is non-contiguous and ``from_pandas`` would otherwise store
    it as an extra ``__index_level_0__`` column.

    Args:
        df: DataFrame containing at least the 'Text' and 'LabelID' columns.

    Returns:
        A ``Dataset`` holding the 'Text' and 'LabelID' columns of ``df``.
    """
    return Dataset.from_pandas(df[['Text', 'LabelID']], preserve_index=False)

# Bundle the three splits into one DatasetDict keyed by split name.
dataset = DatasetDict({
    split_name: convert_to_dataset(split_frame)
    for split_name, split_frame in (
        ('train', train_data),
        ('validation', val_data),
        ('test', test_data),
    )
})

# Load a multilingual model (e.g., XLM-Roberta)
# model_name = "xlm-roberta-base"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels))


# Load a multilingual model (e.g., DistilBERT Multilingual)
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Register the label mapping in the model config so that saved checkpoints and
# predictions carry the original label names rather than generic LABEL_n names.
# Keys and values are cast to plain int/str so the config stays JSON-serializable
# even if the labels were read as NumPy scalars.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label={int(idx): str(label) for idx, label in id2label.items()},
    label2id={str(label): int(idx) for label, idx in label2id.items()},
)

# Tokenize the dataset
def preprocess_function(examples):
    """Tokenize a batch of rows and attach their class ids as ``labels``.

    Args:
        examples: Batch mapping with a 'Text' entry (the raw inputs) and a
            'LabelID' entry (the matching class ids).

    Returns:
        The tokenizer output for the batch (truncation and padding enabled,
        ``max_length=100``) with an added ``labels`` entry copied from
        'LabelID'.
    """
    encoded = tokenizer(
        examples['Text'],
        max_length=100,
        truncation=True,
        padding=True,
    )
    # The targets travel with the inputs under "labels" so the loss can be computed.
    encoded["labels"] = examples["LabelID"]
    return encoded

# Apply the tokenization to every split, feeding preprocess_function batches of rows.
tokenized_datasets = dataset.map(function=preprocess_function, batched=True)



In [None]:
# Define metrics for evaluation
def compute_metrics(pred):
    """Score one evaluation pass.

    Args:
        pred: Evaluation output exposing ``predictions`` (reduced with an
            argmax over axis 1 to get the predicted class ids) and
            ``label_ids`` (the reference class ids).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
        Precision, recall and F1 use ``average='weighted'``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    accuracy = accuracy_score(y_true, y_pred)
    # The fourth element of the result is unused here.
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    precision, recall, f1 = weighted[0], weighted[1], weighted[2]

    return dict(accuracy=accuracy, f1=f1, precision=precision, recall=recall)

# Training configuration.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before changing it.
training_args = TrainingArguments(
    output_dir="./best_model",  # Directory the checkpoints are written to
    evaluation_strategy="epoch",  # Evaluate on the validation split after every epoch
    save_strategy="epoch",  # Align save strategy with evaluation strategy
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    gradient_accumulation_steps=4,  # Effective train batch: 128 * 4 = 512 examples per device
    per_device_eval_batch_size=128,
    num_train_epochs=45,
    weight_decay=0.01,
    save_total_limit=1,  # Prune older checkpoints; the best one is retained for reloading
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # Key produced by compute_metrics
    # Mixed precision needs a GPU: with fp16=True on a CPU-only machine
    # TrainingArguments raises, so fall back to full precision there.
    fp16=torch.cuda.is_available(),
)


# Wire the model, its configuration, the train/validation splits, the tokenizer
# and the metric function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# Fine-tune the model on the training split.
trainer.train()

# Score the held-out test split and report the resulting metrics.
test_results = trainer.evaluate(eval_dataset=tokenized_datasets['test'])
print("Test Results:", test_results)
