In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


In [2]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import numpy as np
import pandas as pd

# Load your dataset (replace with actual loading code)
data = pd.read_csv('data/train_submission.csv')

# Map labels to numerical IDs
labels = data['Label'].unique()
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}
data['LabelID'] = data['Label'].map(label2id)

# Split the dataset into train/test/validation (80/10/10 split)
train_data = data.sample(frac=0.8, random_state=42)
remaining_data = data.drop(train_data.index)
val_data = remaining_data.sample(frac=0.5, random_state=42)
test_data = remaining_data.drop(val_data.index)

# Convert to Hugging Face Dataset format
def convert_to_dataset(df):
    return Dataset.from_pandas(df[['Text', 'LabelID']])

dataset = DatasetDict({
    'train': convert_to_dataset(train_data),
    'validation': convert_to_dataset(val_data),
    'test': convert_to_dataset(test_data)
})

# Load a multilingual model (e.g., XLM-Roberta)
# model_name = "xlm-roberta-base"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels))


# Load a multilingual model (e.g., DistilBERT Multilingual)
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels))

# Tokenize the dataset
def preprocess_function(examples):
    inputs = tokenizer(examples['Text'], truncation=True, padding=True, max_length=100)
    inputs["labels"] = examples["LabelID"]  # Ensure labels are included for loss calculation
    return inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)



  from .autonotebook import tqdm as notebook_tqdm





Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 31083/31083 [00:02<00:00, 13733.89 examples/s]
Map: 100%|██████████| 3886/3886 [00:00<00:00, 11337.66 examples/s]
Map: 100%|██████████| 3885/3885 [00:00<00:00, 14974.56 examples/s]


In [3]:
# Define metrics for evaluation
def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir="./best_model",  # Save only the best model
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Align save strategy with evaluation strategy
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    gradient_accumulation_steps=4,  # Reduced to fit memory constraints
    per_device_eval_batch_size=128,
    num_train_epochs=45,
    weight_decay=0.01,
    save_total_limit=1,  # Save only the best checkpoint
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True
)


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate on the test set
test_results = trainer.evaluate(tokenized_datasets['test'])
print("Test Results:", test_results)


  trainer = Trainer(
Trainer is attempting to log a value of "{0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2', 3: 'LABEL_3', 4: 'LABEL_4', 5: 'LABEL_5', 6: 'LABEL_6', 7: 'LABEL_7', 8: 'LABEL_8', 9: 'LABEL_9', 10: 'LABEL_10', 11: 'LABEL_11', 12: 'LABEL_12', 13: 'LABEL_13', 14: 'LABEL_14', 15: 'LABEL_15', 16: 'LABEL_16', 17: 'LABEL_17', 18: 'LABEL_18', 19: 'LABEL_19', 20: 'LABEL_20', 21: 'LABEL_21', 22: 'LABEL_22', 23: 'LABEL_23', 24: 'LABEL_24', 25: 'LABEL_25', 26: 'LABEL_26', 27: 'LABEL_27', 28: 'LABEL_28', 29: 'LABEL_29', 30: 'LABEL_30', 31: 'LABEL_31', 32: 'LABEL_32', 33: 'LABEL_33', 34: 'LABEL_34', 35: 'LABEL_35', 36: 'LABEL_36', 37: 'LABEL_37', 38: 'LABEL_38', 39: 'LABEL_39', 40: 'LABEL_40', 41: 'LABEL_41', 42: 'LABEL_42', 43: 'LABEL_43', 44: 'LABEL_44', 45: 'LABEL_45', 46: 'LABEL_46', 47: 'LABEL_47', 48: 'LABEL_48', 49: 'LABEL_49', 50: 'LABEL_50', 51: 'LABEL_51', 52: 'LABEL_52', 53: 'LABEL_53', 54: 'LABEL_54', 55: 'LABEL_55', 56: 'LABEL_56', 57: 'LABEL_57', 58: 'LABEL_58', 59: 'LABEL_59

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,5.451004,0.202265,0.159461,0.212524,0.202265
2,No log,4.68608,0.386773,0.334021,0.387802,0.386773


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
