In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


In [None]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import numpy as np
import pandas as pd
import mlflow

# Load the labelled examples; the 'Label' column holds the class of each row.
data = pd.read_csv('data/train_submission.csv')

# Map labels to numerical IDs
labels = data['Label'].unique()
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}
data['LabelID'] = data['Label'].map(label2id)

# Classes with only 1-2 occurrences cannot be spread over train/val/test,
# so they are set aside here and appended to the training split below.
class_counts = data['LabelID'].value_counts()
rare_classes = class_counts[class_counts <= 2].index.tolist()
rare_data = data[data['LabelID'].isin(rare_classes)]
data = data[~data['LabelID'].isin(rare_classes)]

# Check if the remaining data allows for a stratified split
if data['LabelID'].nunique() > 1:
    # 80% train / 20% held out, keeping class proportions.
    train_data, temp_data = train_test_split(data, test_size=0.2, stratify=data['LabelID'], random_state=42)

    # A class with only a handful of rows can end up with a single row in the
    # held-out 20%, and train_test_split raises ValueError when asked to
    # stratify on a class with fewer than 2 members. Stratify the val/test
    # split only when every held-out class has at least two rows; otherwise
    # fall back to a plain random split instead of crashing.
    temp_class_counts = temp_data['LabelID'].value_counts()
    if temp_class_counts.min() >= 2:
        temp_stratify = temp_data['LabelID']
    else:
        temp_stratify = None
        print(
            "Warning: some classes have a single held-out example; "
            "splitting validation/test without stratification."
        )

    # Halve the held-out rows into validation and test.
    val_data, test_data = train_test_split(temp_data, test_size=0.5, stratify=temp_stratify, random_state=42)
else:
    # With at most one class left there is nothing to stratify on:
    # everything goes to train and validation/test stay empty.
    train_data = data
    val_data = pd.DataFrame(columns=data.columns)
    test_data = pd.DataFrame(columns=data.columns)

# Add rare classes back to train
train_data = pd.concat([train_data, rare_data])


# Convert to Hugging Face Dataset format
def convert_to_dataset(df):
    """Build a Hugging Face ``Dataset`` from the 'Text' and 'LabelID' columns of ``df``.

    Args:
        df: pandas DataFrame containing at least the 'Text' and 'LabelID' columns.

    Returns:
        The result of ``Dataset.from_pandas`` applied to those two columns.
    """
    model_columns = ['Text', 'LabelID']
    selected = df[model_columns]
    return Dataset.from_pandas(selected)

# Bundle the three splits so they can be tokenized with a single map() call.
dataset = DatasetDict({
    'train': convert_to_dataset(train_data),
    'validation': convert_to_dataset(val_data),
    'test': convert_to_dataset(test_data)
})

# Load a multilingual model
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Hand the label mappings to the model so its config (and every checkpoint
# saved or logged from it) reports the real class names instead of the
# generic LABEL_<n> placeholders. Labels are cast to str so the config stays
# JSON-serialisable even if the 'Label' column is not made of plain strings.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label={idx: str(label) for idx, label in id2label.items()},
    label2id={str(label): idx for label, idx in label2id.items()},
)

# Tokenize the dataset
def preprocess_function(examples):
    """Tokenize a batch of examples and attach their class IDs as ``labels``.

    Args:
        examples: mapping with a 'Text' entry (the texts to tokenize) and a
            'LabelID' entry (the matching class IDs).

    Returns:
        The tokenizer output for ``examples['Text']`` (truncated to at most
        100 tokens and padded), extended with a "labels" entry copied from
        ``examples['LabelID']``.
    """
    encoded = tokenizer(
        examples['Text'],
        max_length=100,
        padding=True,
        truncation=True,
    )
    # Ensure labels are included for loss calculation
    encoded["labels"] = examples["LabelID"]
    return encoded


In [None]:
# Run the tokenizer over every split, handing examples to it in batches.
tokenized_datasets = dataset.map(
    function=preprocess_function,
    batched=True,
)

# Define metrics for evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference class IDs).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(pred.predictions, axis=1)
    true_ids = pred.label_ids

    accuracy = accuracy_score(true_ids, predicted_ids)
    # 'weighted' averages the per-class scores weighted by class support.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_ids, predicted_ids, average='weighted'
    )

    metrics = {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
    return metrics

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints are written, and how many are retained.
    output_dir="./best_model",
    save_total_limit=1,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0,
    # Batching: gradients are accumulated over 4 batches of 256 per device
    # before each optimizer step.
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    gradient_accumulation_steps=4,
    # Mixed-precision training. NOTE(review): fp16 needs a CUDA device.
    fp16=True,
)

# Initialize MLflow: everything below is recorded in a single run of the
# "transformers_classification" experiment.
mlflow.set_experiment("transformers_classification")
with mlflow.start_run():
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Evaluate on the test set. The explicit "test" prefix names the metrics
    # test_loss, test_accuracy, ... instead of the default eval_*; without it
    # they would share names with the per-epoch validation metrics and be
    # merged into the same MLflow metric series.
    test_results = trainer.evaluate(
        eval_dataset=tokenized_datasets['test'],
        metric_key_prefix="test",
    )
    print("Test Results:", test_results)

    # Log metrics to MLflow
    mlflow.log_metrics(test_results)
    # Store the fine-tuned model as an artifact of this run.
    mlflow.pytorch.log_model(model, "transformers_model")
