In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import numpy as np
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight
import torch

# Read the labelled training data and report how many distinct labels it
# contains (counted before incomplete rows are removed).
data = pd.read_csv('data/train_submission.csv')
print(len(data['Label'].unique()))

# Discard rows with missing values, report the resulting shape, then remove
# the 'Usage' column.
data = data.dropna()
print(data.shape)
data = data.drop('Usage', axis=1)

# Append the rows stored in the augmentation file and renumber the index.
data_augmented = pd.read_csv('data/aug_data.csv')
data = pd.concat((data, data_augmented), ignore_index=True)

# Whitespace-separated token count of every text (values are stringified first).
data['numWord'] = data['Text'].map(lambda text: len(str(text).split()))

In [None]:
import re

def remove_urls(df=None):
    """Strip URLs from the 'Text' column of a DataFrame, in place.

    Every ``http://`` or ``https://`` URL (the scheme followed by a run of
    non-whitespace characters) is deleted from each string in the column.
    Values that are not strings (e.g. NaN) are left untouched instead of
    raising. The total number of URLs removed is printed.

    Args:
        df: DataFrame with a 'Text' column. Defaults to the notebook-level
            ``data`` frame, so a bare ``remove_urls()`` call keeps working.

    Returns:
        None. ``df['Text']`` is replaced by the cleaned column.
    """
    frame = data if df is None else df
    url_pattern = re.compile(r'https?://\S+')
    counter = 0

    def strip_urls(text):
        # Clean a single cell and add its number of removed URLs to the total.
        nonlocal counter
        if not isinstance(text, str):
            return text
        cleaned_text, n_removed = url_pattern.subn('', text)
        counter += n_removed
        return cleaned_text

    # One pass over the column instead of a row-by-row iterrows()/.at loop.
    frame['Text'] = frame['Text'].map(strip_urls)
    print(f"Removed {counter} URLs from the 'Text' column.")

# Switch for the optional URL-stripping step: remove_urls() runs only when
# this flag is True.
REM_URL = False
if REM_URL:
    remove_urls()

In [None]:
import random
# Seed Python's RNG so the word shuffles below are reproducible.
random.seed(1)

# Every label is topped up to at least this many rows.
MIN_SAMPLES_PER_LABEL = 10

label_counts = data.Label.value_counts()

# Collect the synthetic rows in a plain list and build the DataFrame once,
# rather than re-concatenating a growing DataFrame on every iteration.
augmented_rows = []
for label, count in label_counts[label_counts < MIN_SAMPLES_PER_LABEL].items():
    label_texts = data.loc[data['Label'] == label, 'Text'].tolist()

    # Walk through the label's existing texts in order, wrapping around to the
    # first one when they run out, until the label reaches the minimum.
    for i in range(MIN_SAMPLES_PER_LABEL - count):
        words = label_texts[i % len(label_texts)].split()
        # New example = same words in a random order.
        random.shuffle(words)
        augmented_rows.append({'Text': ' '.join(words), 'Label': label})

data_aug = pd.DataFrame(augmented_rows, columns=['Text', 'Label'])
print(data_aug)

if data_aug.empty:
    # Nothing to add: only renumber the index, as the concat below would have.
    data = data.reset_index(drop=True)
else:
    data = pd.concat([data, data_aug], ignore_index=True)

In [None]:
# Switch for the down-sampling step below.
REMOVE_OCCURENCES = True

label_counts = data['Label'].value_counts()

if REMOVE_OCCURENCES:
    # Visit only the labels with more than 500 rows, in value_counts() order.
    for label, count in label_counts[label_counts > 500].items():
        # How many rows must go for this label to end up with exactly 500.
        surplus = count - 500

        # Index labels of every row currently carrying this label.
        candidate_index = data.index[data['Label'] == label].tolist()

        # Pick the surplus rows at random and drop them from the frame.
        data = data.drop(random.sample(candidate_index, surplus))

        print(f"Deleted {surplus} rows for label '{label}'")

In [None]:
# Base name for this run's artifacts: it is used as the Trainer output
# directory and as the prefix of the saved label-mapping JSON file.
# Change it for each model you train.
MODEL_PATH_NAME = "test_intfloat"

In [None]:
import json
import os
import shutil  # not needed by this cell any more; kept in case other cells rely on it

# Give every distinct label an integer id (in order of first appearance in
# `data`) and build the inverse lookup.
labels = data['Label'].unique()
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}
data['LabelID'] = data['Label'].map(label2id)

mappings = {"label2id": label2id, "id2label": id2label}

# Write the mapping straight to its final location. Creating the directory
# first keeps the cell from failing when "data/mapping/" does not exist yet,
# and opening with "w" overwrites the file from a previous run.
path_mapping = MODEL_PATH_NAME + "_mappings.json"
mapping_dir = "data/mapping/"
os.makedirs(mapping_dir, exist_ok=True)
with open(mapping_dir + path_mapping, "w") as f:
    # Note: json stores the integer keys of id2label as strings.
    json.dump(mappings, f, indent=4)

In [None]:
X, y = data["Text"], data["LabelID"]

# Hold out 20% of the rows, stratified by label id ...
train_texts, test_texts, train_labels, test_labels = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# ... then split that held-out part in half into validation and test sets.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    test_texts, test_labels, test_size=0.5, random_state=42, stratify=test_labels
)


def _to_dataset(texts, label_ids):
    """Wrap matching text and label-id columns in a `Dataset`."""
    return Dataset.from_pandas(pd.DataFrame({'Text': texts, 'LabelID': label_ids}))


train_dataset = _to_dataset(train_texts, train_labels)
val_dataset = _to_dataset(val_texts, val_labels)
test_dataset = _to_dataset(test_texts, test_labels)

# Bundle the three splits under the names used by the training cells.
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset,
})

In [None]:
# Pretrained checkpoint to fine-tune.
model_name = "intfloat/multilingual-e5-large-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pass the label mappings along with the class count so the model config (and
# every checkpoint saved from it) carries the real label names rather than
# generic placeholders.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of rows and attach their label ids.

    Args:
        examples: mapping with a 'Text' entry (the texts to tokenize) and a
            'LabelID' entry (their label ids).

    Returns:
        The tokenizer's output with an extra "labels" entry holding
        ``examples["LabelID"]``.
    """
    encoded = tokenizer(
        examples['Text'],
        max_length=100,
        truncation=True,
        padding=True,
    )
    encoded["labels"] = examples["LabelID"]
    return encoded

# Run preprocess_function over the dataset; batched=True makes each call
# receive a batch of rows instead of a single row.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [None]:
# Define metrics for evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted precision / recall / F1 for a prediction.

    Args:
        pred: object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference label ids).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    # Predicted class = position of the highest score in each row.
    y_pred = np.argmax(pred.predictions, axis=1)

    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    return dict(accuracy=accuracy, f1=f1, precision=precision, recall=recall)

# Training arguments
training_args = TrainingArguments(
    # Checkpoint location and retention limit.
    output_dir="./" + MODEL_PATH_NAME,
    save_total_limit=1,
    # Evaluate and checkpoint once per epoch; the best model is chosen by
    # "accuracy" (higher is better) and reloaded when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # Optimisation settings: 32 examples per device per step, with gradients
    # accumulated over 2 steps.
    num_train_epochs=5,
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=32,
    # weight_decay=0.01,
    # Mixed-precision (fp16) training.
    fp16=True,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    # Early stopping with a patience of 2.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Fine-tune on the training split (evaluated against the validation split).
trainer.train()

# Score the held-out test split and show the resulting metrics.
test_results = trainer.evaluate(eval_dataset=tokenized_datasets['test'])
print("Test Results:", test_results)