Fine-Tuning TimeLMLARGE for the Task of Emoji Prediction


In [None]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForMaskedLM
from torch.utils.data import DataLoader, TensorDataset
import torch
from transformers import RobertaForSequenceClassification, AdamW, get_scheduler
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification

# Directory prefix for the CSV splits; it is prepended verbatim to each file name.
filepath = ""
train_path = f"{filepath}emoji-train.csv"
validation_path = f"{filepath}emoji-validation.csv"
test_path = f"{filepath}emoji-test.csv"

# Read the three splits (train, validation, test) into DataFrames, in that order.
train_data, validation_data, test_data = [
    pd.read_csv(split_path)
    for split_path in (train_path, validation_path, test_path)
]

# Display the first few rows of each dataset
# print("Training Dataset:")
# print(train_data.head())
# print("\nValidation Dataset:")
# print(validation_data.head())
# print("\nTest Dataset:")
# print(test_data.head())


To remove noise before training, some preprocessing steps should be taken:



1.   Removing the URLs
2.   Removing the mentions
3. Removing the special characters

This is done with Python's regular expression library (`re`).






In [None]:
# Patterns used by preprocess_text, compiled once instead of on every call.
# "http\S+" already covers "https...", so no separate https alternative is needed.
_URL_PATTERN = re.compile(r"http\S+|www\S+")
_MENTION_PATTERN = re.compile(r"@\w+")
_SPECIAL_CHAR_PATTERN = re.compile(r"[^a-zA-Z0-9\s]")


# Preprocessing function to clean the text
def preprocess_text(text):
    """Strip URLs, @mentions and special characters from a piece of text.

    The three removals run in this order: URLs, then mentions, then every
    character that is not an ASCII letter, a digit or whitespace. Whitespace
    left behind by a removal is kept as-is (it is not collapsed).

    Args:
        text: The raw text. Missing values (``None`` / ``NaN``, which pandas
            produces for empty CSV cells) are treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string (``""`` for missing input).
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings, which would abort the whole
        # DataFrame.apply over a column containing a single empty cell.
        if text is None or pd.isna(text):
            return ""
        text = str(text)

    # Remove URLs
    text = _URL_PATTERN.sub("", text)
    # Remove mentions
    text = _MENTION_PATTERN.sub("", text)
    # Remove special characters
    text = _SPECIAL_CHAR_PATTERN.sub("", text)
    return text

# Add a 'cleaned_text' column to every split by cleaning its 'text' column.
for split_frame in (train_data, validation_data, test_data):
    split_frame['cleaned_text'] = split_frame['text'].apply(preprocess_text)

# Display preprocessed examples
# print(train_data[['text', 'cleaned_text']].head())

Now that the dataset is cleaned, the AutoTokenizer is used to convert the text into a form that TimeLMLARGE understands.

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint.
TOKENIZER_CHECKPOINT = "cardiffnlp/twitter-roberta-large-2022-154m"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Tokenization function
def tokenize_data(df, text_col, label_col, max_length=128):
    """Tokenize one text column of ``df`` and collect the matching labels.

    Args:
        df: DataFrame holding the texts and the labels.
        text_col: Name of the column with the texts to tokenize.
        label_col: Name of the column with the labels.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A ``(tokens, labels)`` pair: the output of the module-level
        ``tokenizer`` (requested as PyTorch tensors) and the labels as a
        plain Python list.
    """
    texts = df[text_col].tolist()
    encoded = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    return encoded, df[label_col].tolist()

# Tokenize the cleaned text of every split and pull out its gold labels.
train_tokens, train_labels = tokenize_data(
    train_data, text_col="cleaned_text", label_col="gold_label"
)
validation_tokens, validation_labels = tokenize_data(
    validation_data, text_col="cleaned_text", label_col="gold_label"
)
test_tokens, test_labels = tokenize_data(
    test_data, text_col="cleaned_text", label_col="gold_label"
)

Because the task is complicated, the batch size for the training set is relatively small. This helps avoid overly aggressive learning and lets the model absorb the linguistic nuances of the dataset.

In [None]:
def _as_tensor_dataset(tokens, labels):
    """Bundle input ids, attention mask and labels into one TensorDataset."""
    label_tensor = torch.tensor(labels)
    return TensorDataset(tokens['input_ids'], tokens['attention_mask'], label_tensor)


# One TensorDataset per split.
train_dataset = _as_tensor_dataset(train_tokens, train_labels)
validation_dataset = _as_tensor_dataset(validation_tokens, validation_labels)
test_dataset = _as_tensor_dataset(test_tokens, test_labels)

# Only the training split is shuffled; validation and test keep file order
# and use a larger batch size.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
validation_loader = DataLoader(validation_dataset, shuffle=False, batch_size=64)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

In this code block, the model starts training with the following hyperparameters:
1. 15 epochs, a relatively large number, because the task has 100 emoji classes
2. A learning rate of 5e-6


In [None]:
# Load the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_labels = 100
model = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-large-2022-154m", num_labels=num_labels
)
model.to(device)

# Compute class weights for imbalanced data.
# value_counts() is ordered by frequency, so the counts are re-indexed by label
# id to make position i of the weight vector belong to class i. Classes that
# never occur in the training split get count 0 and therefore weight 0
# (instead of a division by zero).
class_counts = train_data['gold_label'].value_counts().reindex(range(num_labels), fill_value=0)
_counts = torch.tensor(class_counts.to_numpy(), dtype=torch.float32)
_seen = _counts > 0
class_weights = torch.zeros_like(_counts)
class_weights[_seen] = 1.0 / _counts[_seen]  # inverse-frequency weighting
class_weights = class_weights.to(device)

# Define optimizer and scheduler.
# The epoch count is defined once so the scheduler horizon and the loop agree.
epochs = 15
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set explicitly to keep that optimizer's former defaults.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6, eps=1e-6, weight_decay=0.0)
num_training_steps = len(train_loader) * epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()

        # Forward pass. The loss is computed here rather than inside the model
        # so that the class weights are actually applied; with the default
        # 'mean' reduction the weighted loss is normalised by the weights of
        # the targets in the batch.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = F.cross_entropy(outputs.logits, labels, weight=class_weights)
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        scheduler.step()  # the linear schedule advances once per batch

    print(f"Epoch {epoch+1}/{epochs} - Training Loss: {total_loss / len(train_loader)}")


In [None]:
def _infer_device(model):
    """Return the device of the model's first parameter (CPU if it has none)."""
    try:
        return next(model.parameters()).device
    except StopIteration:
        return torch.device("cpu")


# Function to output predictions to save to file, and evaluate Top-1 and Top-5 accuracy
def evaluate(model, loader):
    """Run ``model`` over ``loader`` and report Top-1 / Top-5 accuracy.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes a ``logits`` attribute.
        loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        ``(top1_accuracy, top5_accuracy, all_top1_preds, all_top5_preds,
        all_labels)``. The accuracies are floats (``0.0`` when the loader
        yields no examples); the three lists hold one entry per example, in
        loader order, as NumPy values. When the model has fewer than five
        classes, the "top-5" predictions contain all classes.
    """
    model.eval()
    # Batches are moved to wherever the model lives, so the function does not
    # depend on a module-level `device` variable.
    device = _infer_device(model)
    total, correct_top1, correct_top5 = 0, 0, 0
    all_top1_preds, all_top5_preds, all_labels = [], [], []

    with torch.no_grad():
        for batch in loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probs = F.softmax(logits, dim=1)

            # Top-1 accuracy
            top1_preds = torch.argmax(probs, dim=1)
            correct_top1 += (top1_preds == labels).sum().item()

            # Top-5 accuracy (topk raises if k exceeds the number of classes)
            k = min(5, probs.size(1))
            top5_preds = torch.topk(probs, k=k, dim=1).indices
            correct_top5 += torch.sum(torch.any(top5_preds == labels.unsqueeze(1), dim=1)).item()

            # Collect all predictions together
            all_top1_preds.extend(top1_preds.cpu().numpy())
            all_top5_preds.extend(top5_preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            total += labels.size(0)

    # Guard against an empty loader, which would otherwise divide by zero.
    top1_accuracy = correct_top1 / total if total else 0.0
    top5_accuracy = correct_top5 / total if total else 0.0
    return top1_accuracy, top5_accuracy, all_top1_preds, all_top5_preds, all_labels

# Score the fine-tuned model on the held-out test split.
(
    test_top1,
    test_top5,
    all_t1,
    all_t5,
    all_lbls,
) = evaluate(model, test_loader)
print(f"Test Top-1 Accuracy: {test_top1:.4f}, Test Top-5 Accuracy: {test_top5:.4f}")

# To preview the first predictions, iterate over
# list(zip(all_t1, all_t5, all_lbls))[:10] and print each (top1, top5, label) triple.

# Write one row per test example (top-1 prediction, top-5 predictions, gold label) to CSV.
prediction_rows = list(zip(all_t1, all_t5, all_lbls))
df_results = pd.DataFrame(data=prediction_rows, columns=['top1', 'top5', 'label'])
df_results.to_csv('results_timelm.csv')