- Feature Engineering extra method: +0.5 points for using an additional transformer embedding besides the mandatory one.
- Classification Models extra method (encoder): +1 point for using another transformer encoder model (RoBERTa), besides the mandatory one.

In [None]:
import pickle
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [13]:
# Restore the pre-computed train/validation split from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so this file must come from a trusted source.
with open('train_val_split.pkl', 'rb') as split_file:
    data = pickle.load(split_file)

# Feature frames -> plain lists taken from their 'text' column
train_texts, val_texts = (data[part]['text'].tolist() for part in ('x_train', 'x_val'))

# Label containers -> plain lists
train_labels, val_labels = (data[part].tolist() for part in ('y_train', 'y_val'))

# Sanity check: text and label counts are printed side by side for each split
print("Train samples:", len(train_texts), len(train_labels))
print("Validation samples:", len(val_texts), len(val_labels))

Train samples: 7634 7634
Validation samples: 1909 1909


In [14]:
# Describe the columns of each split once, then build one Hugging Face
# Dataset per split and bundle them under the "train" / "validation" keys.
split_columns = {
    "train": {"text": train_texts, "label": train_labels},
    "validation": {"text": val_texts, "label": val_labels},
}
train_ds = Dataset.from_dict(split_columns["train"])
val_ds = Dataset.from_dict(split_columns["validation"])
dataset = DatasetDict({"train": train_ds, "validation": val_ds})

In [15]:
# Pretrained Twitter RoBERTa base checkpoint. The sequence-classification head
# is newly initialized on load, so the model must be fine-tuned before use.
model_checkpoint = "cardiffnlp/twitter-roberta-base"
NUM_LABELS = 3  # number of outputs requested for the classification head

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=NUM_LABELS,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Tokenize
def tokenize(example):
    """Tokenize the "text" field of a batch of examples.

    Args:
        example: Mapping with a "text" entry holding the text(s) to encode,
            as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer's output for those texts, truncated to at most 128
        tokens and left unpadded.
    """
    # No padding here: DataCollatorWithPadding pads each batch when it is
    # collated, so padding at map time would only inflate every example to
    # the longest sequence of its map batch.
    return tokenizer(example["text"], truncation=True, max_length=128)

# Run `tokenize` over every split in batches; the result replaces `dataset`
dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/7634 [00:00<?, ? examples/s]

Map:   0%|          | 0/1909 [00:00<?, ? examples/s]

In [None]:
# Define metrics
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the argmax
            over the last axis is taken as the predicted class) and
            ``label_ids`` (the reference labels).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Index 0..2 are precision, recall and F1; the fourth element is unused.
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": weighted_scores[0],
        "recall": weighted_scores[1],
        "f1": weighted_scores[2],
    }

# Training arguments, grouped by concern
training_args = TrainingArguments(
    # --- Output locations ---
    output_dir="./results",            # checkpoints are written here
    logging_dir="./logs",              # log files are written here
    # --- Optimization ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # --- Logging / evaluation / checkpoint cadence ---
    logging_steps=100,                 # log every 100 steps
    eval_strategy="epoch",             # evaluate once per epoch
    save_strategy="epoch",             # checkpoint once per epoch
    # --- Best-model selection ---
    load_best_model_at_end=True,
    metric_for_best_model="f1",        # the "f1" entry returned by compute_metrics
    greater_is_better=True,            # a higher F1 is better
    # --- Integrations ---
    report_to=[],                      # no external experiment loggers
)

# Batch collator responsible for padding tokenized examples
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer setup: wire together the model, its arguments, both data splits,
# the collator and the metric function
train_split, eval_split = dataset["train"], dataset["validation"]
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Fine-tune the model
trainer.train()

# Store the model and the tokenizer side by side in one directory
MODEL_OUTPUT_DIR = "roberta-market-sentiment"
trainer.save_model(MODEL_OUTPUT_DIR)
tokenizer.save_pretrained(MODEL_OUTPUT_DIR)

Epoch,Training Loss,Validation Loss


In [None]:
def report_metrics(header, metrics):
    """Print `header`, then one "name: value" line per metric (4 decimal places)."""
    print(header)
    for key, value in metrics.items():
        print(f"{key}: {value:.4f}")


# Evaluate on training set
train_metrics = trainer.evaluate(eval_dataset=dataset["train"])
report_metrics("\nTraining Set Evaluation Metrics:", train_metrics)

# Evaluate on validation set
val_metrics = trainer.evaluate(eval_dataset=dataset["validation"])
report_metrics("\nValidation Set Evaluation Metrics:", val_metrics)

In [None]:
# Load the test set (its "text" and "id" columns are used by the cells below)
test_data = pd.read_csv("test.csv")

In [None]:
# Wrap the test texts in a Hugging Face Dataset and tokenize them with the
# same `tokenize` function used for the train/validation splits
test_texts = test_data["text"].tolist()
test_ds = Dataset.from_dict({"text": test_texts}).map(tokenize, batched=True)

# Predict labels: the predicted class is the index of the largest score per row
predictions = trainer.predict(test_dataset=test_ds)
pred_labels = predictions.predictions.argmax(axis=-1)

Map:   0%|          | 0/2388 [00:00<?, ? examples/s]

In [None]:
# Build the submission table: test ids alongside their predicted labels
submission_columns = {"id": test_data["id"], "label": pred_labels}
submission = pd.DataFrame(submission_columns)

# Write the submission to disk without the DataFrame index
submission_path = "pred_25.csv"
submission.to_csv(submission_path, index=False)

In [None]:
# Sanity check: reload the submission file written above and preview its first rows
pred_25 = pd.read_csv("pred_25.csv")
pred_25.head()

Unnamed: 0,id,label
0,0,1
1,1,2
2,2,2
3,3,1
4,4,2
