# **Text mining: SENTIMENT ANALYSIS**

## 🎓 Master’s Program in Data Science & Advanced Analytics  
**Nova IMS** | March 2025  
**Course:** Business Cases with Data Science

## 👥 Team **Group 34**  
- **[Philippe Dutranoit]** | [20240518]  
- **[Diogo Duarte]** | [20240525]  
- **[Rui Luz]** | [20211628]  
- **[Rodrigo Sardinha]** | [20211627]  

## 📊 Goal of the notebook

This notebook benchmarks pre-trained Transformer sentiment models and prepares the data and model for fine-tuning our own classifier for our text-mining project: predicting market sentiment (Bearish, Bullish, Neutral) from Twitter data.  


# Imports

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import AutoTokenizer, RobertaForSequenceClassification, pipeline

import torch
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the pre-split features and labels. The variables named *_test hold the
# validation split (X_val.csv / y_val.csv).
X_train, y_train, X_test, y_test = map(
    pd.read_csv,
    (
        '../Data/X_train.csv',
        '../Data/y_train.csv',
        '../Data/X_val.csv',
        '../Data/y_val.csv',
    ),
)

# Preprocessing

In [None]:
# Checkpoint identifier shared by the tokenizer, the baseline pipeline and the
# model that is loaded for fine-tuning further down
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

## Preprocessing with a tokenizer

### Define tokenizer

In [None]:
# Load the tokenizer that belongs to MODEL_NAME; it is used later to encode
# the training and validation tweets
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Use a pre-trained Pipeline

Here we use the Hugging Face `pipeline("sentiment-analysis")` to evaluate the performance of a pre-trained Transformer model on our validation set. The model is used in inference mode only, without being trained on our data. This provides a baseline to compare later with our own fine-tuned model.

In [None]:
# Build an inference-only sentiment pipeline around the pre-trained checkpoint
classifier = pipeline(
    "sentiment-analysis",
    model=MODEL_NAME,
    tokenizer=MODEL_NAME,
    batch_size=16,        # texts are sent through the model 16 at a time
    device_map="auto",    # let the library decide on which device to place the model
    truncation=True       # cut tweets that exceed the model's maximum input length
)

#### Run pipeline on validation set

In [None]:
# Collect the validation tweets as a plain Python list
val_texts = X_test["text"].tolist()

# Run the pipeline on the full validation set; `preds` holds one result per
# tweet, and each result's 'label' entry is mapped to an integer class below
preds = classifier(val_texts)

#### Map predicted labels to int

In [None]:
# Translate the pipeline's string labels into the dataset's integer classes
label_map = dict([
    ("negative", 0),  # Bearish
    ("positive", 1),  # Bullish
    ("neutral", 2),   # Neutral
])

# One integer class per prediction, in the same order as `preds`
y_pred_pipeline = list(map(lambda out: label_map[out['label']], preds))

#### Evaluate

In [None]:
# Ground-truth integer labels of the validation set
y_true = y_test["label"].tolist()

# Print the per-class evaluation metrics of the baseline, with 4 decimal places
print(classification_report(y_true, y_pred_pipeline, digits=4))

In [None]:
# Confusion matrix of the baseline pipeline on the validation set.
# `labels` pins the matrix to the three dataset classes so that its rows and
# columns always line up with the hard-coded tick labels, even when a class
# never occurs in either the true or the predicted labels.
cm = confusion_matrix(y_true, y_pred_pipeline, labels=[0, 1, 2])

plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=[0,1,2], yticklabels=[0,1,2])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

## Model Training 

# Test

In [None]:
# Define reusable pipeline evaluation function

# Pipeline label -> dataset class id (0 = Bearish, 1 = Bullish, 2 = Neutral).
# Keys are lower-case; lookups lower-case the predicted label first so that
# differently capitalised variants of the same names resolve as well.
# NOTE(review): the identity mapping for LABEL_i assumes that the checkpoint's
# class indices already follow the dataset encoding -- confirm per model.
_PIPELINE_LABEL_TO_ID = {
    "negative": 0, "positive": 1, "neutral": 2,
    "label_0": 0, "label_1": 1, "label_2": 2,
    "neg": 0, "pos": 1, "neu": 2,
}


def _labels_to_ids(preds):
    """Convert pipeline outputs (dicts with a 'label' key) to integer class ids."""
    ids = []
    for pred in preds:
        label = pred['label']
        class_id = _PIPELINE_LABEL_TO_ID.get(label.lower())
        if class_id is None:
            raise ValueError(f"Unknown label format: {label}")
        ids.append(class_id)
    return ids


def evaluate_pipeline_model(model_name, val_texts, y_true):
    """Evaluate a pre-trained sentiment pipeline on labelled texts.

    Builds a "sentiment-analysis" pipeline for ``model_name``, runs it on
    ``val_texts``, prints a classification report against ``y_true`` and shows
    the confusion matrix as a heatmap.

    Args:
        model_name: Checkpoint identifier passed to ``pipeline`` as both model
            and tokenizer.
        val_texts: Sequence of texts to classify.
        y_true: Integer class ids (0, 1 or 2), one per entry of ``val_texts``.

    Raises:
        ValueError: If ``val_texts`` is empty, if ``val_texts`` and ``y_true``
            differ in length, or if the pipeline returns a label that is not
            in the known label schemes.
    """
    print(f"\n--- Evaluating model: {model_name} ---")

    # Fail fast on unusable inputs instead of after loading and running a model
    if len(val_texts) == 0:
        raise ValueError("val_texts is empty; nothing to evaluate")
    if len(val_texts) != len(y_true):
        raise ValueError(
            f"val_texts and y_true differ in length: "
            f"{len(val_texts)} != {len(y_true)}"
        )

    pipeline_kwargs = {
        "model": model_name,
        "tokenizer": model_name,
        "batch_size": 16,
        "truncation": True,
    }
    # Some models do not support device_map='auto', so it is left out for them
    if "distilbert" not in model_name:
        pipeline_kwargs["device_map"] = "auto"
    classifier = pipeline("sentiment-analysis", **pipeline_kwargs)

    # Run pipeline and convert every predicted label to its integer class id
    preds = classifier(val_texts)
    y_pred_pipeline = _labels_to_ids(preds)

    # Print classification report
    print(classification_report(y_true, y_pred_pipeline, digits=4))

    # Plot confusion matrix; `labels` keeps it 3x3 so it always matches the
    # tick labels, even when a class is missing from both inputs
    cm = confusion_matrix(y_true, y_pred_pipeline, labels=[0, 1, 2])
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=[0,1,2], yticklabels=[0,1,2])
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(f"Confusion Matrix - {model_name}")
    plt.show()

In [None]:
# --- Run loop over your selected models ---

# Prepare texts and true labels once, so every model is scored on the same data
val_texts = X_test["text"].tolist()
y_true = y_test["label"].tolist()

# Checkpoint identifiers of the pre-trained models to compare
model_list = [
    'cardiffnlp/twitter-roberta-base-sentiment-latest',
    'cardiffnlp/twitter-xlm-roberta-base-sentiment',
    'finiteautomata/bertweet-base-sentiment-analysis'
]

# Run evaluation for each model (prints a report and shows a confusion matrix)
for model_name in model_list:
    evaluate_pipeline_model(model_name, val_texts, y_true)

# Train our own BERT Model

### Encode the dataset

In [None]:
# Tokenize the whole training set in a single call
train_encodings = tokenizer(
    X_train["text"].tolist(),   # convert to list of strings
    padding=True,               # pad every tweet to the longest one in this call
    truncation=True,            # truncate tweets that exceed the model's limit
    return_tensors="pt"         # return PyTorch tensors
)

In [None]:
# Tokenize the whole validation set in a single call, with the same settings
# as the training set
val_encodings = tokenizer(
    X_test["text"].tolist(),
    padding=True,               # pad every tweet to the longest one in this call
    truncation=True,            # truncate tweets that exceed the model's limit
    return_tensors="pt"         # return PyTorch tensors
)

In [None]:
# Check shape of tokenized data. Each split was padded in its own tokenizer
# call, so the two padded lengths may differ.
print("Train input_ids shape:", train_encodings["input_ids"].shape)
print("Validation input_ids shape:", val_encodings["input_ids"].shape)

### Dataset and DataLoader

In [None]:
# Dataset wrapper that feeds tokenized tweets and labels to the model
class TweetDataset(Dataset):
    """Map-style dataset pairing tokenized tweets with their integer labels.

    ``encodings`` must be indexable by 'input_ids' and 'attention_mask', and
    each of those must be indexable by sample position. ``labels`` holds one
    integer class per sample and defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is given by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice the model inputs for this sample, then attach its label
        sample = {
            key: self.encodings[key][idx]
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


#### Instantiate dataset

In [None]:
# Convert the label columns to plain Python lists
train_labels = y_train["label"].tolist()
val_labels = y_test["label"].tolist()

# Pair each split's encodings with its labels
train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)

#### Create DataLoaders

In [None]:
# Set batch size (typical: 16 or 32)
batch_size = 16

# Create DataLoaders: the training data is reshuffled, the validation data
# keeps its original order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Sanity check: pull one batch from train_loader and print its keys and the
# shape of each tensor
batch = next(iter(train_loader))
print(batch.keys())
print(batch['input_ids'].shape)
print(batch['attention_mask'].shape)
print(batch['labels'].shape)

## Model prep 

## Load the model

In [None]:
# Load model with 3 output labels
# NOTE(review): the checkpoint's own class-index order may differ from the
# dataset encoding used in label_map (0 = Bearish, 1 = Bullish, 2 = Neutral)
# -- confirm before interpreting outputs of the fine-tuned model.
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

In [None]:
# Print the model's module structure for inspection
print(model)

## Setup model training

## Model Training 