# **Text mining: SENTIMENT ANALYSIS**

## 🎓 Master’s Program in Data Science & Advanced Analytics  
**Nova IMS** | March 2025  
**Course:** Business Cases with Data Science

## 👥 Team **Group 34**  
- **[Philippe Dutranoit]** | [20240518]  
- **[Diogo Duarte]** | [20240525]  
- **[Rui luz]** | [20211628]  
- **[Rodrigo Sardinha]** | [20211627]  

## 📊 Goal of the notebook

This notebook focuses on feature selection and engineering for our text-mining project: predicting market sentiment (Bearish, Bullish, Neutral) from Twitter data.  


# Imports

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import AutoTokenizer, RobertaForSequenceClassification

import torch
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
X_train = pd.read_csv('../Data/X_train.csv')
y_train = pd.read_csv('../Data/y_train.csv')
X_test = pd.read_csv('../Data/X_val.csv')
y_test = pd.read_csv('../Data/y_val.csv')

# Prepocessing 

In [3]:
# Define the model 
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

## Preprocessing with a tokenizer

### Define tokenizer

In [4]:
# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



### Encode the dataset

In [5]:
# Tokenize training set
train_encodings = tokenizer(
    X_train["text"].tolist(),   # convert to list of strings
    padding=True,               # pad to max length in batch
    truncation=True,            # truncate long tweets
    return_tensors="pt"         # return PyTorch tensors
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
# Tokenize validation set
val_encodings = tokenizer(
    X_test["text"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt"
)

In [7]:
# Check shape of tokenized data
print("Train input_ids shape:", train_encodings["input_ids"].shape)
print("Validation input_ids shape:", val_encodings["input_ids"].shape)

Train input_ids shape: torch.Size([7634, 105])
Validation input_ids shape: torch.Size([1909, 83])


### Dataset and DataLoader

In [8]:
# Custom Dataset class for our BERT model
class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
        return item


#### Instantiate dataset

In [9]:
# Convert labels to list
train_labels = y_train["label"].tolist()
val_labels = y_test["label"].tolist()

# Create Dataset objects
train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)

#### Create DataLoaders

In [10]:
# Set batch size (typical: 16 or 32)
batch_size = 16

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [11]:
# Check one batch from train_loader
batch = next(iter(train_loader))
print(batch.keys())
print(batch['input_ids'].shape)
print(batch['attention_mask'].shape)
print(batch['labels'].shape)

dict_keys(['input_ids', 'attention_mask', 'labels'])
torch.Size([16, 105])
torch.Size([16, 105])
torch.Size([16])


# Model prep 

## Load the model

In [12]:
# Load model with 3 output labels
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
print(model)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

## Setup model training

# Model Training 