<table align="center">
  <tr><td>
    <a target="_blank" href="https://colab.research.google.com/github/martinlf6/schwab-ds-takehome-FengLiu/blob/main/03_models.ipynb">
      <img src="https://i.ibb.co/2P3SLwK/colab.png" alt="Google Colab logo" style="padding-bottom:5px;" />Run in Google Colab</a>
  </td></tr>
</table>

In [16]:
!pip install datasets==3.6.0 --force-reinstall


Collecting datasets==3.6.0
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from datasets==3.6.0)
  Using cached filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting numpy>=1.17 (from datasets==3.6.0)
  Using cached numpy-2.3.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting pyarrow>=15.0.0 (from datasets==3.6.0)
  Using cached pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets==3.6.0)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets==3.6.0)
  Using cached pandas-2.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting requests>=2.32.2 (from datasets==3.6.0)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tqdm>=4.66.3 (from datasets==3.6.0)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from dataset

In [1]:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
# Load dataset
# "sentences_allagree" is the configuration that (per the original author's note) keeps only
# sentences on which all annotators agreed. `ds` is indexed by split name below.
ds = load_dataset("financial_phrasebank", "sentences_allagree")

# Integer class id -> human-readable sentiment name.
label_map = dict(enumerate(("negative", "neutral", "positive")))

# Working frame built from the "train" split:
#   text  - the sentence (renamed from "sentence")
#   y     - the integer class id (renamed from "label")
#   label - sentiment name looked up from `y` through `label_map`
#   len   - number of whitespace-separated words in `text`
df = (
    ds["train"]
    .to_pandas()
    .rename(columns={"sentence": "text", "label": "y"})
    .assign(
        label=lambda frame: frame["y"].map(label_map),
        len=lambda frame: frame["text"].str.split().map(len),
    )
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Show the prepared frame (columns: text, y, label, len); the rendered output is
# truncated to the leading and trailing rows.
df

Unnamed: 0,text,y,label,len
0,"According to Gran , the company has no plans t...",1,neutral,25
1,"For the last quarter of 2010 , Componenta 's n...",2,positive,39
2,"In the third quarter of 2010 , net sales incre...",2,positive,29
3,Operating profit rose to EUR 13.1 mn from EUR ...,2,positive,24
4,"Operating profit totalled EUR 21.1 mn , up fro...",2,positive,22
...,...,...,...,...
2259,Operating result for the 12-month period decre...,0,negative,27
2260,HELSINKI Thomson Financial - Shares in Cargote...,0,negative,40
2261,LONDON MarketWatch -- Share prices ended lower...,0,negative,26
2262,Operating profit fell to EUR 35.4 mn from EUR ...,0,negative,23


In [4]:
# Train/validation split.
# 20% of the sentences are held out for validation. The split is seeded (random_state=42)
# so it is repeatable, and stratified on the class id so both parts keep the same class
# proportions as the full frame.
all_texts = df["text"].tolist()
all_labels = df["y"].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=df["y"],
)


In [5]:
# Tokenization
# Checkpoint whose tokenizer is loaded here: FinBERT (per the original author's note, a BERT
# model pre-trained on financial text).
# NOTE(review): confirm that this checkpoint's own label ordering matches the dataset's
# 0/1/2 = negative/neutral/positive ids before reusing its classification head.
MODEL_NAME = "ProsusAI/finbert"

# The tokenizer converts raw sentences into the integer features fed to the model
# (input_ids, token_type_ids and attention_mask, where attention_mask marks real tokens vs. padding).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# One shared set of options so the train and validation encodings cannot drift apart.
#   truncation=True + max_length=128 : sequences longer than 128 tokens are cut to 128.
#   padding=True                     : pads only up to the LONGEST sequence in the call, so the
#                                      width is not fixed at 128 and the two encodings may end up
#                                      with different widths. Use padding="max_length" if a fixed
#                                      128-token width is actually required.
TOKENIZER_KWARGS = dict(truncation=True, padding=True, max_length=128)

train_encodings = tokenizer(train_texts, **TOKENIZER_KWARGS)
val_encodings = tokenizer(val_texts, **TOKENIZER_KWARGS)


In [6]:
# Convert to TensorFlow datasets. Each element is a tuple of
# (dict of tokenizer features, integer label).

# Training pipeline: shuffle, then group into batches of 16.
# The shuffle buffer spans the whole training set. A buffer smaller than the dataset (the
# previous fixed 1000) only mixes samples inside a sliding window, so it never produces a
# uniform shuffle of all training samples.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), np.array(train_labels)))
    .shuffle(buffer_size=len(train_labels))
    .batch(16)
)

# Validation pipeline: original order is kept (no shuffle), batches of 32.
# Original author's rationale for the larger batch: no backpropagation happens during
# validation, so memory use per sample is lower.
val_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(val_encodings), np.array(val_labels)))
    .batch(32)
)


In [7]:
# Display the dataset object; its repr lists the tensor spec (shape and dtype) of each
# feature and of the labels.
val_dataset

<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 73), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(None, 73), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 73), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [8]:
# Print the first batch of the validation dataset: a (features dict, labels) tuple.
# take(1) limits iteration to a single batch, so the loop body runs at most once.
for element in val_dataset.take(1):
    print(element)

({'input_ids': <tf.Tensor: shape=(32, 73), dtype=int32, numpy=
array([[ 101, 1996, 2194, ...,    0,    0,    0],
       [ 101, 1043, 8523, ...,    0,    0,    0],
       [ 101, 1999, 2244, ...,    0,    0,    0],
       ...,
       [ 101, 6983, 2810, ...,    0,    0,    0],
       [ 101, 2009, 2097, ...,    0,    0,    0],
       [ 101, 2122, 8777, ...,    0,    0,    0]],
      shape=(32, 73), dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(32, 73), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(32, 73), dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(32, 73), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], shape=(32, 73)