# OCEAN Token Sentiment Analysis Challenge

### Text - Tweet Model Training (Hugging Face)
By Luca Ordronneau

In [1]:
import os
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Select the compute device: prefer Apple-silicon MPS (the original target), then CUDA,
# and fall back to CPU so the notebook still runs on machines without an accelerator.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='mps')

In [3]:
# Pretrained checkpoint to fine-tune
MODEL = "distilbert-base-uncased"
# Name of the fine-tuned model; derived from MODEL so the two cannot drift apart
NAME  = f"{MODEL}-finetuned-OCEAN-sentiment"

In [4]:
# Read the train/test CSV splits and encode the "label" column as class ids in one chain
dataset = load_dataset(
    'csv',
    data_files={'train': '../data/train.csv', 'test': '../data/test.csv'},
).class_encode_column("label")

Found cached dataset csv (/Users/lucaordronneau/.cache/huggingface/datasets/csv/default-e7006a8f1738f64d/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
100%|██████████| 2/2 [00:00<00:00, 80.34it/s]
Loading cached processed dataset at /Users/lucaordronneau/.cache/huggingface/datasets/csv/default-e7006a8f1738f64d/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-118b9142fbf36f6f.arrow
Loading cached processed dataset at /Users/lucaordronneau/.cache/huggingface/datasets/csv/default-e7006a8f1738f64d/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-d42c58e7229d84e5.arrow
Loading cached processed dataset at /Users/lucaordronneau/.cache/huggingface/datasets/csv/default-e7006a8f1738f64d/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-e5967c733c894cea.arrow
Loading cached processed dataset at /Users/lucaordronneau/.cache/huggingface/datasets/csv/default-e7006a8f1738f64d/0.0.0/eea64c71ca8

In [5]:
from collections import Counter

# Tally how many training rows carry each label id
train_labels = dataset["train"]['label']
count_labels = Counter(train_labels)
count_labels

Counter({2: 64898, 0: 49655, 1: 7763})

### Class Weights

In [6]:
# Class weights: each class gets 1 - (its share of the training rows),
# so the less frequent a class is, the larger its weight.
# The total is computed once from the counts themselves, so it stays correct
# even if the label ids are not a contiguous 0..n-1 range.
total_count = sum(count_labels.values())
c_0, c_1, c_2 = (1 - count_labels[label] / total_count for label in range(3))

c_0, c_1, c_2

(0.5940432976879557, 0.9365332417672259, 0.4694234605448183)

In [7]:
# Load the tokenizer that belongs to the pretrained checkpoint named by MODEL
tokenizer = AutoTokenizer.from_pretrained(MODEL)

### Choose max length for tokenisation
Use the longest tokenized training text as the maximum length, to avoid excessive padding.

In [8]:
# Token count of every training text (encoding is truncated at 512 tokens)
token_lens = [
    len(tokenizer.encode(text, max_length=512, truncation=True))
    for text in dataset["train"]["text"]
]

# The longest observed sequence becomes the tokenization length used later
max_len = np.max(token_lens)
max_len

225

In [9]:
def tokenize(batch):
    """Tokenize the ``text`` entries of a batch.

    Calls the notebook-level ``tokenizer`` with padding enabled and truncation
    at ``max_len`` tokens, and returns the tokenizer's output unchanged.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, max_length=max_len, padding=True)
    return encoded

# Apply the tokenizer to every split, batch by batch
dataset_encoded = dataset.map(tokenize, batched=True)

Loading cached processed dataset at /Users/lucaordronneau/.cache/huggingface/datasets/csv/default-e7006a8f1738f64d/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-f25286d00d2e2dbe.arrow
Loading cached processed dataset at /Users/lucaordronneau/.cache/huggingface/datasets/csv/default-e7006a8f1738f64d/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-c63ebd54d0333704.arrow


In [10]:
# Show the splits and their columns after tokenization
dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 122316
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 30579
    })
})

## Evaluation metric
### F1 Score

In [11]:
from sklearn.metrics import accuracy_score, f1_score

def computes_metrics(pred):
    """Compute accuracy and weighted F1 for an evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the last axis).

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (F1 uses ``average="weighted"``).
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

## Model configuration

In [12]:
# Number of classes, read from the encoded "label" feature of the training split
NUM_LABELS = dataset_encoded["train"].features["label"].num_classes

# Load the pretrained checkpoint as a sequence classifier with NUM_LABELS outputs
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.we

In [13]:
# Human-readable class names, listed in label-id order (0, 1, 2).
# NOTE(review): this assumes the ids produced by class_encode_column map to
# Bearish/Neutral/Bullish in this order — confirm against the raw CSV labels.
label_names = ('Bearish', 'Neutral', 'Bullish')
model.config.id2label = dict(enumerate(label_names))
model.config.label2id = {name: idx for idx, name in enumerate(label_names)}
# Set num_labels after the mappings, as in the original cell
model.config.num_labels = NUM_LABELS

In [14]:
# Display the model configuration to check the label mappings
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "Bearish",
    "1": "Neutral",
    "2": "Bullish"
  },
  "initializer_range": 0.02,
  "label2id": {
    "Bearish": 0,
    "Bullish": 2,
    "Neutral": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.30.2",
  "vocab_size": 30522
}

In [15]:
# Hugging Face access token, read from the environment so it is never stored in the
# notebook; falls back to an empty string when HF_TOKEN is not set.
API_KEY = os.environ.get("HF_TOKEN", "")

In [16]:
import torch
from torch import nn

class CustomTrainer(Trainer):
    """Trainer that replaces the default loss with a class-weighted cross-entropy.

    The per-class weights are the notebook-level values ``c_0``, ``c_1`` and ``c_2``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: batch mapping passed to the model; its ``"labels"`` entry
                holds the target class ids.
            return_outputs: when true, return ``(loss, outputs)`` instead of ``loss``.
            **kwargs: ignored; accepted so that Trainer versions which pass extra
                keyword arguments (e.g. ``num_items_in_batch``) keep working.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple if ``return_outputs``.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weights on the same device and dtype as the logits rather than on a
        # notebook-level device, so the loss works wherever the Trainer placed the model.
        class_weights = torch.tensor(
            [c_0, c_1, c_2], dtype=logits.dtype, device=logits.device
        )
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [17]:
# Re-display the device selected for training
device

device(type='mps')

### Training was done on Kaggle; the results of the run are provided in the report
Link to the Hugging Face model: https://huggingface.co/lucaordronneau/distilbert-base-uncased-finetuned-OCEAN-sentiment

In [None]:
batch_size = 48

# Training configuration for a single fine-tuning epoch
training_args = TrainingArguments(
    output_dir=NAME,
    # optimisation
    num_train_epochs=1,
    learning_rate=6e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate and checkpoint once per epoch, limiting the checkpoints kept on disk
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # at the end, reload the checkpoint that scored best on "eval_f1"
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    disable_tqdm=False,
    # optional experiment tracking / Hub upload (left disabled):
    # report_to="wandb",
    # run_name=NAME,
    # hub_token = API_KEY,
    # push_to_hub=True,
)

# Trainer with the class-weighted loss; the test split is used for evaluation
trainer = CustomTrainer(
    args=training_args,
    model=model.to(device),
    tokenizer=tokenizer,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["test"],
    compute_metrics=computes_metrics,
)

trainer.train()