In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("sbhatti/financial-sentiment-analysis")

print("Path to dataset files:", path)

Path to dataset files: /Users/nojeda/.cache/kagglehub/datasets/sbhatti/financial-sentiment-analysis/versions/4


In [3]:
# Load the dataset and display some values
df = pd.read_csv(f'{path}/data.csv')

In [4]:
df.isnull().sum()

Sentence     0
Sentiment    0
dtype: int64

In [5]:
df['Sentiment'].value_counts()

Sentiment
neutral     3130
positive    1852
negative     860
Name: count, dtype: int64

In [6]:
train, eval = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Sentiment'])
train.shape, eval.shape

((4673, 2), (1169, 2))

In [7]:
train.to_csv("data/train_subset.csv", index=False)
eval.to_csv("data/eval_subset.csv", index=False)

In [8]:
dataset = load_dataset('csv', data_files={'train': 'data/train_subset.csv', 'eval': 'data/eval_subset.csv'})

Generating train split: 4673 examples [00:00, 416533.47 examples/s]
Generating eval split: 1169 examples [00:00, 247720.98 examples/s]


In [9]:
def transform_labels(label):

    label = label['Sentiment']
    num = 0
    if label == 'negative': #'Negative'
        num = 0
    elif label == 'neutral': #'Neutral'
        num = 1
    elif label == 'positive': #'Positive'
        num = 2

    return {'labels': num}

dataset = dataset.map(transform_labels)

Map: 100%|██████████| 4673/4673 [00:00<00:00, 36877.98 examples/s]
Map: 100%|██████████| 1169/1169 [00:00<00:00, 51629.95 examples/s]


In [10]:
# let's train a Distilbert model

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# let's tokenize the data for the model to be able to understand
def tokenize_data(example):
    return tokenizer(example['Sentence'], padding='max_length')    

In [11]:
dataset = dataset.map(tokenize_data, batched=True)

Map: 100%|██████████| 4673/4673 [00:00<00:00, 11086.88 examples/s]
Map: 100%|██████████| 1169/1169 [00:00<00:00, 11211.25 examples/s]


In [12]:
from transformers import AutoModelForSequenceClassification

# Loading a pretrain model while specifying the number of labels in our dataset for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# let's set the training arguements

# the default batch size for training arguments
batch_size = 8

# set number of epochs
number_of_epochs = 7
# let set the logging steps
logging_steps = len(dataset['train']) // batch_size # it should log each batch 

steps = (len(dataset['train']) / batch_size) * number_of_epochs
warmup_steps = int(0.2 * steps)



from transformers import TrainingArguments
training_args = TrainingArguments(
                                  num_train_epochs=number_of_epochs, 
                                  load_best_model_at_end=True,
                                  evaluation_strategy='steps', 
                                  save_strategy='steps',
                                  learning_rate=2e-5,
                                  logging_steps=logging_steps,
                                  warmup_steps= warmup_steps,
                                  save_steps=1000,
                                  eval_steps=500,
                                  output_dir="fine-tuned-distilbert-base-uncased"
                                  )



In [14]:
# shuffle the datasets

train_dataset = dataset['train'].shuffle(seed=10) 
eval_dataset = dataset['eval'].shuffle(seed=10)

In [15]:
from transformers import Trainer

trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Launch the learning process: training
# Load model from checkpoint
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Logging

Step,Training Loss,Validation Loss
500,No log,0.535736
1000,0.814900,0.473491
1500,0.456800,0.537131
2000,0.311800,0.680341
2500,0.228300,0.686948
3000,0.179300,0.76904
3500,0.179300,0.80909
4000,0.158000,0.854674


TrainOutput(global_step=4095, training_loss=0.3267479075180305, metrics={'train_runtime': 4240.5306, 'train_samples_per_second': 7.714, 'train_steps_per_second': 0.966, 'total_flos': 4333218352856064.0, 'train_loss': 0.3267479075180305, 'epoch': 7.0})

In [21]:
import numpy as np
from sklearn.metrics import mean_squared_error
import evaluate

def compute_metrics(eval_pred):
    # load the metrics to use
    load_accuracy = evaluate.load("accuracy")
    load_f1 = evaluate.load("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # calculate the mertic using the predicted and true value 
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)
    f1 = load_f1.compute(predictions=predictions, references=labels, average="weighted")
    return {"accuracy": accuracy, "f1score": f1}

In [22]:
trainer_eval = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [23]:
trainer_eval.evaluate()

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 4.51MB/s]
Downloading builder script: 100%|██████████| 6.79k/6.79k [00:00<00:00, 12.9MB/s]
Trainer is attempting to log a value of "{'accuracy': 0.8092386655260907}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.7849678206328778}" of type <class 'dict'> for key "eval/f1score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.4734910726547241,
 'eval_model_preparation_time': 0.0022,
 'eval_accuracy': {'accuracy': 0.8092386655260907},
 'eval_f1score': {'f1': 0.7849678206328778},
 'eval_runtime': 28.1866,
 'eval_samples_per_second': 41.474,
 'eval_steps_per_second': 5.215}

In [25]:
trainer.push_to_hub()
trainer_eval.push_to_hub()

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/nojedag/fine-tuned-distilbert-base-uncased/commit/680e63fd534b057c43c8de558bd9cbee89dc21e1', commit_message='End of training', commit_description='', oid='680e63fd534b057c43c8de558bd9cbee89dc21e1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nojedag/fine-tuned-distilbert-base-uncased', endpoint='https://huggingface.co', repo_type='model', repo_id='nojedag/fine-tuned-distilbert-base-uncased'), pr_revision=None, pr_num=None)

In [1]:
# Classify a new sentence
sentence = "I love this product"
inputs = tokenizer(sentence, return_tensors="pt")
outputs = model(**inputs)
predicted = torch.argmax(outputs.logits)
predicted_label = model.config.id2label[predicted.item()]
print(f"The sentiment of the sentence is: {predicted_label}")

NameError: name 'tokenizer' is not defined