In [78]:
import polars as pl

df = pl.read_csv("/kaggle/input/finance-news-sentiments/dataset.csv")
df

sentiment,text
str,str
"""positive""","""All banks, lending institution…"
"""neutral""","""Not so fast, Drake..."""
"""positive""","""FNF - ong 19.43. Trailing Sto…"
"""positive""","""Dow opens down almost 500 poin…"
"""positive""","""U.S. weekly active oil-rig cou…"
…,…
"""negative""","""Airbnb to lose £325m in London…"
"""positive""","""Tesla surpasses 2019 goal and …"
"""positive""","""Hundreds of French workers at …"
"""neutral""","""Emerging-economy central banks…"


In [79]:
df = df.rename({"sentiment": "labels"})
df = df.with_columns(
    pl.when(pl.col("labels") == "negative").then(0)
    .when(pl.col("labels") == "neutral").then(1)
    .when(pl.col("labels") == "positive").then(2)
    .otherwise(None)
    .alias("labels")
)
df

labels,text
i32,str
2,"""All banks, lending institution…"
1,"""Not so fast, Drake..."""
2,"""FNF - ong 19.43. Trailing Sto…"
2,"""Dow opens down almost 500 poin…"
2,"""U.S. weekly active oil-rig cou…"
…,…
0,"""Airbnb to lose £325m in London…"
2,"""Tesla surpasses 2019 goal and …"
2,"""Hundreds of French workers at …"
1,"""Emerging-economy central banks…"


In [80]:
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

def tokenize_batch(batch):
    texts = [str(text) for text in batch["text"]]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

tokenized_dataset = Dataset.from_polars(df).map(tokenize_batch, batched=True)
tokenized_dataset

Map:   0%|          | 0/32583 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 32583
})

In [81]:
from datasets import DatasetDict

train_df = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
val_test_df = train_df["test"].train_test_split(test_size=0.5, seed=42)

dataset_splits = DatasetDict({
    'train': train_df['train'],
    'validation': val_test_df['train'],
    'test': val_test_df['test']
})
dataset_splits

DatasetDict({
    train: Dataset({
        features: ['labels', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 26066
    })
    validation: Dataset({
        features: ['labels', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3258
    })
    test: Dataset({
        features: ['labels', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3259
    })
})

In [82]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer

model = BertForSequenceClassification.from_pretrained(
    "nlptown/bert-base-multilingual-uncased-sentiment",
    num_labels=3,
    ignore_mismatched_sizes=True
)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits['train'],
    eval_dataset=dataset_splits['validation'],
    data_collator=data_collator
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6837,0.630348
2,0.5214,0.629911
3,0.385,0.694007


TrainOutput(global_step=4890, training_loss=0.5545690930450377, metrics={'train_runtime': 1102.5231, 'train_samples_per_second': 70.926, 'train_steps_per_second': 4.435, 'total_flos': 5143735759873536.0, 'train_loss': 0.5545690930450377, 'epoch': 3.0})

In [84]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.6940070986747742, 'eval_runtime': 11.5249, 'eval_samples_per_second': 282.693, 'eval_steps_per_second': 17.701, 'epoch': 3.0}


In [85]:
from transformers import pipeline

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

texts = [
    "The company's quarterly earnings exceeded expectations.",
    "The product received negative reviews from customers."
]

predictions = classifier(texts)
for text, pred in zip(texts, predictions):
    print(f"Text: {text}\nPrediction: {pred}\n")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Text: The company's quarterly earnings exceeded expectations.
Prediction: {'label': 'LABEL_2', 'score': 0.9782863855361938}

Text: The product received negative reviews from customers.
Prediction: {'label': 'LABEL_0', 'score': 0.9852319955825806}



In [86]:
trainer.save_model("/kaggle/working/model")