In [1]:
import sys
import os

# Put the parent directory of this notebook on the import path so that the
# local `toolbox` package can be imported further down.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    print(f"Added {module_path} to sys.path")

Added e:\repo\DistilBERTFinancialSentiment to sys.path


In [2]:
from datasets import load_dataset
import pandas as pd

# Load the source dataset by its identifier; later cells read its "train" split.
source_dataset_id = "TimKoornstra/financial-tweets-sentiment"
ds = load_dataset(source_dataset_id)

In [3]:
# Build the working frame from the "train" split:
#   * copy `tweet` into a string-typed `sentence` column,
#   * drop the original `tweet` and `url` columns,
#   * tag every row as English.
ds_df = (
    ds["train"]
    .to_pandas()
    .assign(sentence=lambda frame: frame["tweet"].astype(str))
    .drop(columns=["tweet", "url"])
    .assign(lang="en")
)

def remove_urls(text):
    """Return ``text`` without the whitespace-separated tokens that start with ``http``.

    The surviving tokens are re-joined with single spaces, so any run of
    whitespace (newlines included) collapses to one space.
    """
    kept_tokens = []
    for token in text.split():
        if token.startswith('http'):
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def remove_retweets(text):
    """Keep only what precedes the first ``"RT @"`` marker, with outer whitespace stripped.

    Text without the marker is returned stripped; text that starts with the
    marker yields an empty string.
    """
    before_marker, _, _ = text.partition("RT @")
    return before_marker.strip()

def remove_large_strings(text):
    """Truncate ``text`` to at most 512 characters.

    Despite the name, nothing is removed: shorter strings are returned
    unchanged and longer ones are cut down to their first 512 characters.
    """
    limit = 512
    return text[:limit]

# Clean every sentence in order: strip URL tokens, cut at the first retweet
# marker, then cap the length.
ds_df["sentence"] = (
    ds_df["sentence"]
    .apply(remove_urls)
    .apply(remove_retweets)
    .apply(remove_large_strings)
)

# Cleaning can leave nothing behind (a tweet that is only a URL, or one that
# starts with "RT @"). Drop those rows so that no sentiment label stays
# attached to an empty sentence and no empty string reaches the translators.
ds_df = ds_df[ds_df["sentence"].str.len() > 0].reset_index(drop=True)

In [4]:
from transformers import pipeline

# One Helsinki-NLP opus-mt checkpoint per target language, keyed by language code.
_translation_models = {
    "fr": "Helsinki-NLP/opus-mt-en-fr",
    "de": "Helsinki-NLP/opus-mt-en-de",
    "es": "Helsinki-NLP/opus-mt-en-es",
}

# Every pipeline is created with the same settings: device index 0,
# batches of 32 inputs, truncation enabled.
pipe_collection = {
    target_lang: pipeline("translation", model=model_id, device=0, batch_size=32, truncation=True)
    for target_lang, model_id in _translation_models.items()
}

# Individual handles, kept under their original names.
pipe_en_fr = pipe_collection["fr"]
pipe_en_de = pipe_collection["de"]
pipe_en_es = pipe_collection["es"]

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


In [5]:
def split_text(text, tokenizer, max_tokens=512):
    """Split ``text`` into pieces of at most ``max_tokens`` tokens each.

    The text is tokenized *without* special tokens and each chunk is decoded
    with special tokens skipped, so the returned strings contain only text
    from the input (no stray ``</s>``-style markers, and no extra chunk made
    of nothing but an end-of-sequence token).

    Args:
        text: The string to split.
        tokenizer: An object exposing Hugging Face style
            ``encode(text, add_special_tokens=..., truncation=...)`` and
            ``decode(ids, skip_special_tokens=...)`` methods.
        max_tokens: Maximum number of tokens per chunk; must be >= 1.

    Returns:
        A list of decoded chunk strings, in order. Empty when ``text``
        produces no tokens.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens}")

    # Special tokens are left out here; otherwise they would be counted
    # against the chunk size and rendered back into the decoded text.
    tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False)
    chunks = [tokens[i:i + max_tokens] for i in range(0, len(tokens), max_tokens)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

In [6]:
# Translate the English sentences once per target language and stack the
# translated copies underneath the English originals.
sentence_list = ds_df["sentence"].tolist()  # identical for every language, so build it once

frames = [ds_df.copy()]  # English rows come first
for lang, pipe in pipe_collection.items():
    translated_sentences = [item["translation_text"] for item in pipe(sentence_list)]
    # Same rows and labels as the source frame; only the text and language tag change.
    lang_df = ds_df.assign(sentence=translated_sentences, lang=lang)
    frames.append(lang_df)

# A single concat at the end avoids re-copying the growing frame on every iteration.
result_df = pd.concat(frames, ignore_index=True)

# One block of rows per language (English plus each translation).
assert len(result_df) == len(ds_df) * len(frames)


In [7]:
# Row counts per language, followed by the overall total.
lang_distribution = result_df['lang'].value_counts()
print("Language distribution in the dataset:")
print(lang_distribution)
print(f"\nTotal samples: {len(result_df)}")

# Sentiment counts per language (rows: language, columns: sentiment label).
sentiment_by_lang = (
    result_df
    .groupby(['lang', 'sentiment'])
    .size()
    .unstack()
)
print("\nSentiment distribution by language:")
print(sentiment_by_lang)

Language distribution in the dataset:
lang
en    38091
fr    38091
de    38091
es    38091
Name: count, dtype: int64

Total samples: 152364

Sentiment distribution by language:
sentiment      0      1     2
lang                         
de         12181  17368  8542
en         12181  17368  8542
es         12181  17368  8542
fr         12181  17368  8542


In [8]:
# Split the combined dataset into train and test sets, stratified by language
from sklearn.model_selection import train_test_split

# 70/30 train/test split with a fixed seed, keeping the language proportions
# equal in both parts.
# NOTE(review): rows are split independently, so if result_df holds several
# language versions of the same tweet they can end up on opposite sides of
# the split — confirm this is acceptable for evaluation.
train_df, test_df = train_test_split(
    result_df,
    test_size=0.3,
    random_state=42,
    stratify=result_df['lang'],
)

for label, split_df in (("Train set shape:", train_df), ("Test set shape:", test_df)):
    print(label, split_df.shape)

# Verify that each language is represented equally in both parts.
for heading, split_df in (
    ("\nLanguage distribution in train set:", train_df),
    ("\nLanguage distribution in test set:", test_df),
):
    print(heading)
    print(split_df['lang'].value_counts())

Train set shape: (106654, 3)
Test set shape: (45710, 3)

Language distribution in train set:
lang
en    26664
fr    26664
de    26663
es    26663
Name: count, dtype: int64

Language distribution in test set:
lang
de    11428
es    11428
en    11427
fr    11427
Name: count, dtype: int64


In [9]:
# Sentiment counts per language for each split
# (rows: language, columns: sentiment label).
for heading, split_df in (
    ("Sentiment distribution in train set:", train_df),
    ("\nSentiment distribution in test set:", test_df),
):
    print(heading)
    print(split_df.groupby(['lang', 'sentiment']).size().unstack())

Sentiment distribution in train set:
sentiment     0      1     2
lang                        
de         8472  12181  6010
en         8573  12068  6023
es         8543  12139  5981
fr         8486  12192  5986

Sentiment distribution in test set:
sentiment     0     1     2
lang                       
de         3709  5187  2532
en         3608  5300  2519
es         3638  5229  2561
fr         3695  5176  2556


In [10]:
import datasets
from datasets import Dataset

# Label vocabularies; the position of a name in the list is its class id.
sentiments = ["neutral", "positive", "negative"]
langs = ["en", "fr", "de", "es"]

lang_feature = datasets.ClassLabel(names=langs)
sentiment_feature = datasets.ClassLabel(names=sentiments)

# Convert each split to a Hugging Face Dataset (without the pandas index)
# and type `lang` as a ClassLabel.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False).cast_column("lang", lang_feature)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False).cast_column("lang", lang_feature)

# Type `sentiment` as a ClassLabel as well.
test_dataset = test_dataset.cast_column("sentiment", sentiment_feature)
train_dataset = train_dataset.cast_column("sentiment", sentiment_feature)

# Bundle both splits under their split names.
dataset_dict = datasets.DatasetDict({'train': train_dataset, 'test': test_dataset})

print(dataset_dict)

Casting the dataset:   0%|          | 0/106654 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/45710 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/45710 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/106654 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentiment', 'sentence', 'lang'],
        num_rows: 106654
    })
    test: Dataset({
        features: ['sentiment', 'sentence', 'lang'],
        num_rows: 45710
    })
})


In [11]:
# Calculate the class weights
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Balanced class weights for the labels in the 'sentiment' column.
# NOTE(review): the weights are computed over result_df (every row), not only
# over train_df — confirm that is intended.
sentiment_labels = result_df['sentiment']
classes = np.unique(sentiment_labels)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=sentiment_labels,
)

# Key each weight by its actual class label rather than by its position in
# the array, so the mapping stays correct even if the labels are not exactly
# 0..n-1. `class_weights[i]` corresponds to `classes[i]`.
class_weights_dict = dict(zip(classes.tolist(), class_weights.tolist()))

# Print the class weights so they can be copied and pasted into the code
# or used directly in the training script
print("\nClass weights for each sentiment class:")
for sentiment, weight in class_weights_dict.items():
    print(f"Sentiment {sentiment}: {weight:.4f}")


Class weights for each sentiment class:
Sentiment 0: 1.0424
Sentiment 1: 0.7311
Sentiment 2: 1.4864


In [12]:
from toolbox.utils import get_dataset_dir

# Save the splits as CSV files for later use. `to_csv` does not create
# missing directories, so make sure the target folder exists first.
os.makedirs("../data/_new", exist_ok=True)
train_df.to_csv("../data/_new/train_subset.csv", index=False)
test_df.to_csv("../data/_new/eval_subset.csv", index=False)

# Also save the full (unsplit) frame as parquet and as CSV at the locations
# returned by the project's `get_dataset_dir` helper.
# NOTE(review): the target name says "financial_phrasebank_multilingual" but
# this notebook processes a financial *tweets* dataset — confirm this is not a
# copy-paste leftover that overwrites another dataset's files.
result_df.to_parquet(
    get_dataset_dir("financial_phrasebank_multilingual/financial_phrasebank_multilingual", "parquet"),
    index=False
)

result_df.to_csv(
    get_dataset_dir("financial_phrasebank_multilingual/financial_phrasebank_multilingual", "csv"),
    index=False
)

In [13]:
# Upload both splits to the Hugging Face Hub; the call stays the last
# expression so the notebook displays the value it returns.
hub_repo_id = "nojedag/financial-tweets-sentiment-multilingual"
dataset_dict.push_to_hub(hub_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/107 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/641 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/nojedag/financial-tweets-sentiment-multilingual/commit/dc0bb890834565db17c34a9180cbd4ce2d6c0f49', commit_message='Upload dataset', commit_description='', oid='dc0bb890834565db17c34a9180cbd4ce2d6c0f49', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/nojedag/financial-tweets-sentiment-multilingual', endpoint='https://huggingface.co', repo_type='dataset', repo_id='nojedag/financial-tweets-sentiment-multilingual'), pr_revision=None, pr_num=None)