In [1]:
import sys
import os

# Add the parent directory to the path so Python can find the toolbox package
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    print(f"Added {module_path} to sys.path")

Added e:\repo\DistilBERTFinancialSentiment to sys.path


In [2]:
import pandas as pd
from toolbox.utils import get_dataset_dir

german_df = pd.read_csv(get_dataset_dir("financial_phrasebank_multilingual/partial/financial_phrasebank_german"))                       
french_df = pd.read_csv(get_dataset_dir("financial_phrasebank_multilingual/partial/financial_phrasebank_french"))
spanish_df = pd.read_csv(get_dataset_dir("financial_phrasebank_multilingual/partial/financial_phrasebank_spanish"))
english_df = pd.read_csv(get_dataset_dir("financial_phrasebank_multilingual/partial/financial_phrasebank_english"))


In [3]:
german_df = german_df.dropna()
french_df = french_df.dropna()
spanish_df = spanish_df.dropna()
english_df = english_df.dropna()

In [4]:
german_df["lang"] = "de"
french_df["lang"] = "fr"
spanish_df["lang"] = "es"
english_df["lang"] = "en"

In [5]:
english_df = english_df.rename(columns={"Sentence": "sentence", "Sentiment": "sentiment"})

In [6]:
final_dataset = pd.concat([german_df, french_df, spanish_df, english_df], ignore_index=True)
label2id = {"neutral": 0, "positive": 1, "negative": 2}
final_dataset["sentiment"] = final_dataset["sentiment"].map(label2id)
final_dataset["labels"] = final_dataset["sentiment"]
final_dataset = final_dataset[['sentence', 'labels', 'lang']]

In [7]:
# Check language distribution in the original dataset
print("Language distribution in the dataset:")
lang_distribution = final_dataset['lang'].value_counts()
print(lang_distribution)
print(f"\nTotal samples: {len(final_dataset)}")

# Check sentiment distribution by language
print("\nSentiment distribution by language:")
sentiment_by_lang = final_dataset.groupby(['lang', 'labels']).size().unstack()
print(sentiment_by_lang)

Language distribution in the dataset:
lang
de    5842
fr    5842
es    5842
en    5842
Name: count, dtype: int64

Total samples: 23368

Sentiment distribution by language:
labels     0     1    2
lang                   
de      3130  1852  860
en      3130  1852  860
es      3130  1852  860
fr      3130  1852  860


In [8]:
# Replace the previous simple split with a stratified split
from sklearn.model_selection import train_test_split

# Convert the dataframe to a format suitable for Hugging Face datasets
train_df, test_df = train_test_split(
    final_dataset,
    test_size=0.3,
    random_state=42,
    stratify=final_dataset['lang']  # Stratify by language
)

print("Train set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

# Verify language distribution in train and test sets
print("\nLanguage distribution in train set:")
print(train_df['lang'].value_counts())
print("\nLanguage distribution in test set:")
print(test_df['lang'].value_counts())

Train set shape: (16357, 3)
Test set shape: (7011, 3)

Language distribution in train set:
lang
en    4090
fr    4089
es    4089
de    4089
Name: count, dtype: int64

Language distribution in test set:
lang
de    1753
fr    1753
es    1753
en    1752
Name: count, dtype: int64


In [9]:
# Check sentiment distribution in train and test sets
print("Sentiment distribution in train set:")
print(train_df.groupby(['lang', 'labels']).size().unstack())
print("\nSentiment distribution in test set:")
print(test_df.groupby(['lang', 'labels']).size().unstack())

Sentiment distribution in train set:
labels     0     1    2
lang                   
de      2225  1273  591
en      2205  1268  617
es      2236  1248  605
fr      2160  1358  571

Sentiment distribution in test set:
labels    0    1    2
lang                 
de      905  579  269
en      925  584  243
es      894  604  255
fr      970  494  289


In [10]:
# Convert pandas dataframes to Hugging Face datasets
import datasets
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

sentiments = ["neutral", "positive", "negative"]
langs = ["en", "fr", "de", "es"]

train_dataset = train_dataset.cast_column("lang", datasets.ClassLabel(names=langs))
test_dataset = test_dataset.cast_column("lang", datasets.ClassLabel(names=langs))
test_dataset = test_dataset.cast_column("labels", datasets.ClassLabel(names=sentiments))
train_dataset = train_dataset.cast_column("labels", datasets.ClassLabel(names=sentiments))

# Create DatasetDict
dataset_dict = datasets.DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

print(dataset_dict)

Casting the dataset:   0%|          | 0/16357 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7011 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7011 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/16357 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'labels', 'lang'],
        num_rows: 16357
    })
    test: Dataset({
        features: ['sentence', 'labels', 'lang'],
        num_rows: 7011
    })
})


In [11]:
# Save as CSV files for later use
train_df.to_csv("../data/train_subset.csv", index=False)
test_df.to_csv("../data/eval_subset.csv", index=False)

# Also save to parquet format (more efficient for Hugging Face datasets)
final_dataset.to_parquet(
    get_dataset_dir("financial_phrasebank_multilingual/financial_phrasebank_multilingual", "parquet"),
    index=False
)

final_dataset.to_csv(
    get_dataset_dir("financial_phrasebank_multilingual/financial_phrasebank_multilingual", "csv"),
    index=False
)

In [12]:
# Push the balanced dataset to Hugging Face Hub
dataset_dict.push_to_hub("nojedag/financial_phrasebank_multilingual")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/nojedag/financial_phrasebank_multilingual/commit/9bf00435b9e0c513816716a893fe9b1d95c19fb5', commit_message='Upload dataset', commit_description='', oid='9bf00435b9e0c513816716a893fe9b1d95c19fb5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/nojedag/financial_phrasebank_multilingual', endpoint='https://huggingface.co', repo_type='dataset', repo_id='nojedag/financial_phrasebank_multilingual'), pr_revision=None, pr_num=None)