In [3]:
import sys
import os

# Add the parent directory to the path so Python can find the toolbox package
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    print(f"Added {module_path} to sys.path")

Added e:\repo\DistilBERTFinancialSentiment to sys.path


In [4]:
import pandas as pd
from toolbox.utils import get_dataset_dir

german_df = pd.read_csv(get_dataset_dir("financial_phrasebank_multilingual/partial/financial_phrasebank_german"))                       
french_df = pd.read_csv(get_dataset_dir("financial_phrasebank_multilingual/partial/financial_phrasebank_french"))
spanish_df = pd.read_csv(get_dataset_dir("financial_phrasebank_multilingual/partial/financial_phrasebank_spanish"))
english_df = pd.read_csv(get_dataset_dir("financial_phrasebank_multilingual/partial/financial_phrasebank_english"))


In [5]:
german_df = german_df.dropna()
french_df = french_df.dropna()
spanish_df = spanish_df.dropna()
english_df = english_df.dropna()

In [6]:
german_df["lang"] = "de"
french_df["lang"] = "fr"
spanish_df["lang"] = "es"
english_df["lang"] = "en"

In [7]:
english_df = english_df.rename(columns={"Sentence": "sentence", "Sentiment": "sentiment"})

In [8]:
final_dataset = pd.concat([german_df, french_df, spanish_df, english_df], ignore_index=True)

In [9]:
# Check language distribution in the original dataset
print("Language distribution in the dataset:")
lang_distribution = final_dataset['lang'].value_counts()
print(lang_distribution)
print(f"\nTotal samples: {len(final_dataset)}")

# Check sentiment distribution by language
print("\nSentiment distribution by language:")
sentiment_by_lang = final_dataset.groupby(['lang', 'sentiment']).size().unstack()
print(sentiment_by_lang)

Language distribution in the dataset:
lang
de    5842
fr    5842
es    5842
en    5842
Name: count, dtype: int64

Total samples: 23368

Sentiment distribution by language:
sentiment  negative  neutral  positive
lang                                  
de              860     3130      1852
en              860     3130      1852
es              860     3130      1852
fr              860     3130      1852


In [10]:
# Replace the previous simple split with a stratified split
from sklearn.model_selection import train_test_split

# Convert the dataframe to a format suitable for Hugging Face datasets
train_df, test_df = train_test_split(
    final_dataset,
    test_size=0.3,
    random_state=42,
    stratify=final_dataset['lang']  # Stratify by language
)

print("Train set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

# Verify language distribution in train and test sets
print("\nLanguage distribution in train set:")
print(train_df['lang'].value_counts())
print("\nLanguage distribution in test set:")
print(test_df['lang'].value_counts())

Train set shape: (16357, 3)
Test set shape: (7011, 3)

Language distribution in train set:
lang
en    4090
fr    4089
es    4089
de    4089
Name: count, dtype: int64

Language distribution in test set:
lang
de    1753
fr    1753
es    1753
en    1752
Name: count, dtype: int64


In [11]:
# Check sentiment distribution in train and test sets
print("Sentiment distribution in train set:")
print(train_df.groupby(['lang', 'sentiment']).size().unstack())
print("\nSentiment distribution in test set:")
print(test_df.groupby(['lang', 'sentiment']).size().unstack())

Sentiment distribution in train set:
sentiment  negative  neutral  positive
lang                                  
de              591     2225      1273
en              617     2205      1268
es              605     2236      1248
fr              571     2160      1358

Sentiment distribution in test set:
sentiment  negative  neutral  positive
lang                                  
de              269      905       579
en              243      925       584
es              255      894       604
fr              289      970       494


In [12]:
# Convert pandas dataframes to Hugging Face datasets
import datasets
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Create DatasetDict
dataset_dict = datasets.DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['sentiment', 'sentence', 'lang'],
        num_rows: 16357
    })
    test: Dataset({
        features: ['sentiment', 'sentence', 'lang'],
        num_rows: 7011
    })
})



In [13]:
# Save as CSV files for later use
train_df.to_csv("../data/train_subset.csv", index=False)
test_df.to_csv("../data/eval_subset.csv", index=False)

# Also save to parquet format (more efficient for Hugging Face datasets)
final_dataset.to_parquet(
    get_dataset_dir("financial_phrasebank_multilingual/financial_phrasebank_multilingual", "parquet"),
    index=False
)

final_dataset.to_csv(
    get_dataset_dir("financial_phrasebank_multilingual/financial_phrasebank_multilingual", "csv"),
    index=False
)

In [14]:
# Push the balanced dataset to Hugging Face Hub
dataset_dict.push_to_hub("nojedag/financial_phrasebank_multilingual")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/nojedag/financial_phrasebank_multilingual/commit/62b59f31a93d87fd0696c7171a1f742b5a1b2cff', commit_message='Upload dataset', commit_description='', oid='62b59f31a93d87fd0696c7171a1f742b5a1b2cff', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/nojedag/financial_phrasebank_multilingual', endpoint='https://huggingface.co', repo_type='dataset', repo_id='nojedag/financial_phrasebank_multilingual'), pr_revision=None, pr_num=None)