In [12]:
import pandas as pd
from utils import get_dataset_dir

german_df = pd.read_csv(get_dataset_dir("financial_phrasebank_german"))                       
french_df = pd.read_csv(get_dataset_dir("financial_phrasebank_french"))
spanish_df = pd.read_csv(get_dataset_dir("financial_phrasebank_spanish"))
english_df = pd.read_csv(get_dataset_dir("financial_sentiment_analysis"))


In [13]:
german_df = german_df.dropna()
french_df = french_df.dropna()
spanish_df = spanish_df.dropna()
english_df = english_df.dropna()

In [14]:
german_df["lang"] = "de"
french_df["lang"] = "fr"
spanish_df["lang"] = "es"
english_df["lang"] = "en"

In [15]:
english_df = english_df.rename(columns={"Sentence": "sentence", "Sentiment": "sentiment"})

In [16]:
final_dataset = pd.concat([german_df, french_df, spanish_df, english_df], ignore_index=True)

In [17]:
final_dataset.to_csv(get_dataset_dir("financial_phrasebank_multilingual"), index=False)

# Analyze Language Distribution

Before creating train and test splits, we need to ensure that data is balanced across all four languages (German, French, Spanish, and English) in both sets.

In [18]:
# Check language distribution in the original dataset
print("Language distribution in the dataset:")
lang_distribution = final_dataset['lang'].value_counts()
print(lang_distribution)
print(f"\nTotal samples: {len(final_dataset)}")

# Check sentiment distribution by language
print("\nSentiment distribution by language:")
sentiment_by_lang = final_dataset.groupby(['lang', 'sentiment']).size().unstack()
print(sentiment_by_lang)

Language distribution in the dataset:
lang
de    5842
fr    5842
es    5842
en    5842
Name: count, dtype: int64

Total samples: 23368

Sentiment distribution by language:
sentiment  negative  neutral  positive
lang                                  
de              860     3130      1852
en              860     3130      1852
es              860     3130      1852
fr              860     3130      1852


# Implement Stratified Split by Language

To ensure balanced representation across languages in both train and test sets, we'll use a stratified split based on the 'lang' column.

In [19]:
# Replace the previous simple split with a stratified split
from sklearn.model_selection import train_test_split

# Convert the dataframe to a format suitable for Hugging Face datasets
train_df, test_df = train_test_split(
    final_dataset,
    test_size=0.3,
    random_state=42,
    stratify=final_dataset['lang']  # Stratify by language
)

print("Train set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

# Verify language distribution in train and test sets
print("\nLanguage distribution in train set:")
print(train_df['lang'].value_counts())
print("\nLanguage distribution in test set:")
print(test_df['lang'].value_counts())

Train set shape: (16357, 3)
Test set shape: (7011, 3)

Language distribution in train set:
lang
en    4090
fr    4089
es    4089
de    4089
Name: count, dtype: int64

Language distribution in test set:
lang
de    1753
fr    1753
es    1753
en    1752
Name: count, dtype: int64


In [20]:
# Check sentiment distribution in train and test sets
print("Sentiment distribution in train set:")
print(train_df.groupby(['lang', 'sentiment']).size().unstack())
print("\nSentiment distribution in test set:")
print(test_df.groupby(['lang', 'sentiment']).size().unstack())

Sentiment distribution in train set:
sentiment  negative  neutral  positive
lang                                  
de              591     2225      1273
en              617     2205      1268
es              605     2236      1248
fr              571     2160      1358

Sentiment distribution in test set:
sentiment  negative  neutral  positive
lang                                  
de              269      905       579
en              243      925       584
es              255      894       604
fr              289      970       494


# Further Balancing (if needed)

If the languages have very different sample sizes, we might want to balance them by either:
1. Undersampling the majority languages
2. Oversampling the minority languages

Let's implement a function to balance the dataset:

In [21]:
def balance_languages(df, max_samples_per_lang=None):
    """
    Balance the dataset by ensuring equal representation of each language.
    
    Args:
        df: DataFrame containing the multilingual data
        max_samples_per_lang: Maximum samples per language. If None, uses the minimum count across languages.
    
    Returns:
        Balanced DataFrame
    """
    # Get language counts
    lang_counts = df['lang'].value_counts()
    
    # Determine sample size per language
    if max_samples_per_lang is None:
        max_samples_per_lang = lang_counts.min()
    
    # Sample equal amounts from each language
    balanced_dfs = []
    for lang in lang_counts.index:
        lang_df = df[df['lang'] == lang]
        # Ensure we also maintain sentiment distribution within each language
        stratified_sample = lang_df.groupby('sentiment', group_keys=False)
        stratified_sample = stratified_sample.apply(lambda x: x.sample(
            n=min(len(x), int(max_samples_per_lang * len(x) / len(lang_df))),
            random_state=42
        ))
        balanced_dfs.append(stratified_sample)
    
    # Combine all balanced language dataframes
    balanced_df = pd.concat(balanced_dfs, ignore_index=True)
    
    return balanced_df

In [22]:
# Apply balancing to train and test sets (if needed)
# Uncomment these lines if you want to enforce complete balance

# Check if balancing is needed
train_lang_counts = train_df['lang'].value_counts()
test_lang_counts = test_df['lang'].value_counts()

print(f"Before balancing - Train language counts: {train_lang_counts.to_dict()}")
print(f"Before balancing - Test language counts: {test_lang_counts.to_dict()}")

# Apply balancing if there's significant imbalance
if train_lang_counts.max() / train_lang_counts.min() > 1.5:  # Threshold for imbalance
    print("Balancing train dataset...")
    train_df = balance_languages(train_df)
    
if test_lang_counts.max() / test_lang_counts.min() > 1.5:  # Threshold for imbalance
    print("Balancing test dataset...")
    test_df = balance_languages(test_df)
    
print(f"\nAfter balancing - Train language counts: {train_df['lang'].value_counts().to_dict()}")
print(f"After balancing - Test language counts: {test_df['lang'].value_counts().to_dict()}")

Before balancing - Train language counts: {'en': 4090, 'fr': 4089, 'es': 4089, 'de': 4089}
Before balancing - Test language counts: {'de': 1753, 'fr': 1753, 'es': 1753, 'en': 1752}

After balancing - Train language counts: {'en': 4090, 'fr': 4089, 'es': 4089, 'de': 4089}
After balancing - Test language counts: {'de': 1753, 'fr': 1753, 'es': 1753, 'en': 1752}


In [23]:
# Convert pandas dataframes to Hugging Face datasets
import datasets
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Create DatasetDict
dataset_dict = datasets.DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['sentiment', 'sentence', 'lang', '__index_level_0__'],
        num_rows: 16357
    })
    test: Dataset({
        features: ['sentiment', 'sentence', 'lang', '__index_level_0__'],
        num_rows: 7011
    })
})


In [24]:
# Save as CSV files for later use
train_df.to_csv("../data/train_subset.csv", index=False)
test_df.to_csv("../data/eval_subset.csv", index=False)

# Also save to parquet format (more efficient for Hugging Face datasets)
final_dataset.to_parquet(get_dataset_dir("financial_phrasebank_multilingual.parquet"), index=False)

In [25]:
# Push the balanced dataset to Hugging Face Hub
dataset_dict.push_to_hub("nojedag/financial_phrasebank_multilingual")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/nojedag/financial_phrasebank_multilingual/commit/e426e31523a84d438fa2981000c61ef4079192cf', commit_message='Upload dataset', commit_description='', oid='e426e31523a84d438fa2981000c61ef4079192cf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/nojedag/financial_phrasebank_multilingual', endpoint='https://huggingface.co', repo_type='dataset', repo_id='nojedag/financial_phrasebank_multilingual'), pr_revision=None, pr_num=None)