In [22]:
import sys
import os

# Make the repository root (one level above this notebook) importable so that
# `toolbox` can be resolved as a package.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
    print(f"Added {module_path} to sys.path")

In [23]:
import pandas as pd
import os
from toolbox.utils import get_dataset_dir

In [24]:
# Resolve the directory that holds the partial dataset files.
dataset_dir = get_dataset_dir("synthetic_financial_sentiment/partial", ext=None, only_dir=True)
# Legacy (misspelled) name kept so any cell that still refers to it keeps working.
dataste_dir = dataset_dir

# os.listdir returns entries in arbitrary order; sort them so the combined frame
# (and therefore the seeded train/test split built from it) is the same on every
# machine. Hidden entries and sub-directories (e.g. .ipynb_checkpoints) are
# skipped because they are not dataset files.
files = sorted(
    name
    for name in os.listdir(dataset_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(dataset_dir, name))
)
if not files:
    raise FileNotFoundError(f"No dataset files found in {dataset_dir}")

# Read each file once and concatenate in a single call instead of growing the
# frame inside the loop, which re-copies all accumulated rows on every iteration.
final_dataset = pd.concat(
    [pd.read_csv(os.path.join(dataset_dir, name)) for name in files],
    ignore_index=True,
)

# Encode the textual sentiment as integer ids.
label2id = {"neutral": 0, "positive": 1, "negative": 2}
sentiment_ids = final_dataset['sentiment'].map(label2id)

# .map() yields NaN for any label missing from label2id; report the offending
# values explicitly rather than letting the integer cast fail with a generic error.
unmapped_rows = sentiment_ids.isna()
if unmapped_rows.any():
    unmapped_labels = sorted(final_dataset.loc[unmapped_rows, 'sentiment'].astype(str).unique())
    raise ValueError(f"Unmapped sentiment labels: {unmapped_labels}")

final_dataset['sentiment'] = sentiment_ids.astype(int)
# Keep only the columns used downstream, in a fixed order.
final_dataset = final_dataset[['sentence', 'sentiment', 'lang']]

In [25]:
# Row count per language, followed by the overall number of rows
lang_distribution = final_dataset['lang'].value_counts()
print("Language distribution in the dataset:")
print(lang_distribution)
print(f"\nTotal samples: {len(final_dataset)}")

# Language x sentiment-id contingency table (rows: lang, columns: sentiment)
lang_sentiment_sizes = final_dataset.groupby(['lang', 'sentiment']).size()
sentiment_by_lang = lang_sentiment_sizes.unstack()
print("\nSentiment distribution by language:")
print(sentiment_by_lang)

Language distribution in the dataset:
lang
es    97
fr    97
de    93
Name: count, dtype: int64

Total samples: 287

Sentiment distribution by language:
sentiment   0   1   2
lang                 
de         18  36  39
es         18  37  42
fr         18  37  42


In [26]:
# Stratified hold-out split: 30% of the rows go to the test set, and the
# language proportions of the full dataset are preserved in both parts.
from sklearn.model_selection import train_test_split

lang_labels = final_dataset['lang']  # stratification key
train_df, test_df = train_test_split(
    final_dataset,
    test_size=0.3,
    random_state=42,
    stratify=lang_labels,
)

print("Train set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

# Confirm that each split received a comparable share of every language
train_lang_counts = train_df['lang'].value_counts()
test_lang_counts = test_df['lang'].value_counts()
print("\nLanguage distribution in train set:")
print(train_lang_counts)
print("\nLanguage distribution in test set:")
print(test_lang_counts)

Train set shape: (200, 3)
Test set shape: (87, 3)

Language distribution in train set:
lang
fr    68
es    67
de    65
Name: count, dtype: int64

Language distribution in test set:
lang
es    30
fr    29
de    28
Name: count, dtype: int64


In [27]:
# Language x sentiment-id contingency table for each split
train_sentiment_counts = train_df.groupby(['lang', 'sentiment']).size().unstack()
test_sentiment_counts = test_df.groupby(['lang', 'sentiment']).size().unstack()

print("Sentiment distribution in train set:")
print(train_sentiment_counts)
print("\nSentiment distribution in test set:")
print(test_sentiment_counts)

Sentiment distribution in train set:
sentiment   0   1   2
lang                 
de         11  26  28
es         14  26  27
fr         11  28  29

Sentiment distribution in test set:
sentiment  0   1   2
lang                
de         7  10  11
es         4  11  15
fr         7   9  13


In [28]:
# Convert pandas dataframes to Hugging Face datasets
import datasets
from datasets import Dataset

# Class names ordered by their integer id. They are derived from label2id (the
# mapping used to encode the 'sentiment' column) so the ClassLabel names cannot
# drift from the encoding if that mapping is ever edited.
sentiments = sorted(label2id, key=label2id.get)
# ClassLabel assigns ids by list position, so the ids must be exactly 0..n-1.
assert [label2id[name] for name in sentiments] == list(range(len(sentiments))), \
    "label2id ids must be contiguous and start at 0"

# NOTE(review): "en" has no rows in this dataset (only es/fr/de are loaded);
# presumably it is listed so language ids line up with another dataset - confirm.
langs = ["en", "fr", "de", "es"]


def _to_labelled_dataset(split_df):
    """Build a Dataset from one split and cast its label columns to ClassLabel.

    Args:
        split_df: DataFrame with 'sentence', 'sentiment' (integer id) and
            'lang' (language code string) columns.

    Returns:
        A `datasets.Dataset` without the pandas index, whose 'lang' and
        'sentiment' columns are `ClassLabel` features.
    """
    split = Dataset.from_pandas(split_df, preserve_index=False)
    split = split.cast_column("lang", datasets.ClassLabel(names=langs))
    return split.cast_column("sentiment", datasets.ClassLabel(names=sentiments))


train_dataset = _to_labelled_dataset(train_df)
test_dataset = _to_labelled_dataset(test_df)

# Create DatasetDict
dataset_dict = datasets.DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

print(dataset_dict)

Casting the dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/87 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/87 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'sentiment', 'lang'],
        num_rows: 200
    })
    test: Dataset({
        features: ['sentence', 'sentiment', 'lang'],
        num_rows: 87
    })
})


In [29]:
# Save the train/eval splits as CSV files for later use.
# to_csv does not create missing parent directories, so make sure the target
# folder exists first (a no-op when it is already there).
os.makedirs("../data", exist_ok=True)
train_df.to_csv("../data/train_subset.csv", index=False)
test_df.to_csv("../data/eval_subset.csv", index=False)

# Also save the full (unsplit) dataset, once as parquet and once as CSV.
final_dataset.to_parquet(
    get_dataset_dir("synthetic_financial_sentiment/synthetic_financial_sentiment_multilingual", "parquet"),
    index=False
)

final_dataset.to_csv(
    get_dataset_dir("synthetic_financial_sentiment/synthetic_financial_sentiment_multilingual", "csv"),
    index=False
)

In [30]:
# Publish both splits to the Hugging Face Hub; the call is left as the cell's
# last expression so its return value is displayed.
hub_repo_id = "nojedag/synthetic_financial_sentiment"
dataset_dict.push_to_hub(hub_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/796 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/nojedag/synthetic_financial_sentiment/commit/5e3c0dae9c83a31bf5857aabc2e4a588b65e8da1', commit_message='Upload dataset', commit_description='', oid='5e3c0dae9c83a31bf5857aabc2e4a588b65e8da1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/nojedag/synthetic_financial_sentiment', endpoint='https://huggingface.co', repo_type='dataset', repo_id='nojedag/synthetic_financial_sentiment'), pr_revision=None, pr_num=None)