In [1]:
import pandas as pd
import os
from utils import get_dataset_dir

In [2]:
# Resolve the directory that holds the partial synthetic-sentiment files.
dataset_dir = get_dataset_dir("synthetic_financial_sentiment/partial", ext=None, only_dir=True)
# Legacy alias: this cell originally used the misspelled name below; it is kept
# so that any code still referring to it keeps working.
dataste_dir = dataset_dir

# List the regular files in the dataset directory.
# - sorted(): os.listdir returns entries in arbitrary order, so sorting makes the
#   row order of `final_dataset` (and anything derived from it) deterministic.
# - isfile(): skips sub-directories (e.g. notebook checkpoint folders) that
#   pd.read_csv cannot open.
files = sorted(
    name
    for name in os.listdir(dataset_dir)
    if os.path.isfile(os.path.join(dataset_dir, name))
)

# Read each file exactly once, then concatenate in a single call. Growing a
# DataFrame with pd.concat inside a loop re-copies all previous rows on every
# iteration.
frames = [pd.read_csv(os.path.join(dataset_dir, name)) for name in files]

# pd.concat raises on an empty list, so fall back to an empty DataFrame when
# the directory contains no files.
final_dataset = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [3]:
# Summarise the combined dataset: rows per language, total row count, and a
# language x sentiment count table.
lang_distribution = final_dataset['lang'].value_counts()
sentiment_by_lang = (
    final_dataset
    .groupby(['lang', 'sentiment'])
    .size()
    .unstack()
)

# Each report is a heading followed by the table on the next line.
print("Language distribution in the dataset:", lang_distribution, sep="\n")
print(f"\nTotal samples: {len(final_dataset)}")
print("\nSentiment distribution by language:", sentiment_by_lang, sep="\n")

Language distribution in the dataset:
lang
es    97
fr    97
de    93
Name: count, dtype: int64

Total samples: 287

Sentiment distribution by language:
sentiment  negative  neutral  positive
lang                                  
de               39       18        36
es               42       18        37
fr               42       18        37


In [4]:
# Split the combined dataset into train and test partitions, stratified by
# language so each language keeps the same share in both partitions.
from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.3   # share of rows held out for the test set
SPLIT_SEED = 42       # fixed seed so the split is repeatable

lang_labels = final_dataset['lang']
train_df, test_df = train_test_split(
    final_dataset,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=lang_labels,
)

# Report the size of each partition.
print("Train set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

# Confirm that the language proportions carried over to both partitions.
print("\nLanguage distribution in train set:", train_df['lang'].value_counts(), sep="\n")
print("\nLanguage distribution in test set:", test_df['lang'].value_counts(), sep="\n")

Train set shape: (200, 3)
Test set shape: (87, 3)

Language distribution in train set:
lang
fr    68
es    67
de    65
Name: count, dtype: int64

Language distribution in test set:
lang
es    30
fr    29
de    28
Name: count, dtype: int64


In [5]:
# Count samples per (language, sentiment) pair in each partition to see how
# the sentiment classes ended up distributed after the split.
split_keys = ['lang', 'sentiment']
train_sentiment_counts = train_df.groupby(split_keys).size().unstack()
test_sentiment_counts = test_df.groupby(split_keys).size().unstack()

print("Sentiment distribution in train set:", train_sentiment_counts, sep="\n")
print("\nSentiment distribution in test set:", test_sentiment_counts, sep="\n")

Sentiment distribution in train set:
sentiment  negative  neutral  positive
lang                                  
de               28       11        26
es               27       14        26
fr               29       11        28

Sentiment distribution in test set:
sentiment  negative  neutral  positive
lang                                  
de               11        7        10
es               15        4        11
fr               13        7         9


In [6]:
# Wrap the two pandas partitions as Hugging Face datasets and bundle them
# into a DatasetDict with 'train' and 'test' splits.
import datasets
from datasets import Dataset

# preserve_index=False: do not carry the pandas index over as an extra column.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train_df, test_df)
)

splits = {}
splits['train'] = train_dataset
splits['test'] = test_dataset
dataset_dict = datasets.DatasetDict(splits)

print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['sentiment', 'lang', 'sentence'],
        num_rows: 200
    })
    test: Dataset({
        features: ['sentiment', 'lang', 'sentence'],
        num_rows: 87
    })
})


In [7]:
# Save the train/test partitions as CSV files for later use.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists first (this matters on a fresh checkout).
os.makedirs("../data", exist_ok=True)
train_df.to_csv("../data/train_subset.csv", index=False)
# Note: the test partition is stored under the "eval" file name.
test_df.to_csv("../data/eval_subset.csv", index=False)

# Also save the full (unsplit) dataset in parquet format
# (more efficient for Hugging Face datasets).
# NOTE(review): assumes get_dataset_dir returns a writable file path for the
# given name and extension - confirm against utils.
final_dataset.to_parquet(
    get_dataset_dir("synthetic_financial_sentiment/synthetic_financial_sentiment_multilingual", "parquet"),
    index=False
)

# Same full dataset, as CSV.
final_dataset.to_csv(
    get_dataset_dir("synthetic_financial_sentiment/synthetic_financial_sentiment_multilingual", "csv"),
    index=False
)

In [8]:
# Publish both splits to the Hugging Face Hub. The call is left as the last
# expression so the returned commit info is shown as the cell output.
hub_repo_id = "nojedag/synthetic_financial_sentiment"
dataset_dict.push_to_hub(hub_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/nojedag/synthetic_financial_sentiment/commit/ab9d13702195030f8e2655097b56391410f4719f', commit_message='Upload dataset', commit_description='', oid='ab9d13702195030f8e2655097b56391410f4719f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/nojedag/synthetic_financial_sentiment', endpoint='https://huggingface.co', repo_type='dataset', repo_id='nojedag/synthetic_financial_sentiment'), pr_revision=None, pr_num=None)