In [1]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
from datasets import load_dataset

# Fetch the dataset through `datasets.load_dataset`; the returned object is
# indexed by split name below.
dataset = load_dataset("scherrmann/financial_phrasebank_75agree_german")

# Convert the "train" split to a pandas DataFrame in one columnar step rather
# than having pandas iterate over the split row by row.
german_df = dataset['train'].to_pandas()

In [3]:
import kagglehub

# Download the Kaggle dataset; `dataset_download` returns the location the
# files were stored at, which is used as the prefix for the CSV path.
path = kagglehub.dataset_download("arcticgiant/french-financial-news")

# Read the news table from the downloaded location.
french_csv = f'{path}/FrenchNews.csv'
french_df = pd.read_csv(french_csv)

In [4]:
# Count missing values per column of the German frame.
german_df.isna().sum()

sentence    0
label       0
dtype: int64

In [5]:
# Integer class id -> human-readable sentiment name.
sentiment_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Build the two-column (Sentence, Sentiment) frame in a single chain:
#   1. translate the integer `label` column into a `Sentiment` string column,
#   2. rename `sentence` to `Sentence`,
#   3. keep only the two columns of interest,
#   4. drop rows with missing values (a label absent from `sentiment_map`
#      maps to NaN and is removed here).
german_df = (
    german_df
    .assign(Sentiment=german_df['label'].map(sentiment_map))
    .rename(columns={'sentence': 'Sentence'})
    .loc[:, ['Sentence', 'Sentiment']]
    .dropna()
)

# Class distribution of the German data.
german_df['Sentiment'].value_counts()

Sentiment
neutral     1717
positive     710
negative     336
Name: count, dtype: int64

In [6]:
# Number of rows in the German frame.
len(german_df)

2763

In [7]:
# Score cut-offs used to bucket a VADER score into a sentiment class.
POSITIVE_THRESHOLD = 0.05
NEGATIVE_THRESHOLD = -0.05


def vader_score_to_label(score, positive_threshold=POSITIVE_THRESHOLD,
                         negative_threshold=NEGATIVE_THRESHOLD):
    """Map a numeric sentiment score to 'positive', 'negative' or 'neutral'.

    Args:
        score: Numeric score to classify.
        positive_threshold: Scores strictly above this are 'positive'.
        negative_threshold: Scores strictly below this are 'negative'.

    Returns:
        'positive', 'negative', or 'neutral'. Scores within the two
        thresholds (inclusive) are 'neutral'; so is NaN, because both
        comparisons are False for NaN.
    """
    if score > positive_threshold:
        return 'positive'
    if score < negative_threshold:
        return 'negative'
    return 'neutral'


# Derive one categorical label column per VADER score column. All three are
# still added to `french_df`, although only the title label is used below.
vader_columns = {
    'Sentiment Vader Title': 'sentiment_title',
    'Sentiment Vader Text': 'sentiment_text',
    'Sentiment Vader TextURL': 'sentiment_url',
}
for score_column, label_column in vader_columns.items():
    french_df[label_column] = french_df[score_column].apply(vader_score_to_label)

# Keep only the headline and its label, renamed to the same
# (Sentence, Sentiment) schema as the German frame.
french_df_processed = (
    french_df[['Titre', 'sentiment_title']]
    .rename(columns={'Titre': 'Sentence', 'sentiment_title': 'Sentiment'})
)

# Class distribution of the French headlines.
french_df_processed['Sentiment'].value_counts()

Sentiment
neutral     18507
negative    13188
positive     9848
Name: count, dtype: int64

In [8]:
# Number of rows in the processed French frame.
len(french_df_processed)

41543

In [9]:
# Rows drawn from each language so both contribute equally to the result.
sample_size = 2000

# Draw fixed-seed random samples of equal size from each frame.
german_df = german_df.sample(n=sample_size, random_state=42)
french_df_processed = french_df_processed.sample(n=sample_size, random_state=42)

# Stack German rows first, then French rows, with a fresh 0..N-1 index.
frames = [german_df, french_df_processed]
df = pd.concat(frames, ignore_index=True)

# Class distribution of the combined data.
df['Sentiment'].value_counts()


Sentiment
neutral     2150
positive    1002
negative     848
Name: count, dtype: int64

In [10]:
# Preview the first five rows of the combined frame.
df.head(5)

Unnamed: 0,Sentence,Sentiment
0,Der finnische Industriekonzern Ruukki Group Pl...,neutral
1,Honkarakenne Oyj - ein weltweit führender Hers...,neutral
2,Der Preis des Rohmaterials Aluminium ist Ende ...,positive
3,Unsere überlegene Kundenorientierung und unser...,positive
4,Rimvesta wird jetzt von der in estnischem Besi...,neutral


In [11]:
# 80/20 split, stratified on the sentiment label so both parts keep the same
# class proportions; fixed seed for reproducibility.
# NOTE(review): `eval` shadows the Python builtin of the same name. The name
# is kept because the export cell that follows reads it.
train, eval = train_test_split(
    df,
    test_size=0.2,
    stratify=df['Sentiment'],
    random_state=42,
)

# Shapes of the two partitions.
(train.shape, eval.shape)

((3200, 2), (800, 2))

In [12]:
# Write both partitions as CSV files without the index column.
# `to_csv` does not create missing parent directories, so make sure `data/`
# exists first; otherwise a fresh checkout fails here.
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

train.to_csv(output_dir / "train_subset.csv", index=False)
eval.to_csv(output_dir / "eval_subset.csv", index=False)