In [78]:
import pandas as pd
import os
from functools import reduce
from warnings import simplefilter
# Install a global filter that hides pandas' PerformanceWarning in all code run after this line.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)


In [68]:
# --- Load Individual DataFrames ---
# The path is assembled from its components so that it resolves on every OS;
# a literal backslash is only a separator on Windows.
data_path = os.path.join('..', 'data')
input_hdf_path = os.path.join(data_path, 'cleaned_data.h5')

# Keys of the frames stored in the HDF5 file. The order given here is the
# order of the frames in `all_dfs` below.
df_keys = [
    'jpmx',
    'resource_energy',
    'resource_metals',
    'resource_agrar',
    'country_risk',
    'bbg_zinsen',
    'bbg_spreads',
    'bbg_inflation',
    'bbg_btc',
    'bbg_pmi',
    'bbg_eps',
    'bbg_surprise_index',
    'bbg_dollar_index',
    'bbg_value_growth',
]

# One DataFrame per key, addressable by its key name.
all_dfs_dict = {key: pd.read_hdf(input_hdf_path, key=key) for key in df_keys}

print("Following DataFrames were successfully loaded:")
for name, df in all_dfs_dict.items():
    print(f"- {name}: {df.shape}")

# Plain list of the loaded frames, in `df_keys` order.
all_dfs = list(all_dfs_dict.values())

Following DataFrames were successfully loaded:
- jpmx: (5139, 9)
- resource_energy: (5316, 7)
- resource_metals: (6221, 14)
- resource_agrar: (5190, 8)
- country_risk: (66, 329)
- bbg_zinsen: (5369, 6)
- bbg_spreads: (5341, 3)
- bbg_inflation: (5364, 3)
- bbg_btc: (5369, 3)
- bbg_pmi: (5389, 4)
- bbg_eps: (5242, 2)
- bbg_surprise_index: (5328, 4)
- bbg_dollar_index: (5365, 2)
- bbg_value_growth: (5369, 3)


In [69]:
# Outer-join all frames on 'Date' so that every date present in any source survives.
main_df = reduce(lambda left, right: pd.merge(left, right, on='Date', how='outer'), all_dfs)

# --- Post-Merge Cleaning and Filtering ---
start_date = '2005-01-03'
end_date = '2025-07-31'

# Sort chronologically, keep the first row per date, restrict to the window
# (both bounds inclusive) and use 'Date' as the index. Built as one chain so
# re-running the cell always starts from the freshly merged frame.
# NOTE(review): rows before start_date are discarded before the forward fill
# below, so lower-frequency series stay NaN until their first observation
# inside the window.
main_df = (
    main_df
    .sort_values(by='Date')
    .drop_duplicates(subset=['Date'], keep='first')
    .loc[lambda frame: (frame['Date'] >= start_date) & (frame['Date'] <= end_date)]
    .set_index('Date')
)

# Expand to a gap-free daily calendar; days without any observation become all-NaN rows.
all_days = pd.date_range(start=main_df.index.min(), end=main_df.index.max(), freq='D')
main_df_resampled = main_df.reindex(all_days)

# Infer concrete dtypes *before* forward-filling. Filling object-dtype columns
# first makes pandas fall back on its deprecated silent downcasting, which is
# the likely source of the warning this cell used to emit.
main_df_filled = main_df_resampled.infer_objects().ffill()

# Turn the index back into a regular 'Date' column. Naming the axis explicitly
# avoids depending on the default 'index' label produced by reset_index().
main_df_filled = main_df_filled.rename_axis('Date').reset_index()

print("Merged dataframe with missing days:", main_df.shape)
print("Merged dataframe after filling missing days:", main_df_filled.shape)

# Spot checks: dtypes/memory, one known country-risk value, and a weekend that must be forward-filled.
main_df_filled.info()
display(main_df_filled.loc[main_df_filled['Date'] == '2009-06-30', 'Argentina_Country_Risk_RSSCAROR_Index'])
display(main_df_filled[main_df_filled['Date'].between('2025-01-03', '2025-01-06')])

Merged dataframe with missing days: (6255, 383)
Merged dataframe after filling missing days: (7515, 384)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7515 entries, 0 to 7514
Columns: 384 entries, Date to MSCI_ACWI_Growth_MXWD000G_Index
dtypes: datetime64[ns](1), float64(383)
memory usage: 22.0 MB


  main_df_filled = main_df_resampled.ffill()


1639    38.97
Name: Argentina_Country_Risk_RSSCAROR_Index, dtype: float64

Unnamed: 0,Date,EMBIG_Div_JPGCSOSD_Index.1,High_Grade_JPGCIGSS_Index.1,High_Yield_JPGCHYSS_Index.1,Africa_JPSSGDAF_Index.1,Asia_JPSSGDAS_Index.1,Europe_JPSSGDER_Index.1,Latin_America_JPSSGDLA_Index.1,Middle_East_JPSSGDME_Index.1,NYM_WTI_Rohöl_CL1_COMB_Comdty.1,...,US_PMI_Manufacturing_NAPMPMI_Index_Actual_Economic_Release_Values_ACTUAL_RELEASE,US_PMI_Services_NAPMNMI_Index_Actual_Economic_Release_Values_ACTUAL_RELEASE,Ifo_Index_GRIFPBUS_Index_Actual_Economic_Release_Values_ACTUAL_RELEASE,Earnings_per_Share_Forward_12M_MSCI_ACWI_BEst_EPS,Economic_Surprise_EU_CESIEUR_Index,Economic_Surprise_US_CESIUSD_Index,Economic_Surprise_China_CESICNY_Index,DXY_Curncy,MSCI_ACWI_Value_MXWD000V_Index,MSCI_ACWI_Growth_MXWD000G_Index
7305,2025-01-03,320.279,121.69,545.885,447.0,125.0,248.0,419.0,265.0,73.96,...,49.3,54.1,84.7,46.5512,-12.4,5.7,15.4,108.952,275.12,591.39
7306,2025-01-04,320.279,121.69,545.885,447.0,125.0,248.0,419.0,265.0,73.96,...,49.3,54.1,84.7,46.5512,-12.4,5.7,15.4,108.952,275.12,591.39
7307,2025-01-05,320.279,121.69,545.885,447.0,125.0,248.0,419.0,265.0,73.96,...,49.3,54.1,84.7,46.5512,-12.4,5.7,15.4,108.952,275.12,591.39
7308,2025-01-06,320.312,124.347,541.95,440.0,130.0,251.0,415.0,267.0,73.56,...,49.3,54.1,84.7,46.6131,-4.3,2.4,16.1,108.257,275.3,598.58


In [70]:
# Show all column labels and the overall shape of the daily, forward-filled frame.
display(main_df_filled.columns.values, main_df_filled.shape)

array(['Date', 'EMBIG_Div_JPGCSOSD_Index.1',
       'High_Grade_JPGCIGSS_Index.1', 'High_Yield_JPGCHYSS_Index.1',
       'Africa_JPSSGDAF_Index.1', 'Asia_JPSSGDAS_Index.1',
       'Europe_JPSSGDER_Index.1', 'Latin_America_JPSSGDLA_Index.1',
       'Middle_East_JPSSGDME_Index.1', 'NYM_WTI_Rohöl_CL1_COMB_Comdty.1',
       'ICE_Brent_Rohöl_CO1_Comdty.1', 'NYM_Heizöl_HO1_COMB_Comdty.1',
       'ICE_Gasöl_QS1_Comdty.1', 'NYM_Erdgas_NG1_COMB_Comdty.1',
       'Gas_Europa_TZT1_Comdty.1', 'Gold_GC1_COMB_Comdty.1',
       'Silver_SI1_COMB_Comdty.1', 'Platinum_PL1_COMB_Comdty.1',
       'Aluminum_LA1_Comdty.1', 'Kupfer_HG1_COMB_Comdty.1',
       'Blei_LL1_Comdty.1', 'Nickel_LN1_Comdty.1', 'Zink_LX1_Comdty.1',
       'Cobalt_LMCODY_LME_Comdty.1', 'Eisenerz_IOE1_COMB_Comdty.1',
       'Uran_UXA1_Comdty.1', 'Bitcoin_XBTUSD_BGN_Curncy.1',
       'Zinn_LT1_Comdty.1', 'Mais_C_1_COMB_Comdty.1',
       'Sojabohnen_S_1_COMB_Comdty.1', 'Weizen_W_1_Comdty.1',
       'Zucker_SB1_Comdty.1', 'Kakao_CC1_Comdty

(7515, 384)

In [71]:
# Wide -> long: the five regional spread columns are stacked into rows, one
# row per (Date, Region). Raw column labels are replaced by short region names first.
region_spread_columns = {
    'Africa_JPSSGDAF_Index.1': 'Africa',
    'Asia_JPSSGDAS_Index.1': 'Asia',
    'Europe_JPSSGDER_Index.1': 'Europe',
    'Latin_America_JPSSGDLA_Index.1': 'Latin_America',
    'Middle_East_JPSSGDME_Index.1': 'Middle_East',
}
region_labels = list(region_spread_columns.values())

df_renamed = main_df_filled.rename(columns=region_spread_columns)

# Every column that is not a regional spread is carried along unchanged.
id_vars = [label for label in df_renamed.columns if label not in region_labels]

long_df = df_renamed.melt(
    id_vars=id_vars,
    value_vars=region_labels,
    var_name='Region',
    value_name='Risikoaufschlag',
)

# Key columns first, then the remaining identifier columns in their existing order.
leading_columns = ['Date', 'Region', 'Risikoaufschlag']
new_column_order = leading_columns + [label for label in id_vars if label != 'Date']
long_df = long_df[new_column_order]

# Expected Format: 7515 * 5 = 37575
print("Shape after transforming to long format:", long_df.shape)
display(long_df.head())

Shape after transforming to long format: (37575, 381)


Unnamed: 0,Date,Region,Risikoaufschlag,EMBIG_Div_JPGCSOSD_Index.1,High_Grade_JPGCIGSS_Index.1,High_Yield_JPGCHYSS_Index.1,NYM_WTI_Rohöl_CL1_COMB_Comdty.1,ICE_Brent_Rohöl_CO1_Comdty.1,NYM_Heizöl_HO1_COMB_Comdty.1,ICE_Gasöl_QS1_Comdty.1,...,US_PMI_Manufacturing_NAPMPMI_Index_Actual_Economic_Release_Values_ACTUAL_RELEASE,US_PMI_Services_NAPMNMI_Index_Actual_Economic_Release_Values_ACTUAL_RELEASE,Ifo_Index_GRIFPBUS_Index_Actual_Economic_Release_Values_ACTUAL_RELEASE,Earnings_per_Share_Forward_12M_MSCI_ACWI_BEst_EPS,Economic_Surprise_EU_CESIEUR_Index,Economic_Surprise_US_CESIUSD_Index,Economic_Surprise_China_CESICNY_Index,DXY_Curncy,MSCI_ACWI_Value_MXWD000V_Index,MSCI_ACWI_Growth_MXWD000G_Index
0,2005-01-03,Africa,286.0,371.522,150.318,521.822,42.12,,119.22,,...,,,,,-10.7,22.0,11.8,81.3,148.75,127.64
1,2005-01-04,Africa,273.0,370.689,147.913,522.278,43.91,41.04,124.66,367.5,...,58.6,,96.2,,-9.7,22.5,11.5,82.52,147.27,126.03
2,2005-01-05,Africa,274.0,378.834,153.579,532.467,43.39,40.51,121.84,361.5,...,58.6,,96.2,,-8.4,27.6,11.2,82.54,146.38,125.23
3,2005-01-06,Africa,276.0,382.809,154.331,539.133,45.56,42.85,128.13,378.5,...,58.6,,96.2,,-7.6,27.7,10.9,83.15,146.42,124.99
4,2005-01-07,Africa,272.0,380.195,150.906,537.202,45.43,43.18,127.33,377.25,...,58.6,,96.2,,-18.4,26.0,10.6,83.6,146.0,124.92


In [None]:
# Region membership, written region-first and flattened into the
# country -> region lookup below. Region order and the country order inside
# each region determine the insertion order of the resulting dict.
_countries_by_region = {
    'Africa': [
        'Egypt', 'Ghana', 'Kenya', 'Morocco',
        'Nigeria', 'South_Africa', 'Tanzania', 'Tunisia',
    ],
    'Asia': [
        'China', 'India', 'Indonesia', 'South_Korea',
        'Hong_Kong', 'Japan', 'Kazakhstan', 'Malaysia',
        'Mongolia', 'Pakistan', 'Philippines', 'Singapore',
        'Sri_Lanka', 'Taiwan', 'Thailand', 'Vietnam',
    ],
    'Latin_America': [
        'Argentina', 'Bolivia', 'Brazil', 'Chile',
        'Colombia', 'Costa_Rica', 'Dominican_Republic', 'Ecuador',
        'El_Salvador', 'Guatemala', 'Honduras', 'Jamaica',
        'Mexico', 'Panama', 'Peru', 'Uruguay',
        'Venezuela',
    ],
    'Middle_East': [
        'Bahrain', 'Israel', 'Jordan', 'Qatar',
        'Saudi_Arabia',
        'Turkey',  # TODO(review): open question whether Turkey belongs in this region
        'United_Arab_Emirates',
    ],
    'Eastern_Europe': [
        'Bulgaria', 'Croatia', 'Czech_Republic', 'Hungary',
        'Latvia', 'Lithuania', 'Poland', 'Romania',
        'Russia', 'Slovakia', 'Slovenia', 'Ukraine',
    ],
}

# Flat lookup: country name -> region name.
country_to_region_map = {
    country: region
    for region, countries in _countries_by_region.items()
    for country in countries
}

# --- Aggregation of Country-Risk-Indices ---
# For every (region, risk type) pair the per-country index columns are
# collapsed into row-wise summary statistics; the per-country columns are
# dropped afterwards.
risk_types = ['Country_Risk', 'Financial_Risk', 'Economic_Risk', 'Political_Risk']

# Regions in first-appearance order. Looping over a bare set() of strings
# yields a different order on each interpreter start (hash randomisation),
# which would make the column order of the result non-reproducible.
region_names = list(dict.fromkeys(country_to_region_map.values()))

country_cols_to_drop = []
aggregated_columns = {}  # new column label -> row-wise statistic (Series)

for region_name in region_names:

    # Find countries in this region
    countries_in_region = [country for country, region in country_to_region_map.items() if region == region_name]

    # Iterate risk type and find relevant columns
    for risk in risk_types:
        # Match on the label prefix "<Country>_<Risk>" rather than on a
        # substring, so a country name cannot match inside a longer label.
        column_prefixes = tuple(f"{country}_{risk}" for country in countries_in_region)
        relevant_cols = [col for col in long_df.columns if col.startswith(column_prefixes)]

        if not relevant_cols:
            continue

        # Add to drop list
        for col in relevant_cols:
            if col not in country_cols_to_drop:
                country_cols_to_drop.append(col)

        # Calculate new aggregated columns (row-wise across the region's countries)
        region_values = long_df[relevant_cols]
        base_new_col_name = f"{region_name}_{risk}"

        aggregated_columns[f"{base_new_col_name}_Avg"] = region_values.mean(axis=1)
        aggregated_columns[f"{base_new_col_name}_Max"] = region_values.max(axis=1)
        aggregated_columns[f"{base_new_col_name}_Std"] = region_values.std(axis=1)
        aggregated_columns[f"{base_new_col_name}_Median"] = region_values.median(axis=1)
        aggregated_columns[f"{base_new_col_name}_Q1"] = region_values.quantile(q=0.25, axis=1)  # 25% quantile
        aggregated_columns[f"{base_new_col_name}_Q3"] = region_values.quantile(q=0.75, axis=1)  # 75% quantile
        aggregated_columns[f"{base_new_col_name}_Skew"] = region_values.skew(axis=1)
        aggregated_columns[f"{base_new_col_name}_Kurt"] = region_values.kurt(axis=1)
        aggregated_columns[f"{base_new_col_name}_Count"] = region_values.count(axis=1)

# Attach all aggregates in a single concat instead of inserting them one by
# one (which fragments the frame), and drop the redundant country columns.
final_df = pd.concat(
    [
        long_df.drop(columns=country_cols_to_drop),
        pd.DataFrame(aggregated_columns, index=long_df.index),
    ],
    axis=1,
)

print("Shape des finalen DataFrames:", final_df.shape)
print("New Columns example:", [col for col in final_df.columns if 'Africa' in col and 'Avg' in col])
display(final_df.loc[final_df['Date'] == '2009-06-30', 'Africa_Country_Risk_Avg'])
display(final_df.head())


In [73]:
# Labels of the overall country-risk index columns, one entry per country.
country_risk_columns = [
    'Argentina_Country_Risk_RSSCAROR_Index',
    'Australia_Country_Risk_RSSCAUOR_Index',
    'Austria_Country_Risk_RSSCASOR_Index',
    'Bahrain_Country_Risk_RSSCBHOR_Index',
    'Belgium_Country_Risk_RSSCBEOR_Index',
    'Bolivia_Country_Risk_RSSCBOOR_Index',
    'Brazil_Country_Risk_RSSCBROR_Index',
    'Bulgaria_Country_Risk_RSSCBUOR_Index',
    'Canada_Country_Risk_RSSCCAOR_Index',
    'Chile_Country_Risk_RSSCCHOR_Index',
    'China_Country_Risk_RSSCCNOR_Index',
    'Colombia_Country_Risk_RSSCCOOR_Index',
    'Costa_Rica_Country_Risk_RSSCCROR_Index',
    'Croatia_Country_Risk_RSSCCTOR_Index',
    'Cyprus_Country_Risk_RSSCCYOR_Index',
    'Czech_Republic_Country_Risk_RSSCCZOR_Index',
    'Denmark_Country_Risk_RSSCDNOR_Index',
    'Dominican_Republic_Country_Risk_RSSCDROR_Index',
    'Ecuador_Country_Risk_RSSCECOR_Index',
    'Egypt_Country_Risk_RSSCEGOR_Index',
    'El_Salvador_Country_Risk_RSSCESOR_Index',
    'Finland_Country_Risk_RSSCFNOR_Index',
    'France_Country_Risk_RSSCFROR_Index',
    'Germany_Country_Risk_RSSCGEOR_Index',
    'Ghana_Country_Risk_RSSCGHOR_Index',
    'Greece_Country_Risk_RSSCGCOR_Index',
    'Guatemala_Country_Risk_RSSCGUOR_Index',
    'Honduras_Country_Risk_RSSCHOOR_Index',
    'Hong_Kong_Country_Risk_RSSCHKOR_Index',
    'Hungary_Country_Risk_RSSCHNOR_Index',
    'India_Country_Risk_RSSCINOR_Index',
    'Indonesia_Country_Risk_RSSCIDOR_Index',
    'Ireland_Country_Risk_RSSCIROR_Index',
    'Israel_Country_Risk_RSSCISOR_Index',
    'Italy_Country_Risk_RSSCITOR_Index',
    'Jamaica_Country_Risk_RSSCJMOR_Index',
    'Japan_Country_Risk_RSSCJPOR_Index',
    'Jordan_Country_Risk_RSSCJROR_Index',
    'Kazakhstan_Country_Risk_RSSCKZOR_Index',
    'Kenya_Country_Risk_RSSCKEOR_Index',
    'Latvia_Country_Risk_RSSCLTOR_Index',
    'Lithuania_Country_Risk_RSSCLIOR_Index',
    'Luxembourg_Country_Risk_RSSCLXOR_Index',
    'Malaysia_Country_Risk_RSSCMLOR_Index',
    'Mexico_Country_Risk_RSSCMXOR_Index',
    'Mongolia_Country_Risk_RSSCMNOR_Index',
    'Morocco_Country_Risk_RSSCMROR_Index',
    'Netherlands_Country_Risk_RSSCNEOR_Index',
    'New_Zealand_Country_Risk_RSSCNZOR_Index',
    'Nigeria_Country_Risk_RSSCNGOR_Index',
    'Norway_Country_Risk_RSSCNROR_Index',
    'Pakistan_Country_Risk_RSSCPKOR_Index',
    'Panama_Country_Risk_RSSCPNOR_Index',
    'Peru_Country_Risk_RSSCPEOR_Index',
    'Philippines_Country_Risk_RSSCPHOR_Index',
    'Poland_Country_Risk_RSSCPOOR_Index',
    'Portugal_Country_Risk_RSSCPROR_Index',
    'Qatar_Country_Risk_RSSCQTOR_Index',
    'Romania_Country_Risk_RSSCROOR_Index',
    'Russia_Country_Risk_RSSCRUOR_Index',
    'Saudi_Arabia_Country_Risk_RSSCSDOR_Index',
    'Singapore_Country_Risk_RSSCSNOR_Index',
    'Slovakia_Country_Risk_RSSCSVOR_Index',
    'Slovenia_Country_Risk_RSSCSLOR_Index',
    'South_Africa_Country_Risk_RSSCSAOR_Index',
    'South_Korea_Country_Risk_RSSCSKOR_Index',
    'Spain_Country_Risk_RSSCSPOR_Index',
    'Sri_Lanka_Country_Risk_RSSCSIOR_Index',
    'Sweden_Country_Risk_RSSCSWOR_Index',
    'Switzerland_Country_Risk_RSSCSTOR_Index',
    'Taiwan_Country_Risk_RSSCTWOR_Index',
    'Tanzania_Country_Risk_RSSCTZOR_Index',
    'Thailand_Country_Risk_RSSCTHOR_Index',
    'Tunisia_Country_Risk_RSSCTUOR_Index',
    'Turkey_Country_Risk_RSSCTROR_Index',
    'Ukraine_Country_Risk_RSSCUROR_Index',
    'United_Kingdom_Country_Risk_RSSCUKOR_Index',
    'United_Arab_Emirates_Country_Risk_RSSCUAOR_Index',
    'United_States_Country_Risk_RSSCUSOR_Index',
    'Uruguay_Country_Risk_RSSCUGOR_Index',
    'Venezuela_Country_Risk_RSSCVNOR_Index',
    'Vietnam_Country_Risk_RSSCVTOR_Index',
]

suffix = '_Country_Risk'

# The country name is everything before the first occurrence of the suffix;
# a label without the suffix is kept whole (str.split returns it unchanged).
all_countries = [label.split(suffix)[0] for label in country_risk_columns]

# Compare the countries found in the column labels with those that have a region.
unique_countries = set(all_countries)
mapped_countries = set(country_to_region_map)
unmapped_countries = unique_countries - mapped_countries

print(f"Found unique countries: {len(unique_countries)}")
print(f"Countries mapped: {len(mapped_countries)}")
print(f"Countries not mapped: {len(unmapped_countries)}")

if not unmapped_countries:
    print("Meh")
else:
    print("\nRESULT: Countries that were not mapped:")
    for country in sorted(unmapped_countries):
        print(f"- {country}")

Found unique countries: 82
Countries mapped: 60
Countries not mapped: 22

RESULT: Countries that were not mapped:
- Australia
- Austria
- Belgium
- Canada
- Cyprus
- Denmark
- Finland
- France
- Germany
- Greece
- Ireland
- Italy
- Luxembourg
- Netherlands
- New_Zealand
- Norway
- Portugal
- Spain
- Sweden
- Switzerland
- United_Kingdom
- United_States


In [74]:
# Countries whose columns are removed from the dataset (developed markets).
countries_to_remove = [
    'Australia', 'Austria', 'Belgium', 'Canada',
    'Cyprus', 'Denmark', 'Finland', 'France',
    'Germany', 'Greece', 'Ireland', 'Italy',
    'Luxembourg', 'Netherlands', 'New_Zealand', 'Norway',
    'Portugal', 'Spain', 'Sweden', 'Switzerland',
    'United_Kingdom', 'United_States',
]

# A column belongs to one of these countries when its label starts with "<Country>_".
removal_prefixes = tuple(f"{country}_" for country in countries_to_remove)
cols_to_drop = [label for label in final_df.columns if label.startswith(removal_prefixes)]

print(f"Identified {len(cols_to_drop)} columns to remove.")

final_df = final_df.drop(columns=cols_to_drop, errors='ignore')

print(f"\nShape after removing developed countries: {final_df.shape}")

Identified 88 columns to remove.

Shape after removing developed countries: (37575, 233)


In [75]:
# Remove the Bitcoin series if it is still present, then show the remaining column labels.
cols_to_drop = ['Bitcoin_XBTUSD_BGN_Curncy.1']
if set(cols_to_drop).issubset(final_df.columns):
    final_df = final_df.drop(columns=cols_to_drop)
final_df.columns.values

array(['Date', 'Region', 'Risikoaufschlag', 'EMBIG_Div_JPGCSOSD_Index.1',
       'High_Grade_JPGCIGSS_Index.1', 'High_Yield_JPGCHYSS_Index.1',
       'NYM_WTI_Rohöl_CL1_COMB_Comdty.1', 'ICE_Brent_Rohöl_CO1_Comdty.1',
       'NYM_Heizöl_HO1_COMB_Comdty.1', 'ICE_Gasöl_QS1_Comdty.1',
       'NYM_Erdgas_NG1_COMB_Comdty.1', 'Gas_Europa_TZT1_Comdty.1',
       'Gold_GC1_COMB_Comdty.1', 'Silver_SI1_COMB_Comdty.1',
       'Platinum_PL1_COMB_Comdty.1', 'Aluminum_LA1_Comdty.1',
       'Kupfer_HG1_COMB_Comdty.1', 'Blei_LL1_Comdty.1',
       'Nickel_LN1_Comdty.1', 'Zink_LX1_Comdty.1',
       'Cobalt_LMCODY_LME_Comdty.1', 'Eisenerz_IOE1_COMB_Comdty.1',
       'Uran_UXA1_Comdty.1', 'Zinn_LT1_Comdty.1',
       'Mais_C_1_COMB_Comdty.1', 'Sojabohnen_S_1_COMB_Comdty.1',
       'Weizen_W_1_Comdty.1', 'Zucker_SB1_Comdty.1', 'Kakao_CC1_Comdty.1',
       'Baumwolle_CT1_Comdty.1', 'Kaffee_KC1_Comdty.1',
       'Zinsen_10Y_DE_GDBR10_Index', 'Zinsen_10Y_US_USGG10YR_Index',
       'Zinskurve_2Y10Y_DE_DEYC2Y10_In

In [76]:
# Persist the final dataset in two formats next to the input data.
hdf_target = os.path.join(data_path, 'final_data.h5')
excel_target = os.path.join(data_path, 'final_data.xlsx')

# HDF5: mode='w' replaces any existing file; the frame is stored under key 'final'.
final_df.to_hdf(hdf_target, key='final', mode='w')

# Excel: written without the row index.
final_df.to_excel(excel_target, index=False)

In [None]:
# - fill dates ✅
# - long format with country-to-region mapping ✅
# - (individual / multiple regions dataset)
# - visualization
# - feature engineering
#     - create new features (recent observations should have a higher significance than older ones, thus EWMAs were preferred)
# - labels
# - data splits, model training & selection and validation...
# - find research papers


# -- what I did:
# - Asia bond index with lag (1 quarter)
# - removed PMI PX_Last
# - removed BTC from the commodities Bloomberg data
# - removed the commodity index because the individual commodities are available
# - interleaved dates
# - cut the dataset to the date range 03.01.2005 to 31.07.2025
# - filled in the remaining dates (weekends, holidays, ...)
# - forward fill (ffill) for all data
# - converted the df to long format
# - assigned the country-specific columns to the 5 regions and computed summary statistics

# - MSCI_ACWI_Value/Growth: probably not relevant, or is it?
# - ffill for monthly/quarterly data?
# - 'Turkey': 'Middle_East', # often grouped into this region geographically & economically?