In [None]:
import pandas as pd
import csv
import os
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
from scipy import signal
from sklearn.preprocessing import MinMaxScaler

Load and preprocess the datasets

In [None]:
def first_notnull_year(df):
    """Print the label of the first column of ``df`` that holds any non-null value.

    Columns are examined in their existing order. Nothing is returned; an
    IndexError is raised when no column contains data.
    """
    has_data = df.notnull().any()
    populated_columns = df.columns[has_data].tolist()
    print(populated_columns[0])

def least_data(df):
    """Return a copy of ``df`` stripped of rows and columns that carry no data.

    Rows whose values are all null are removed first; columns that are all
    null in the remaining rows are removed afterwards. The input frame is
    left untouched.
    """
    rows_with_data = df.dropna(axis=0, how='all')
    return rows_with_data.dropna(axis=1, how='all')

In [None]:
# Load GDP (current USD) per country, indexed by alpha-3 country code.
# Note that GDP is divided by 1e6 to make it easier to read.
# Paths are assembled with os.path.join so the notebook does not depend on
# Windows path separators or on a case-insensitive file system.
gdpdf = pd.read_csv(os.path.join('Raw_Data', 'Economical', 'gpd_data_incurrentUSD.csv'))
gdpdf = gdpdf.drop(['Country Name','Indicator Name','Indicator Code'], axis=1)
gdpdf = gdpdf.set_index('Country Code')
# country_codes is also used by the other loading cells to filter their rows
country_codes = pd.read_csv(os.path.join('Raw_Data', 'country_codes.csv'))
# Keep only the rows whose code appears in the 'alpha-3' column
gdpdf = gdpdf[gdpdf.index.isin(country_codes['alpha-3'])]
# Vectorised division; missing values stay NaN without a per-element check
gdpdf = gdpdf / 1e6
gdpdf = gdpdf.round(2)
# Column labels are year strings at this point; discard everything before 1980
cols_to_drop = [col for col in gdpdf.columns if int(col) < 1980]
gdpdf = gdpdf.drop(columns=cols_to_drop)
gdpdf.columns = gdpdf.columns.astype(int)

print(gdpdf.columns.dtype)
print(gdpdf)

In [None]:
# Load CO2 emissions (kt) per country, indexed by alpha-3 country code.
# The path is assembled with os.path.join so it also resolves outside Windows.
co2ktdf = pd.read_csv(os.path.join('Raw_Data', 'Environmental', 'CO2_Emissions_ByCountry_KT.csv'))
co2ktdf = co2ktdf.drop(['Series Name','Series Code','Country Name'], axis=1)
# Keep the first column label as is and cut every other label down to its
# first four characters so it can be parsed as a year below
col_names = co2ktdf.columns.tolist()
col_names[1:] = [name[:4] for name in col_names[1:]]
co2ktdf.columns = col_names
co2ktdf = co2ktdf.set_index('Country Code')
# Keep only the rows whose code appears in the 'alpha-3' column of country_codes
co2ktdf = co2ktdf[co2ktdf.index.isin(country_codes['alpha-3'])]
# Anything that cannot be parsed as a number becomes NaN
co2ktdf = co2ktdf.apply(pd.to_numeric, errors='coerce')
cols_to_drop = [col for col in co2ktdf.columns if int(col) < 1980]
co2ktdf = co2ktdf.drop(columns=cols_to_drop)
co2ktdf.columns = co2ktdf.columns.astype(int)

print(co2ktdf.columns.dtype)
print(co2ktdf)

In [None]:
# Load the law index per country, indexed by alpha-3 country code.
# The path is assembled with os.path.join so it also resolves outside Windows.
lawindex = pd.read_csv(os.path.join('Raw_Data', 'Legal', 'Law_Index.csv'))
lawindex = lawindex.drop(['Country Name','Indicator Name','Indicator Code'], axis=1)
lawindex = lawindex.set_index('Country Code')
# Keep only the rows whose code appears in the 'alpha-3' column of country_codes
lawindex = lawindex[lawindex.index.isin(country_codes['alpha-3'])]
# Column labels are year strings at this point; discard everything before 1980
cols_to_drop = [col for col in lawindex.columns if int(col) < 1980]
lawindex = lawindex.drop(columns=cols_to_drop)
lawindex.columns = lawindex.columns.astype(int)

print(lawindex.columns.dtype)
print(lawindex)

In [None]:
# Load the political stability index per country, indexed by alpha-3 country code.
# The path is assembled with os.path.join so it also resolves outside Windows.
politicalstab = pd.read_csv(os.path.join('Raw_Data', 'Political', 'political_stability_index.csv'))
politicalstab = politicalstab.drop(['Series Name','Series Code','Country Name'], axis=1)
# Keep the first column label as is and cut every other label down to its
# first four characters so it can be parsed as a year below
col_names = politicalstab.columns.tolist()
col_names[1:] = [name[:4] for name in col_names[1:]]
politicalstab.columns = col_names
politicalstab = politicalstab.set_index('Country Code')
# Keep only the rows whose code appears in the 'alpha-3' column of country_codes
politicalstab = politicalstab[politicalstab.index.isin(country_codes['alpha-3'])]
# Anything that cannot be parsed as a number becomes NaN
politicalstab = politicalstab.apply(pd.to_numeric, errors='coerce')
cols_to_drop = [col for col in politicalstab.columns if int(col) < 1980]
politicalstab = politicalstab.drop(columns=cols_to_drop)
politicalstab.columns = politicalstab.columns.astype(int)

print(politicalstab.columns.dtype)
print(politicalstab)

In [None]:
# Load the data exposed as `ginindex`, indexed by alpha-3 country code.
# FIXME(review): this reads Raw_Data/Legal/Law_Index.csv, the very same file the
# `lawindex` cell loads, so `ginindex` is currently a duplicate of `lawindex`.
# Point this at the intended source file once its path is confirmed.
# The path is assembled with os.path.join so it also resolves outside Windows.
ginindex = pd.read_csv(os.path.join('Raw_Data', 'Legal', 'Law_Index.csv'))
ginindex = ginindex.drop(['Country Name','Indicator Name','Indicator Code'], axis=1)
ginindex = ginindex.set_index('Country Code')
# Keep only the rows whose code appears in the 'alpha-3' column of country_codes
ginindex = ginindex[ginindex.index.isin(country_codes['alpha-3'])]
# Column labels are year strings at this point; discard everything before 1980
cols_to_drop = [col for col in ginindex.columns if int(col) < 1980]
ginindex = ginindex.drop(columns=cols_to_drop)
ginindex.columns = ginindex.columns.astype(int)

print(ginindex.columns.dtype)
print(ginindex)

EXAMPLE TRANSFORM APPLICATION

The interpolation method can be changed in the cells below, other interpolation steps can be inserted between them, and min-max scaling can also be applied.
Extract the method!!!

In [None]:
# Shorten all series so they start where the latest-starting one does (input must be a list of Series)

def shorten_series(series_list):
    """Trim every Series so that all of them start at the same index label.

    The common start is the largest ``first_valid_index()`` among the inputs,
    i.e. the first label at which the latest-starting series has data.

    Parameters
    ----------
    series_list : list of pandas.Series
        Series sharing a comparable, ordered index.

    Returns
    -------
    list of pandas.Series
        One trimmed Series per input, in the same order. Series without any
        valid value do not influence the common start but are trimmed like the
        others. When no input has a valid value (or the list is empty) the
        inputs are returned unchanged in a new list.
    """
    start_labels = [s.first_valid_index() for s in series_list]
    # first_valid_index() is None for a series without any valid value; comparing
    # None with real labels inside max() would raise, so those are left out
    known_starts = [label for label in start_labels if label is not None]
    if not known_starts:
        return list(series_list)
    common_start = max(known_starts)
    return [s.loc[common_start:] for s in series_list]



# Define some transformation functions : (WORKS FOR EACH SERIE)
def transform_full(df_serie):
    """Interpolate, linearly detrend and min-max scale a single Series.

    Steps:
      1. Fill gaps with ``Series.interpolate()`` (default method).
      2. Remove the linear trend from the values that are not NaN using
         ``scipy.signal.detrend``; positions still NaN after interpolation
         stay NaN.
      3. Scale the detrended values with a freshly fitted ``MinMaxScaler``.

    Parameters
    ----------
    df_serie : pandas.Series
        Numeric series to transform. It is not modified.

    Returns
    -------
    pandas.Series
        Transformed values carrying the index and name of ``df_serie``.
    """
    filled = df_serie.interpolate()

    # Work on a float copy by position so the caller's data is never mutated
    # and the result does not depend on index ordering or uniqueness
    values = filled.to_numpy(dtype=float, copy=True)
    has_value = ~np.isnan(values)
    if not has_value.any():
        # Nothing to detrend or scale
        return pd.Series(values, index=df_serie.index, name=df_serie.name)

    # Detrend only the valid observations; NaN positions are left as they are
    values[has_value] = signal.detrend(values[has_value])

    # Scale the detrended data, so the detrending is reflected in the result
    scaled = MinMaxScaler().fit_transform(values.reshape(-1, 1)).flatten()
    return pd.Series(scaled, index=df_serie.index, name=df_serie.name)

# WORKS ONLY FOR SERIE
def onlyscale(df_serie):
    """Min-max scale one Series with a freshly fitted ``MinMaxScaler``.

    The values are reshaped into a single column for the scaler and flattened
    again afterwards. The returned Series keeps the index and the name of
    ``df_serie``.
    """
    column = df_serie.values.reshape(-1, 1)
    scaled_column = MinMaxScaler().fit_transform(column)
    return pd.Series(scaled_column.flatten(), index=df_serie.index, name=df_serie.name)

In [None]:
def get_country_serie_list(country_code):
    """Return the row of ``country_code`` from each indicator frame as a named Series.

    The list is ordered GDP, CO2, law index, political stability, ``ginindex``.
    Each Series is renamed to ``'<label> <country_code>'``. A KeyError
    propagates when a frame has no row for the requested code.
    """
    labelled_frames = (
        ('Gdp', gdpdf),
        ('CO2', co2ktdf),
        ('Lawindex', lawindex),
        ('Politicalstab', politicalstab),
        ('Ginindex', ginindex),
    )
    return [
        frame.loc[country_code].rename(label + ' ' + country_code)
        for label, frame in labelled_frames
    ]


In [None]:
# Select the data for one country and export its merged indicators to CSV.
# Every indicator is read with the same `code` so the exported columns belong
# to the country they are labelled with. NOTE(review): GDP was previously read
# for 'TUR' while all other series and the column labels were 'USA'; 'USA' is
# kept here because the labels and `usa_list` refer to it. Confirm the intent.
code = 'USA'
countrydf1 = gdpdf.loc[code]
countrydf2 = co2ktdf.loc[code]
countrydf3 = lawindex.loc[code]
countrydf4 = politicalstab.loc[code]
countrydf5 = ginindex.loc[code]

usa_list = [countrydf1, countrydf2, countrydf3, countrydf4, countrydf5]

merged_preprocess_df = pd.concat(usa_list, axis=1)
# Column labels follow the '<indicator> <country code>' pattern
indicator_labels = ['Gdp', 'CO2', 'Lawindex', 'Politicalstab', 'Ginindex']
new_column_names = [label + ' ' + code for label in indicator_labels]
merged_preprocess_df.columns = new_column_names
merged_preprocess_df.to_csv('merged_preprocess_df.csv')

In [None]:
# Collect Turkey's indicators as a list of Series (one per variable),
# then show them side by side as a table and as one subplot per indicator
alpha3code = 'TUR'
turkey_preprocess_serie_list = get_country_serie_list(alpha3code)
turkey_preprocess_df = pd.concat(objs=turkey_preprocess_serie_list, axis='columns')
display(turkey_preprocess_df)
turkey_preprocess_df.plot(style='o-', subplots=True, figsize=(10, 30))

In [None]:
# Trim every series to a common start: the latest first non-null label among them
shortened_turkey_preprocess_serie_list = shorten_series(series_list=turkey_preprocess_serie_list)
shortened_turkey_preprocess_df = pd.concat(objs=shortened_turkey_preprocess_serie_list, axis='columns')
display(shortened_turkey_preprocess_df)
shortened_turkey_preprocess_df.plot(style='o-', subplots=True, figsize=(7, 30))

In [None]:
# Fill the gaps of each shortened series with an order-2 spline.
# Change the interpolation method here if needed.
interpolated_turkey_preprocess_serie_list = list(
    map(
        lambda one_serie: one_serie.interpolate(method='spline', order=2),
        shortened_turkey_preprocess_serie_list,
    )
)
turkey_interpolated_preprocess_df = pd.concat(objs=interpolated_turkey_preprocess_serie_list, axis='columns')
display(turkey_interpolated_preprocess_df)
turkey_interpolated_preprocess_df.plot(style='o-', subplots=True, figsize=(7, 20))

In [None]:
# Min-max scale each interpolated series and plot them together on one axis
scaled_turkey_preprocess_serie_list = list(map(onlyscale, interpolated_turkey_preprocess_serie_list))
scaled_turkey_preprocess_df = pd.concat(objs=scaled_turkey_preprocess_serie_list, axis='columns')
display(scaled_turkey_preprocess_df)
scaled_turkey_preprocess_df.plot(style='o-', figsize=(15, 5))

In [None]:
# Start from a shallow copy of the interpolated list, then replace the first
# two entries with their detrended versions; the remaining entries are kept as they are
turkey_detrended_series_list = list(interpolated_turkey_preprocess_serie_list)
for i in range(2):
    source_serie = interpolated_turkey_preprocess_serie_list[i]
    detrended_array = signal.detrend(source_serie.values)
    # Rebuild a Series so the index and name of the source are preserved
    turkey_detrended_series_list[i] = pd.Series(
        detrended_array,
        index=source_serie.index,
        name=source_serie.name,
    )

turkey_gdp_detrended = turkey_detrended_series_list[0]
turkey_gdp_detrended.plot(title=turkey_gdp_detrended.name, style='o-', figsize=(10, 3))

In [None]:
# Second entry of the detrended list, plotted with its own name as the title
turkey_co2_detrended = turkey_detrended_series_list[1]
turkey_co2_detrended.plot(title=turkey_co2_detrended.name, style='o-', figsize=(10, 3))

In [None]:
# Min-max scale every series of the detrended list and combine them into the final frame.
# Note that the underlying series were spline-interpolated with order 2.
turkey_fulltransform_series_list = list(map(onlyscale, turkey_detrended_series_list))
turkey_final_df = pd.concat(objs=turkey_fulltransform_series_list, axis='columns')
display(turkey_final_df)
turkey_final_df.plot(style='o-', figsize=(18, 5))