In [1]:
import pandas as pd
import csv
import os
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
from scipy import signal
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns

# GDP table (file name suggests current-USD GDP): one row per country code,
# one column per year, descriptive columns removed.
gdpdf = (
    pd.read_csv('Raw_Data\\Economical\\gpd_data_incurrentUSD.csv')
    .drop(columns=['Country Name', 'Indicator Name', 'Indicator Code'])
    .set_index('Country Code')
)
# Reference list of country codes; also used to filter the other tables in this cell.
country_codes = pd.read_csv('Raw_data\\country_codes.csv')
# Keep only rows whose index value appears in the 'alpha-3' column.
gdpdf = gdpdf.loc[gdpdf.index.isin(country_codes['alpha-3'])]
# Scale every value by 1e6 and round to two decimals. Vectorised division gives
# the same result as an element-wise lambda (missing CSV values are NaN, which
# stay NaN) and does not depend on DataFrame.map, which needs pandas >= 2.1.
gdpdf = (gdpdf / 1e6).round(2)
# Column labels are parsed as years; everything before 1990 is discarded.
cols_to_drop = [year for year in gdpdf.columns if int(year) < 1990]
gdpdf = gdpdf.drop(columns=cols_to_drop)
gdpdf.columns = gdpdf.columns.astype(int)

# CO2 emissions table: one row per country code, one column per year.
co2ktdf = (
    pd.read_csv('Raw_Data\\Environmental\\CO2_Emissions_ByCountry_KT.csv')
    .drop(columns=['Series Name', 'Series Code', 'Country Name'])
)
# The first remaining column keeps its name; every other header is cut down to
# its first four characters (presumably the year prefix of a longer label).
col_names = [
    name if position == 0 else name[:4]
    for position, name in enumerate(co2ktdf.columns)
]
co2ktdf.columns = col_names
co2ktdf = co2ktdf.set_index('Country Code')
# Keep only rows whose index value appears in the 'alpha-3' column of country_codes.
co2ktdf = co2ktdf.loc[co2ktdf.index.isin(country_codes['alpha-3'])]
# Values that cannot be parsed as numbers become NaN.
co2ktdf = co2ktdf.apply(pd.to_numeric, errors='coerce')
# Column labels are parsed as years; everything before 1990 is discarded.
cols_to_drop = [year for year in co2ktdf.columns if int(year) < 1990]
co2ktdf = co2ktdf.drop(columns=cols_to_drop)
co2ktdf.columns = co2ktdf.columns.astype(int)

# Law index table: one row per country code, one column per year.
lawindex = (
    pd.read_csv('Raw_Data\\Legal\\Law_Index.csv')
    .drop(columns=['Country Name', 'Indicator Name', 'Indicator Code'])
    .set_index('Country Code')
)
# Keep only rows whose index value appears in the 'alpha-3' column of country_codes.
lawindex = lawindex.loc[lawindex.index.isin(country_codes['alpha-3'])]
# Column labels are parsed as years; everything before 1990 is discarded.
cols_to_drop = [year for year in lawindex.columns if int(year) < 1990]
lawindex = lawindex.drop(columns=cols_to_drop)
lawindex.columns = lawindex.columns.astype(int)

# Political stability table: one row per country code, one column per year.
politicalstab = (
    pd.read_csv('Raw_Data\\Political\\political_stability_index.csv')
    .drop(columns=['Series Name', 'Series Code', 'Country Name'])
)
# The first remaining column keeps its name; every other header is cut down to
# its first four characters (presumably the year prefix of a longer label).
col_names = [
    name if position == 0 else name[:4]
    for position, name in enumerate(politicalstab.columns)
]
politicalstab.columns = col_names
politicalstab = politicalstab.set_index('Country Code')
# Keep only rows whose index value appears in the 'alpha-3' column of country_codes.
politicalstab = politicalstab.loc[politicalstab.index.isin(country_codes['alpha-3'])]
# Values that cannot be parsed as numbers become NaN.
politicalstab = politicalstab.apply(pd.to_numeric, errors='coerce')
# Column labels are parsed as years; everything before 1990 is discarded.
cols_to_drop = [year for year in politicalstab.columns if int(year) < 1990]
politicalstab = politicalstab.drop(columns=cols_to_drop)
politicalstab.columns = politicalstab.columns.astype(int)

# "ginindex" table: one row per country code, one column per year.
# NOTE(review): this reads the very same file as `lawindex`
# (Raw_Data\Legal\Law_Index.csv), so both frames hold identical data. It looks
# like a copy-paste slip; presumably a Gini-index CSV was intended. Confirm and
# correct the path.
ginindex = (
    pd.read_csv('Raw_Data\\Legal\\Law_Index.csv')
    .drop(columns=['Country Name', 'Indicator Name', 'Indicator Code'])
    .set_index('Country Code')
)
# Keep only rows whose index value appears in the 'alpha-3' column of country_codes.
ginindex = ginindex.loc[ginindex.index.isin(country_codes['alpha-3'])]
# Column labels are parsed as years; everything before 1990 is discarded.
cols_to_drop = [year for year in ginindex.columns if int(year) < 1990]
ginindex = ginindex.drop(columns=cols_to_drop)
ginindex.columns = ginindex.columns.astype(int)

# Technological expenditure table: one row per country code, one column per year.
techexpenditure = (
    pd.read_csv('Raw_Data\\Technological\\API_GB.XPD.RSDV.GD.ZS_DS2_en_csv_v2_4.csv')
    .drop(columns=['Country Name', 'Indicator Name', 'Indicator Code'])
    .set_index('Country Code')
)
# Keep only rows whose index value appears in the 'alpha-3' column of country_codes.
techexpenditure = techexpenditure.loc[
    techexpenditure.index.isin(country_codes['alpha-3'])
]
# Column labels are parsed as years; everything before 1990 is discarded.
cols_to_drop = [year for year in techexpenditure.columns if int(year) < 1990]
techexpenditure = techexpenditure.drop(columns=cols_to_drop)
techexpenditure.columns = techexpenditure.columns.astype(int)


# Step 1
# Collect one country's row from each indicator table.
def get_country_serie_list(country_code):
    """Return the per-year series of one country from each indicator table.

    The row labelled ``country_code`` is looked up in the module-level frames
    ``gdpdf``, ``co2ktdf``, ``lawindex``, ``politicalstab`` and ``ginindex``
    (``techexpenditure`` is not included). Each row is renamed to
    ``'<Label> <country_code>'``.

    Parameters
    ----------
    country_code : str
        Index label to look up in every frame (e.g. ``'USA'``).

    Returns
    -------
    list of pandas.Series
        Five series, in the order Gdp, CO2, Lawindex, Politicalstab, Ginindex.
    """
    labelled_frames = (
        ('Gdp', gdpdf),
        ('CO2', co2ktdf),
        ('Lawindex', lawindex),
        ('Politicalstab', politicalstab),
        ('Ginindex', ginindex),
    )
    country_serie_list = []
    for label, frame in labelled_frames:
        row = frame.loc[country_code]
        country_serie_list.append(row.rename(label + ' ' + country_code))
    return country_serie_list

# Step 2
# Trim every series so that all of them start at the same index label.
def shorten_series(series_list):
    """Cut all series to start where the latest-starting one has its first value.

    For each series the label of its first non-missing value is determined;
    the largest of these labels becomes the common start, and every series is
    sliced from that label (inclusive) to its end.

    Parameters
    ----------
    series_list : list of pandas.Series
        Series sharing a comparable index (years in this notebook).

    Returns
    -------
    list of pandas.Series
        The sliced series, in the input order.

    Notes
    -----
    A series without any valid value has no first valid label (``None``),
    which cannot be compared with the other labels in ``max``.
    """
    common_start = max(serie.first_valid_index() for serie in series_list)
    shortened_series_list = []
    for serie in series_list:
        shortened_series_list.append(serie.loc[common_start:])
    return shortened_series_list

# Step 3
# Fill gaps in every series; change the interpolation method here if required.
def interpolator(serie_list):
    """Return a new list in which every series has been linearly interpolated.

    Parameters
    ----------
    serie_list : list of pandas.Series
        Series that may contain missing values.

    Returns
    -------
    list of pandas.Series
        Result of ``Series.interpolate(method='linear')`` for each input, in
        the input order.
    """
    interpolated_serie_list = []
    for serie in serie_list:
        filled = serie.interpolate(method='linear')
        interpolated_serie_list.append(filled)
    return interpolated_serie_list

# Step 4
# Detrend the first two series in the list (gdp and co2 in the list built by
# get_country_serie_list).
def detrend_first_two(series_list):
    """Return a copy of ``series_list`` with its first two series detrended.

    ``scipy.signal.detrend`` (default linear detrending) is applied to the
    values of the series at positions 0 and 1; index and name are preserved.
    All remaining series are passed through unchanged, and the input list is
    not modified.

    Parameters
    ----------
    series_list : list of pandas.Series
        Series without missing values in the first two entries.

    Returns
    -------
    list of pandas.Series
        New list of the same length as the input. If the input holds fewer
        than two series, only the available ones are detrended.
    """
    detrended_series_list = list(series_list)
    # Exactly the first two entries, or fewer if the list is shorter.
    for i in range(min(2, len(detrended_series_list))):
        serie = series_list[i]
        detrended_series_list[i] = pd.Series(
            signal.detrend(serie.values),
            index=serie.index,
            name=serie.name,
        )
    # Return only after the loop has finished, so both series are processed.
    return detrended_series_list

# Step 5
# Rescale every series in the list with a MinMaxScaler.
def scale_series_list(series_list):
    """Return a new list in which every series has been min-max scaled.

    Each series is scaled on its own: the scaler is re-fitted on the values of
    the series it is about to transform. Index and name are carried over.

    Parameters
    ----------
    series_list : list of pandas.Series
        Numeric series to scale.

    Returns
    -------
    list of pandas.Series
        Scaled series, in the input order.
    """
    scaler = MinMaxScaler()

    def _rescale(serie):
        # The scaler expects a 2-D array: one column, one row per observation.
        column = serie.values.reshape(-1, 1)
        scaled_values = scaler.fit_transform(column).ravel()
        return pd.Series(scaled_values, index=serie.index, name=serie.name)

    return [_rescale(df_serie) for df_serie in series_list]

In [2]:
import statsmodels.api as sm

# Preprocessing pipeline for the USA: fetch -> align start -> interpolate ->
# detrend -> scale. The first line always starts from the raw tables, so the
# cell can be re-run safely.
usalists = get_country_serie_list('USA')
usalists = shorten_series(usalists)
usalists = interpolator(usalists)
usalists = detrend_first_two(usalists)
usalists = scale_series_list(usalists)

# x: the four series after GDP (CO2, law index, political stability, Gini), one
# column per series and one row per year — presumably the regressors for the
# statsmodels fit. axis=1 aligns them on the shared year index; the default
# axis=0 would stack them end-to-end into a single long Series that no longer
# lines up row-for-row with y.
x = pd.concat(usalists[1:5], axis=1)
# y: the GDP series as a one-column frame.
y = usalists[0].to_frame()