Load libraries

In [None]:
import pandas as pd
import csv
import os
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
from scipy import signal
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from scipy.stats import shapiro, kstest, norm
import matplotlib.pyplot as plt
import statsmodels.api as sm
import openpyxl

Load the datasets

In [None]:
# Load the country-code lookup table. Its 'alpha-3' column is used by the
# loading cells below to keep only rows whose index is a listed code.
# NOTE(review): this path spells the folder 'Raw_data' while the other loading
# cells use 'Raw_Data' - harmless on case-insensitive file systems, but confirm
# the real folder name before running elsewhere.
country_codes = pd.read_csv('Raw_data\\country_codes.csv')
display(country_codes.head(3))

In [None]:
# GDP table: one row per country code, one column per year label.
# (Presumably current USD, judging by the file name - confirm against the source.)
gdpdf = pd.read_csv('Raw_Data\\Economical\\gpd_data_incurrentUSD.csv')
gdpdf = gdpdf.drop(columns=['Country Name', 'Indicator Name', 'Indicator Code'])
gdpdf = gdpdf.set_index('Country Code')

# Keep only the rows whose code appears in the country-code lookup table
gdpdf = gdpdf[gdpdf.index.isin(country_codes['alpha-3'])]

# Divide every value by one million and round to 2 decimals. Vectorised
# division replaces the former element-wise lambda: missing values are NaN
# (never None) and NaN / 1e6 is still NaN, so no per-cell check is needed.
gdpdf = (gdpdf / 1e6).round(2)

# Drop every column whose (year) label is before 2002, then make the labels integers
cols_to_drop = [col for col in gdpdf.columns if int(col) < 2002]
gdpdf = gdpdf.drop(columns=cols_to_drop)
gdpdf.columns = gdpdf.columns.astype(int)
display(gdpdf.head(3))

In [None]:
# CO2 emissions table: read, strip the descriptive columns, normalise headers
co2ktdf = pd.read_csv('Raw_Data\\Environmental\\CO2_Emissions_ByCountry_KT.csv').drop(
    columns=['Series Name', 'Series Code', 'Country Name']
)

# Keep the first header as is; truncate every other header to its first 4 characters
col_names = [
    header if position == 0 else header[:4]
    for position, header in enumerate(co2ktdf.columns)
]
co2ktdf.columns = col_names

# Index by country code and keep only codes present in the lookup table
co2ktdf = co2ktdf.set_index('Country Code')
listed_country = co2ktdf.index.isin(country_codes['alpha-3'])
co2ktdf = co2ktdf.loc[listed_country]

# Anything that cannot be parsed as a number becomes NaN
co2ktdf = co2ktdf.apply(pd.to_numeric, errors='coerce')

# Keep only columns labelled 2002 or later, then switch the labels to integers
from_2002 = [int(year_label) >= 2002 for year_label in co2ktdf.columns]
co2ktdf = co2ktdf.loc[:, from_2002]
co2ktdf.columns = co2ktdf.columns.astype(int)
display(co2ktdf.head(3))

In [None]:
# Law index table: read, strip the descriptive columns, index by country code
lawindex = (
    pd.read_csv('Raw_Data\\Legal\\Law_Index.csv')
    .drop(columns=['Country Name', 'Indicator Name', 'Indicator Code'])
    .set_index('Country Code')
)

# Keep only codes present in the lookup table
listed_country = lawindex.index.isin(country_codes['alpha-3'])
lawindex = lawindex.loc[listed_country]

# Keep only columns labelled 2002 or later, then switch the labels to integers
from_2002 = [int(year_label) >= 2002 for year_label in lawindex.columns]
lawindex = lawindex.loc[:, from_2002]
lawindex.columns = lawindex.columns.astype(int)
display(lawindex.head(3))

In [None]:
# Political stability table: read, strip the descriptive columns, normalise headers
politicalstab = pd.read_csv('Raw_Data\\Political\\political_stability_index.csv').drop(
    columns=['Series Name', 'Series Code', 'Country Name']
)

# Keep the first header as is; truncate every other header to its first 4 characters
col_names = [
    header if position == 0 else header[:4]
    for position, header in enumerate(politicalstab.columns)
]
politicalstab.columns = col_names

# Index by country code and keep only codes present in the lookup table
politicalstab = politicalstab.set_index('Country Code')
listed_country = politicalstab.index.isin(country_codes['alpha-3'])
politicalstab = politicalstab.loc[listed_country]

# Anything that cannot be parsed as a number becomes NaN
politicalstab = politicalstab.apply(pd.to_numeric, errors='coerce')

# Keep only columns labelled 2002 or later, then switch the labels to integers
from_2002 = [int(year_label) >= 2002 for year_label in politicalstab.columns]
politicalstab = politicalstab.loc[:, from_2002]
politicalstab.columns = politicalstab.columns.astype(int)
display(politicalstab.head(3))

In [None]:
# Gini index table: read, strip the descriptive columns, index by country code
ginindex = (
    pd.read_csv('Raw_Data\\Social\\Gini_Index_(0good-100bad).csv')
    .drop(columns=['Country Name', 'Indicator Name', 'Indicator Code'])
    .set_index('Country Code')
)

# Keep only codes present in the lookup table
listed_country = ginindex.index.isin(country_codes['alpha-3'])
ginindex = ginindex.loc[listed_country]

# Keep only columns labelled 2002 or later, then switch the labels to integers
from_2002 = [int(year_label) >= 2002 for year_label in ginindex.columns]
ginindex = ginindex.loc[:, from_2002]
ginindex.columns = ginindex.columns.astype(int)
display(ginindex.head(3))

In [None]:
# Technology expenditure table: read, strip the descriptive columns, index by country code
techexpenditure1 = (
    pd.read_csv('Raw_Data\\Technological\\API_GB.XPD.RSDV.GD.ZS_DS2_en_csv_v2_4.csv')
    .drop(columns=['Country Name', 'Indicator Name', 'Indicator Code'])
    .set_index('Country Code')
)

# Keep only codes present in the lookup table
listed_country = techexpenditure1.index.isin(country_codes['alpha-3'])
techexpenditure1 = techexpenditure1.loc[listed_country]

# Keep only columns labelled 2002 or later, then switch the labels to integers
from_2002 = [int(year_label) >= 2002 for year_label in techexpenditure1.columns]
techexpenditure1 = techexpenditure1.loc[:, from_2002]
techexpenditure1.columns = techexpenditure1.columns.astype(int)

# Element-wise product with the GDP table (aligned on country code and year).
# NOTE(review): the file name suggests this indicator is a percentage of GDP;
# if so the product is 100x the absolute expenditure - confirm whether a
# division by 100 is intended.
techexpenditure = techexpenditure1 * gdpdf
display(techexpenditure.head(3))

Non-null check

In [None]:
def print_non_null_counts(dataframes):
    """
    Print the total number of non-null cells of every DataFrame in a mapping.

    Parameters:
    dataframes (dict): Maps a display name to the DataFrame to be counted.
    """
    for name in dataframes:
        frame = dataframes[name]
        # count() gives the non-null cells per column; sum() adds the columns up
        total_non_null = frame.count().sum()
        print(f"Total non-null values in {name} dataframe:", total_non_null)

# Name -> indicator table, in the order the counts should be reported
dataframes = dict(
    gdpdf=gdpdf,
    co2ktdf=co2ktdf,
    lawindex=lawindex,
    politicalstab=politicalstab,
    ginindex=ginindex,
    techexpenditure=techexpenditure,
)

print_non_null_counts(dataframes)

# The row count is reported for the GDP table only
row_count = len(gdpdf)
print("Row count:", row_count)

# Show the GDP table itself
display(gdpdf)

Define the preprocessing functions

In [None]:
# Step 1
# Collects one country's row from each indicator table and returns the rows as a list of named series
def get_country_serie_list(country_code):
    """
    Return the rows for `country_code` from the module-level indicator tables.

    Each series is named '<Label> <country_code>'. The order is fixed:
    Gdp, CO2, Lawindex, Politicalstab, Ginindex.

    Parameters:
    country_code (str): Index label to look up in every table.

    Returns:
    list: Five pandas Series, one per indicator table.

    Raises:
    KeyError: If the code is missing from any of the tables.
    """
    # techexpenditure is intentionally not part of the list:
    # countrydf6 = techexpenditure.loc[country_code].rename('Techexpenditure'+' '+country_code)
    labelled_tables = (
        ('Gdp', gdpdf),
        ('CO2', co2ktdf),
        ('Lawindex', lawindex),
        ('Politicalstab', politicalstab),
        ('Ginindex', ginindex),
    )
    country_serie_list = [
        table.loc[country_code].rename(label + ' ' + country_code)
        for label, table in labelled_tables
    ]
    return country_serie_list

In [None]:
# Step 2
# This function shortens the list of series to equal length, aligned with the shortest series
def shorten_series(series_list):
    """
    Trim every series so that all of them start at the same index label.

    The common start is the largest "first valid index" found among the
    series, i.e. the first label at which every series already has data.

    Parameters:
    series_list (list): pandas Series sharing a comparable, sorted index.

    Returns:
    list: The series sliced from the common start label to their end.

    Raises:
    ValueError: If a series has no non-null value at all (or the list is empty).
    """
    first_valid_indices = []
    for s in series_list:
        first_valid = s.first_valid_index()
        if first_valid is None:
            # An all-NaN series has no starting point; fail with a clear message
            # instead of letting max() choke on a None/int comparison.
            raise ValueError(f"Series '{s.name}' contains no valid (non-null) values")
        first_valid_indices.append(first_valid)

    max_index = max(first_valid_indices)
    shortened_series_list = [s.loc[max_index:] for s in series_list]
    return shortened_series_list

In [None]:
# Step 3
# Change method if required!
def interpolator(serie_list):
    """
    Fill the gaps of every series using pandas' linear interpolation.

    Parameters:
    serie_list (list): pandas Series to interpolate.

    Returns:
    list: New interpolated Series, in the same order as the input.
    """
    interpolated_serie_list = []
    for current_serie in serie_list:
        interpolated_serie_list.append(current_serie.interpolate(method='linear'))
    return interpolated_serie_list

In [None]:
# Step 4
# Detrend the first two series in the list (which are gdp and co2)
def detrend_first_two(series_list):
    """
    Remove the linear trend from the first two series of the list.

    The remaining series are passed through unchanged. The input list itself
    is not modified.

    Parameters:
    series_list (list): pandas Series; the first two are detrended.

    Returns:
    list: A new list where the first two entries are detrended Series that
          keep the original index and name.
    """
    detrended_series_list = list(series_list)
    # Guard with min() so that a list shorter than two series is still accepted
    for i in range(min(2, len(series_list))):
        original = series_list[i]
        detrended_array = signal.detrend(original.values)
        detrended_series_list[i] = pd.Series(detrended_array,
                                             index=original.index,
                                             name=original.name)
    # The return must sit outside the loop: returning from inside it ended the
    # function after the first iteration and left the second series untouched.
    return detrended_series_list

In [None]:
# Step 5
# Rescales every series in the list with a MinMaxScaler
def scale_series_list(series_list):
    """
    Apply MinMaxScaler.fit_transform to every series independently.

    Parameters:
    series_list (list): pandas Series to rescale.

    Returns:
    list: New Series holding the scaled values, each keeping the index and
          name of the series it was built from.
    """
    scaler = MinMaxScaler()

    def _rescale(df_serie):
        # The scaler expects a 2-D column, so reshape before and flatten after
        as_column = df_serie.values.reshape(-1, 1)
        scaled_values = scaler.fit_transform(as_column).flatten()
        return pd.Series(scaled_values, index=df_serie.index, name=df_serie.name)

    return [_rescale(df_serie) for df_serie in series_list]

In [None]:
def plot_with_trendlines(df):
    """
    Plots each column in the DataFrame as a subplot with a linear trendline
    and displays the trendline equation inside the subplot.

    Parameters:
    df (DataFrame): The DataFrame containing the data to plot. Its index is
                    used as the x-axis and must be numeric.
    """
    # squeeze=False always yields a 2-D array of Axes, so a DataFrame with a
    # single column works too (otherwise a bare Axes is returned and cannot be indexed).
    fig, axes = plt.subplots(nrows=len(df.columns), ncols=1, figsize=(6, 10), squeeze=False)
    axes = axes[:, 0]
    x_values = df.index.to_numpy()

    for ax, col in zip(axes, df.columns):
        # Plot the original data
        df[col].plot(style='o-', ax=ax, title=col)

        # Fit the trendline on the non-null points only; NaNs would break polyfit
        valid = df[col].notna().to_numpy()
        z = np.polyfit(x_values[valid], df[col].to_numpy()[valid], 1)
        p = np.poly1d(z)

        # Get the trendline equation as a string
        trendline_eq = f"y = {z[0]:.2f}x + {z[1]:.2f}"

        # Plot the trendline on this subplot explicitly (no pyplot "current axes" state)
        ax.plot(x_values, p(x_values), "r--", label='Trend')

        # Set x-axis ticks as integers with intervals of 3
        ax.set_xticks(np.arange(min(x_values), max(x_values) + 1, 3))

        # Annotate the trendline equation in the top-left corner of the subplot
        ax.text(0.05, 0.95, trendline_eq, transform=ax.transAxes, fontsize=9,
                verticalalignment='top', bbox=dict(boxstyle="round,pad=0.3", edgecolor='red', facecolor='white'))

        # Add a legend
        # ax.legend()

    fig.tight_layout()
    plt.show()

# Example usage:
# plot_with_trendlines(fin_table_4)

In [None]:
def assess_normality(df):
    """
    Runs the Shapiro-Wilk and Kolmogorov-Smirnov tests on every column of the
    DataFrame and shows a Q-Q plot per column for a visual normality check.

    Parameters:
    df (DataFrame): The DataFrame containing the data to test.

    Returns:
    DataFrame: One column per input column; rows hold the two test statistics
               and their p-values.
    """
    row_labels = (
        'Shapiro-Wilk Stat',
        'Shapiro-Wilk P-Value',
        'Kolmogorov-Smirnov Stat',
        'Kolmogorov-Smirnov P-Value',
    )
    results = {}
    for column in df.columns:
        sample = df[column]

        # Shapiro-Wilk Test
        shapiro_outcome = shapiro(sample)
        # Kolmogorov-Smirnov Test against a normal with the sample's own mean and std
        # NOTE(review): the parameters are estimated from the same sample, which
        # generally makes the KS p-value optimistic - interpret with care.
        ks_outcome = kstest(sample, 'norm', args=(sample.mean(), sample.std()))

        stat_shapiro, p_value_shapiro = shapiro_outcome
        stat_ks, p_value_ks = ks_outcome
        results[column] = dict(zip(row_labels, (stat_shapiro, p_value_shapiro, stat_ks, p_value_ks)))

        # Q-Q plot
        sm.qqplot(sample, line='s')
        plt.title(f"Q-Q Plot for {column}")
        plt.show()

        # Histogram
        #plt.figure()
        #df[column].hist(bins=20, edgecolor='black')
        #plt.title(f"Histogram for {column}")
        #plt.show()

    return pd.DataFrame(results)

# Example usage:
# results_df = assess_normality(your_dataframe)
# print(results_df)

Apply preprocessing

In [None]:
alpha3_codes = country_codes['alpha-3'].tolist()

display(alpha3_codes)

dataframes = []  # One preprocessed DataFrame per successfully processed country
failed_codes = []  # alpha-3 codes whose preprocessing raised an error

for country_code in alpha3_codes:
    try:
        # Pipeline: fetch -> align start -> interpolate -> detrend -> scale
        raw_series = get_country_serie_list(country_code)
        aligned_series = shorten_series(raw_series)
        filled_series = interpolator(aligned_series)
        detrended_series = detrend_first_two(filled_series)
        country_series_list = scale_series_list(detrended_series)

        country_df = pd.concat(country_series_list, axis=1)
        print(country_df.columns[0], country_df.index[0])
        dataframes.append(country_df)  # Append the dataframe to the list
    except Exception as e:
        # Best effort: a country that cannot be processed is reported and skipped
        print(f"Failed to process alpha-3 code: {country_code}. Error: {str(e)}")
        failed_codes.append(country_code)

print(f"Failed alpha-3 codes: {failed_codes}")

display(len(dataframes))

# From here on, alpha3_codes only lists the countries that were processed successfully
alpha3_codes = [code for code in alpha3_codes if code not in failed_codes]

Apply correlation iteratively

In [None]:
# One Pearson correlation matrix per country DataFrame
correlation_matrices = [country_frame.corr(method='pearson') for country_frame in dataframes]

index_list = []
values_list = []

# Flatten every matrix into (variable, variable) pairs and their coefficients
for corr_matrix in correlation_matrices:
    flattened = corr_matrix.unstack()
    index_list.extend(flattened.index.tolist())
    values_list.extend(flattened.tolist())

# Build a two-level index out of the collected pairs
index = pd.MultiIndex.from_tuples(index_list, names=['Variable 1', 'Variable 2'])

# Single Series holding all coefficients of all countries
correlation_output = pd.Series(values_list, index=index)

display(correlation_output)

Cosmetic upgrades

In [None]:
# Turn the two index levels into ordinary columns
indexfixed = correlation_output.reset_index()
indexfixed.columns = ['Variable 1', 'Variable 2', 'Value']
display(indexfixed)

# Filter out pairs where the variables are the same Gdp-Gdp or CO2-CO2 Etc.
# .copy() makes the result an independent frame, so columns can be added to it
# later without writing into a slice of `indexfixed` (SettingWithCopyWarning).
indexfixed_filtered = indexfixed[(indexfixed['Variable 1'] != indexfixed['Variable 2'])].copy()
display(indexfixed_filtered.head(20))

Import the income classifications and filter them to match the current data

In [None]:
income_alpha3 = pd.read_csv('world-bank-income-groups.csv')

# Keep the 2020 classification of the countries that were processed successfully
is_processed_country = income_alpha3['Code'].isin(alpha3_codes)
is_2020 = income_alpha3['Year'] == 2020
income_alpha3_filtered = (
    income_alpha3[is_processed_country & is_2020]
    .reset_index(drop=True)
    .rename(columns={"World Bank's income classification": 'Income Group'})
)

# Venezuela is missing, so add it. 'Year' is an integer like the values the
# filter above compares against; the row is only appended when 'VEN' is not
# already present, because a duplicated code would duplicate rows in later merges.
new_row = {'Entity': 'Venezuela', 'Code': 'VEN', 'Year': 2020, 'Income Group': 'Upper-middle-income countries'}
if 'VEN' not in income_alpha3_filtered['Code'].values:
    income_alpha3_filtered = pd.concat([income_alpha3_filtered, pd.DataFrame([new_row])], ignore_index=True)

display(income_alpha3_filtered)

Join with additional data (income group)

In [None]:
# Derive 'Code' from the last three characters of 'Variable 2'.
# assign() returns a new frame, so no value is written into a filtered slice
# (which would raise SettingWithCopyWarning) and the cell can be re-run safely.
indexfixed_filtered = indexfixed_filtered.assign(Code=indexfixed_filtered['Variable 2'].str[-3:])

# Attach entity name and income group; a left join keeps every correlation row
final_all_corr = indexfixed_filtered.merge(income_alpha3_filtered[['Code', 'Entity', "Income Group"]], on='Code', how='left')

# Keep only the part before the first space of 'Variable 1' / 'Variable 2'
final_all_corr['Variable 1'] = final_all_corr['Variable 1'].str.split(' ').str[0]
final_all_corr['Variable 2'] = final_all_corr['Variable 2'].str.split(' ').str[0]

display(final_all_corr)

You can change or remove the correlation-strength threshold here

In [None]:
# Keep the correlations whose absolute value is at least 0.6
is_strong = final_all_corr['Value'].abs() >= 0.6
strong_verystrong = final_all_corr[is_strong]
display(strong_verystrong.head(5))

# Allowed values for the first and the second variable of a pair
var1 = ['Gdp', 'Politicalstab', 'Ginindex']
var2 = ['CO2', 'Lawindex']

first_allowed = strong_verystrong['Variable 1'].isin(var1)
second_allowed = strong_verystrong['Variable 2'].isin(var2)
strong_verystrong_filtered = strong_verystrong[first_allowed & second_allowed]
display(strong_verystrong_filtered)
# strong_verystrong_filtered.to_excel('outputs\\strong_verystrong_filtered.xlsx', index=False)

In [None]:
# Distinct ('Variable 1', 'Variable 2') pairs present in the filtered table
pair_sizes = strong_verystrong_filtered.groupby(['Variable 1', 'Variable 2']).size()
# The group sizes are only a by-product of the grouping, so drop them again
unique_combinations = pair_sizes.reset_index(name='pair_count').drop(columns='pair_count')
display(unique_combinations)

In [None]:
# One sub-table per (variable pair, income group) combination
groupedby_variablepair = strong_verystrong_filtered.groupby(['Variable 1', 'Variable 2', 'Income Group'])
list_of_dfs = [group for _, group in groupedby_variablepair]

# Guard the preview: there may be no group at all
if list_of_dfs:
    display((list_of_dfs[0]))
display(len(list_of_dfs))

# Build new frames instead of mutating the groupby slices in place
# (in-place edits on slices can raise SettingWithCopyWarning).
processed_listof_dfs = [
    group
    .assign(**{'Variable Pair': group['Variable 1'] + '-' + group['Variable 2']})
    .drop(columns=['Variable 1', 'Variable 2'])
    for group in list_of_dfs
]

display(strong_verystrong_filtered)

for df in processed_listof_dfs:
    print(df['Variable Pair'].iloc[0])

In [None]:
# Label every row with its 'Variable 1-Variable 2' pair and drop the columns
# that are no longer needed. assign() works on a copy of the filtered table.
pair_label = strong_verystrong_filtered['Variable 1'] + '-' + strong_verystrong_filtered['Variable 2']
hopefullyfinal = (
    strong_verystrong_filtered
    .assign(**{'Variable Pair': pair_label})
    .drop(columns=['Variable 1', 'Variable 2', 'Entity'])
)
display(hopefullyfinal)


In [None]:
# Stacked histograms of the correlation values in 'hopefullyfinal':
# one subplot per 'Variable Pair', one colour per 'Income Group'.

unique_variable_pairs = hopefullyfinal['Variable Pair'].unique()
num_bins = 20

# Define colors for each income group
income_group_colors = {
    'Low-income countries': 'red',
    'Lower-middle-income countries': 'orange',
    'Upper-middle-income countries': 'yellow',
    'High-income countries': 'green'
}

# Define the bin edges for the histogram/bar chart
bin_edges = np.linspace(-1, 1, num_bins+1)
bin_widths = np.diff(bin_edges)

# Create a figure and a grid of subplots.
# squeeze=False always returns an array of Axes, so a single variable pair
# (where a bare, non-iterable Axes would otherwise be returned) works too.
fig, axs = plt.subplots(len(unique_variable_pairs), figsize=(12, 4*len(unique_variable_pairs)), squeeze=False)
axs = axs.ravel()

for ax, var_pair in zip(axs, unique_variable_pairs):
    subset_df = hopefullyfinal[hopefullyfinal['Variable Pair'] == var_pair]

    # Plot stacked bar charts for each income group
    bottom = np.zeros(num_bins)
    for income_group, color in income_group_colors.items():
        group_subset_df = subset_df[subset_df['Income Group'] == income_group]

        # Calculate histogram values without plotting
        hist_values, _ = np.histogram(group_subset_df['Value'], bins=bin_edges)

        # Plot as bar chart, stacking on top of previous bars.
        # align='edge' starts each bar at its bin's left edge; the default
        # ('center') would centre it on that edge and shift it half a bin left.
        ax.bar(bin_edges[:-1], hist_values, width=bin_widths, align='edge', bottom=bottom,
               edgecolor='black', alpha=0.7, label=income_group, color=color)
        bottom += hist_values  # Update bottom array to stack bars

    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Stacked Bar Chart for {var_pair}')
    ax.set_xticks(np.arange(-1, 1.1, 0.1))
    ax.set_xticklabels([round(-1 + i * 0.1, 1) for i in range(21)], rotation=90)  # Customize x-axis ticks
    ax.grid(axis='y')
    ax.set_yticks(np.arange(0, max(bottom) + 1, 1))  # Customize y-axis ticks
    ax.legend()  # Show legend with income group labels
    # Leave 10% headroom; fall back to 1 so the limits never collapse to (0, 0)
    ax.set_ylim(0, max(max(bottom), 1) * 1.1)

plt.tight_layout()
plt.show()


In [None]:
# Run the preprocessing pipeline step by step for a single country (USA)
usa_raw_series = get_country_serie_list('USA')
usa_aligned = shorten_series(usa_raw_series)
usa_filled = interpolator(usa_aligned)
usa_detrended = detrend_first_two(usa_filled)
usa_serie_list = scale_series_list(usa_detrended)

usa_df = pd.concat(usa_serie_list, axis=1)
display(usa_df)

# First preprocessed series of the list (the one named 'Gdp USA')
my_series = usa_serie_list[0]

# Plot that single series
my_series.plot(subplots=True, figsize=(10, 3), style='o-', colormap='viridis', sharex=False)

# Correlation rows of the USA whose first variable is Gdp
usa_gdp_rows = (final_all_corr['Code'] == 'USA') & (final_all_corr['Variable 1'] == 'Gdp')
final_all_corr[usa_gdp_rows]




In [None]:
import pandas as pd
import numpy as np

# Naive forecast built from the preprocessed USA table.
# Note: usa_df holds detrended and min-max scaled series, so the printed
# number is in those scaled units, not in currency.

# Work on a real copy so the columns added below never leak into usa_df
df = usa_df.copy()

# Calculate moving averages for each series
window_size = 3
ma_source_cols = ['CO2 USA', 'Lawindex USA', 'Politicalstab USA', 'Ginindex USA']
ma_cols = [f'{col}_MA' for col in ma_source_cols]
for col, ma_col in zip(ma_source_cols, ma_cols):
    df[ma_col] = df[col].rolling(window=window_size, min_periods=1).mean()

# Calculate correlations between GDP and other series
correlations = df[['Gdp USA'] + ma_cols].corr()
weights = correlations.loc['Gdp USA', ma_cols]  # Exclude GDP itself
# Normalize into a new Series instead of modifying a slice of `correlations` in place
weights = weights / weights.sum()

# Combine moving averages using correlation values as weights
df['Gdp_Forecast'] = np.dot(df[ma_cols], weights)

# Extrapolate to 2026 (assuming linear trend)
last_year = df.index.max()
# The slope is the change between the last two rows per year elapsed between
# them. The former hard-coded `last_year - 2022` was zero whenever the data
# ended in 2022 and unrelated to the actual row spacing otherwise.
year_step = df.index[-1] - df.index[-2]
slope = (df['Gdp_Forecast'].iloc[-1] - df['Gdp_Forecast'].iloc[-2]) / year_step
forecast_2026 = df['Gdp_Forecast'].iloc[-1] + slope * (2026 - last_year)
print(f"GDP Forecast for 2026: {forecast_2026:.6f}")
