In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [None]:
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2
import pycountry

## Compute each country's average share of GDP within its group

### Define the country groups following the IMF classification

In [None]:
# Members of the "advanced_economies" group.
# These names are compared verbatim (via `isin`) with the `country_name` column,
# so the spelling must stay exactly as written (e.g. "Korea. Republic of").
advanced_economies = [
    "Andorra", "Australia", "Austria", "Belgium", "Canada", "Croatia", "Cyprus", "Czech Republic",
    "Denmark", "Estonia", "Finland", "France", "Germany", "Greece", "Hong Kong SAR", "Iceland",
    "Ireland", "Israel", "Italy", "Japan", "Korea. Republic of", "Latvia", "Lithuania", "Luxembourg",
    "Macao SAR", "Malta", "Netherlands", "New Zealand", "Norway", "Portugal", "Puerto Rico", "San Marino",
    "Singapore", "Slovak Republic", "Slovenia", "Spain", "Sweden", "Switzerland", "Taiwan Province of China",
    "United Kingdom", "United States"
]

# Members of the "emerging_developing_europe" group (names matched verbatim
# against the `country_name` column).
emerging_developing_europe = [
    "Albania", "Belarus", "Bosnia and Herzegovina", "Bulgaria", "Hungary", "Kosovo", "Moldova",
    "Montenegro", "North Macedonia", "Poland", "Romania", "Russian Federation", "Serbia", "Türkiye. Republic of",
    "Ukraine"
]

# Members of the "emerging_developing_asia" group (names matched verbatim
# against the `country_name` column).
emerging_developing_asia = [
    "Bangladesh", "Bhutan", "Brunei Darussalam", "Cambodia", "China. People's Republic of", "Fiji",
    "India", "Indonesia", "Kiribati", "Lao P.D.R.", "Malaysia", "Maldives", "Marshall Islands",
    "Micronesia. Fed. States of", "Mongolia", "Myanmar", "Nauru", "Nepal", "Palau", "Papua New Guinea",
    "Philippines", "Samoa", "Solomon Islands", "Sri Lanka", "Thailand", "Timor-Leste", "Tonga", "Tuvalu",
    "Vanuatu", "Vietnam"
]

# Members of the "latin_america_caribbean" group (names matched verbatim
# against the `country_name` column).
latin_america_caribbean = [
    "Antigua and Barbuda", "Argentina", "Bahamas. The", "Barbados", "Belize", "Bolivia", "Brazil", "Chile",
    "Colombia", "Costa Rica", "Curacao", "Dominica", "Dominican Republic", "Ecuador", "El Salvador", "Grenada",
    "Guatemala", "Guyana", "Haiti", "Honduras", "Jamaica", "Mexico", "Nicaragua", "Panama", "Paraguay", "Peru",
    "Saint Kitts and Nevis", "Saint Lucia", "Saint Vincent and the Grenadines", "Sint Maarten", "Suriname",
    "Trinidad and Tobago", "Uruguay", "Venezuela"
]

# Members of the "sub_saharan_africa" group (names matched verbatim
# against the `country_name` column).
sub_saharan_africa = [
    "Angola", "Benin", "Botswana", "Burkina Faso", "Burundi", "Cabo Verde", "Cameroon", "Central African Republic",
    "Chad", "Comoros", "Congo. Dem. Rep. of the", "Congo. Republic of", "Côte d'Ivoire", "Equatorial Guinea",
    "Eritrea", "Eswatini", "Ethiopia", "Gabon", "Gambia. The", "Ghana", "Guinea", "Guinea-Bissau", "Kenya",
    "Lesotho", "Liberia", "Madagascar", "Malawi", "Mali", "Mauritius", "Mozambique", "Namibia", "Niger",
    "Nigeria", "Rwanda", "Senegal", "Seychelles", "Sierra Leone", "South Africa", "South Sudan. Republic of",
    "São Tomé and Príncipe", "Tanzania", "Togo", "Uganda", "Zambia", "Zimbabwe"
]

# Members of the "middle_east_central_asia" group (names matched verbatim
# against the `country_name` column).
middle_east_central_asia = [
    "Afghanistan", "Algeria", "Armenia", "Azerbaijan", "Bahrain", "Djibouti", "Egypt", "Georgia", "Iran",
    "Iraq", "Jordan", "Kazakhstan", "Kuwait", "Kyrgyz Republic", "Lebanon", "Libya", "Mauritania", "Morocco",
    "Oman", "Pakistan", "Qatar", "Saudi Arabia", "Somalia", "Sudan", "Syria", "Tajikistan", "Tunisia",
    "Turkmenistan", "United Arab Emirates", "Uzbekistan", "West Bank and Gaza", "Yemen"
]

# Group label -> list of member countries. The labels are the values later
# written into the `group` column; insertion order is preserved.
dict_groups = dict(
    advanced_economies=advanced_economies,
    emerging_developing_europe=emerging_developing_europe,
    emerging_developing_asia=emerging_developing_asia,
    latin_america_caribbean=latin_america_caribbean,
    sub_saharan_africa=sub_saharan_africa,
    middle_east_central_asia=middle_east_central_asia,
)

In [None]:
# Display the mapping to check the group definitions.
dict_groups

### Read csv, preprocess data

In [None]:
# Load the raw table from `imf_ppp.csv` (path is relative to the working directory).
df_unprocessed = pd.read_csv('imf_ppp.csv')

In [None]:
# Count, for every row, how many cells hold the "no data" placeholder
is_placeholder = df_unprocessed == "no data"
lines_no_data = is_placeholder.sum(axis=1)
# keep only the rows that contain at least one placeholder
no_data_count = lines_no_data.loc[lines_no_data.gt(0)]

In [None]:
# Remove the projection columns (years 2021 through 2028)
projection_years = list(map(str, range(2021, 2029)))
df_unprocessed = df_unprocessed.drop(columns=projection_years)

In [None]:
# Turn the "no data" placeholder into a real missing value (NaN)
df_unprocessed = df_unprocessed.replace(to_replace="no data", value=np.nan)

In [None]:
# The column holding the country names carries the dataset title as its header;
# give it a short, usable name instead.
df_unprocessed = df_unprocessed.rename(
    columns={
        "GDP current prices (Purchasing power parity billions of international dollars)": "country_name"
    }
)

In [None]:
# Create the `group` column: tag every row whose country name appears in one of
# the group lists with that group's label (unmatched rows stay NaN).
for group_label, members in dict_groups.items():
    belongs_to_group = df_unprocessed["country_name"].isin(members)
    df_unprocessed.loc[belongs_to_group, "group"] = group_label

In [None]:
# Manually fill the group for a few rows (presumably rows whose name did not
# match any group list - TODO confirm).
# NOTE(review): rows are addressed by hard-coded index labels (8, 40, 128); this
# silently targets the wrong rows if the CSV row order ever changes.
manual_groups = {
    8: "latin_america_caribbean",
    40: "sub_saharan_africa",
    128: "emerging_developing_europe",
}
for row_label, assigned_group in manual_groups.items():
    df_unprocessed.loc[row_label, "group"] = assigned_group

# row 128 also gets its country name overwritten
df_unprocessed.loc[128, "country_name"] = 'North Macedonia'

In [None]:
# Inspect the frame after the group assignment and the manual fixes.
df_unprocessed

In [None]:
# NOTE(review): same display as the previous cell - candidate for removal.
df_unprocessed

In [None]:
# Build `a`, a copy of the data in which every column except the two label
# columns is parsed as a number; values that cannot be parsed become NaN.
label_columns = ['country_name', 'group']
num_col = df_unprocessed.columns.drop(label_columns)
numeric_values = df_unprocessed[num_col].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)
a = df_unprocessed.copy()
a[num_col] = numeric_values

In [None]:
# Total GDP per group for every year column.
# Only the numeric year columns are summed. Since pandas 2.0 `GroupBy.sum()` no
# longer skips non-numeric columns by default, so summing the whole frame would
# concatenate the `country_name` strings, add a `country_name` column to the
# totals and break the merge / float conversion in the following cells.
a_aggregated = a.groupby('group')[list(num_col)].sum()

In [None]:
# Inspect the per-group totals.
a_aggregated

In [None]:
# Attach the group totals to every country row (inner join on `group`, so rows
# without a matching group are left out). Year columns present on both sides
# get the suffix `_x` (country value) and `_y` (group total).
merged_df = df_unprocessed.merge(
    a_aggregated,
    how='inner',
    on=['group'],
    suffixes=('_x', '_y'),
)

In [None]:
# For every year, express each country's GDP as a percentage of its group's total
for year in a_aggregated.columns:
    country_col = f'{year}_x'
    total_col = f'{year}_y'
    merged_df[country_col] = merged_df[country_col].astype(float)
    merged_df[total_col] = merged_df[total_col].astype(float)
    # fill_value=0: a value missing on one side only is treated as 0 before dividing
    share = merged_df[country_col].divide(merged_df[total_col], fill_value=0)
    merged_df[f'percentage_gdp_{year}'] = share * 100

In [None]:
# Keep only the rows of the advanced-economies group
is_advanced = merged_df['group'].eq('advanced_economies')
group_data = merged_df.loc[is_advanced]

In [None]:
# Standard deviation of each yearly share column, computed down the rows.
# NOTE(review): despite its name this holds one value per year, not per country,
# and the next cell reassigns `std_by_country`.
yearly_share_columns = [f'percentage_gdp_{year}' for year in range(1980, 2020)]
std_by_country = group_data[yearly_share_columns].std(axis=0)


In [None]:
# Per-country standard deviation of the yearly shares, 1980 through 2019
columns_to_calculate_std = list(f'percentage_gdp_{year}' for year in range(1980, 2020))
yearly_shares = group_data.loc[:, columns_to_calculate_std]
std_by_country = yearly_shares.std(axis='columns')

In [None]:
# Per-country mean of the yearly shares, 1980 through 2019
columns_to_calculate_std = list(f'percentage_gdp_{year}' for year in range(1980, 2020))
yearly_shares = group_data.loc[:, columns_to_calculate_std]
mean_by_country = yearly_shares.mean(axis='columns')

In [None]:
# Inspect the per-country standard deviations.
std_by_country

In [None]:
# Put each country's name next to the standard deviation of its share
# (both Series share the index of `group_data`).
df_std = pd.DataFrame(
    {
        'country_name': group_data['country_name'],
        'std': std_by_country,
    }
)

In [None]:
# Inspect the name / standard-deviation table.
df_std

In [None]:
# Share columns for 2000 through 2019 (the range stops before 2020).
# `create_chart_for_group` reads this module-level list to compute std and mean.
columns_to_calculate_std = [f'percentage_gdp_{year}' for year in range(2000, 2020)]

def create_chart_for_group(group_of_countries, first_year=1996, last_year=2020):
    """Plot each country's share of its group's GDP and print its std/mean share.

    Reads the module-level ``df_unprocessed`` frame (it must contain the
    ``country_name`` and ``group`` columns; every other column is treated as a
    year of GDP values) and the module-level ``columns_to_calculate_std`` list.

    Parameters
    ----------
    group_of_countries : str
        Value of the ``group`` column selecting the countries to chart.
    first_year, last_year : int, optional
        Inclusive range of years drawn on the chart (defaults: 1996 and 2020).

    Returns
    -------
    None
        The chart is shown with ``plt.show()`` and the summary table is printed.

    Raises
    ------
    ValueError
        If no row of ``df_unprocessed`` belongs to ``group_of_countries``.
    """
    # get data: numeric copy of the year columns (unparseable values become NaN)
    num_col = df_unprocessed.columns.drop(['country_name', 'group'])
    a = df_unprocessed.copy()
    a[num_col] = a[num_col].apply(pd.to_numeric, errors='coerce')

    # Sum only the year columns: since pandas 2.0 `GroupBy.sum()` also "sums"
    # (concatenates) string columns by default, which would add a `country_name`
    # column to the totals and break the merge and the loop below.
    a_aggregated = a.groupby('group')[list(num_col)].sum()

    # `<year>_x` is the country's value, `<year>_y` the total of its group
    merged_df = pd.merge(a, a_aggregated, on=['group'])
    for year in num_col:
        country_gdp = merged_df[f'{year}_x'].astype(float)
        group_gdp = merged_df[f'{year}_y'].astype(float)
        # fill_value=0: a value missing on one side only is treated as 0
        merged_df[f'percentage_gdp_{year}'] = country_gdp.divide(group_gdp, fill_value=0) * 100

    group_data = merged_df[merged_df['group'] == group_of_countries]
    if group_data.empty:
        raise ValueError(f"no rows found for group {group_of_countries!r}")

    # one line per country, read row-wise instead of re-filtering the frame
    # for every (country, year) pair
    years = list(range(first_year, last_year + 1))
    share_columns = [f'percentage_gdp_{year}' for year in years]
    fig, ax = plt.subplots(figsize=(10, 6))
    shares_by_country = group_data[share_columns].to_numpy(dtype=float)
    for country, shares in zip(group_data['country_name'], shares_by_country):
        ax.plot(years, shares, label=country)

    ax.legend()
    fig.suptitle(f'percentage of GDP within the group for {group_of_countries.title()}', fontsize=20)
    ax.set_xlabel('years', fontsize=18)
    ax.set_ylabel('percentage GDP', fontsize=16)
    plt.show()

    # compute variation: std and mean of each country's share over the
    # columns listed in the module-level `columns_to_calculate_std`
    df_std = pd.DataFrame(
        {
            'country_name': group_data['country_name'],
            'std': group_data[columns_to_calculate_std].std(axis=1),
            'mean': group_data[columns_to_calculate_std].mean(axis=1),
        }
    )
    print(df_std)


In [None]:
# Chart and std/mean table for the "emerging_developing_europe" group.
create_chart_for_group("emerging_developing_europe")

In [None]:
# Chart and std/mean table for the "emerging_developing_asia" group.
create_chart_for_group("emerging_developing_asia")

In [None]:
# Chart and std/mean table for the "latin_america_caribbean" group.
create_chart_for_group("latin_america_caribbean")

In [None]:
# Chart and std/mean table for the "sub_saharan_africa" group.
create_chart_for_group("sub_saharan_africa")

In [None]:
# Chart and std/mean table for the "middle_east_central_asia" group.
create_chart_for_group("middle_east_central_asia")

## Linear regression on GDP per group to compute each group's share of GDP

# Use data from IMF 

## Preprocess data 

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the per-region table from `ppp_region.csv`; the file is semicolon-separated.
ppp_df = pd.read_csv('ppp_region.csv', sep=";")

In [None]:
# Drop the row with index label 0 and the year columns 1980-1991 and 2024-2028,
# leaving the years 1992 through 2023.
years_before_window = [str(year) for year in range(1980, 1992)]
years_after_window = [str(year) for year in range(2024, 2029)]
ppp_df = ppp_df.drop(index=0, columns=years_before_window + years_after_window)

In [None]:
# Year columns (1992 through 2023) whose values are converted to floats in the next cell.
columns_to_modify = [str(year) for year in range(1992, 2024)]


In [None]:
# Replace commas with dots in the year columns, then cast them to float
# (presumably the file writes numbers with a decimal comma - TODO confirm).
with_decimal_point = ppp_df[columns_to_modify].replace(',', '.', regex=True)
ppp_df[columns_to_modify] = with_decimal_point.astype(float)

In [None]:
# The label column carries the dataset title as its header; rename it to `group_name`
title_header = 'GDP current prices (Purchasing power parity billions of international dollars)'
ppp_df = ppp_df.rename(columns={title_header: 'group_name'})

### Fit a linear model on PPP GDP per region

In [None]:
# Fit one linear trend per group (year -> GDP) on the years 2000 through 2022
years_train = list(map(str, range(2000, 2023)))

# transpose so that rows are years and columns are the groups
train_data = ppp_df[years_train].T
train_data.columns = ppp_df['group_name'].values
# discard every year in which at least one group has no value
train_data = train_data.dropna()

# single feature: the year as an integer, shaped (n_years, 1)
X_train = train_data.index.astype(int).to_numpy().reshape(-1, 1)
# one target column per group
y_train = train_data.to_numpy()
model = LinearRegression().fit(X_train, y_train)

### Predict on 2023 to 2100, plot the GDP percentage according to model

In [None]:
# Years to extrapolate: 2023 through 2100 inclusive. `np.arange` excludes its
# stop value, hence 2101 (the previous stop of 2100 ended the forecast at 2099).
# The years are reshaped to a column because the model takes a 2-D feature matrix.
x_predict = np.arange(2023, 2101).reshape(-1, 1)
y_predict = model.predict(x_predict)

In [None]:
# Share of each group in the predicted total of the same year, in percent
yearly_total = y_predict.sum(axis=1, keepdims=True)
percentage_y = np.divide(y_predict, yearly_total) * 100

In [None]:
# plot estimated gdp percentage per group
fig, ax = plt.subplots(figsize=(10, 6))

# The columns of `y_predict` follow the columns of `train_data` (the regression
# targets), so the legend labels are taken from there. Indexing the keys of
# `dict_groups` by position mislabels the lines (or raises IndexError) whenever
# the CSV lists the groups in another order or contains a different number of rows.
for i, group_name in enumerate(train_data.columns):
    ax.plot(x_predict, percentage_y[:, i], label=f'Group {group_name}', linestyle='--')
ax.legend()
ax.set_xlabel('years')
ax.set_ylabel('percentage GDP')
fig.suptitle('Estimated  percentage of GDP per group', fontsize=20)


In [None]:
# get parameters of model: fitted coefficients and intercepts
# NOTE(review): `a` rebinds the name used earlier for the numeric copy of the
# GDP table, so re-running the earlier `a.groupby(...)` cell after this one fails.
a, b = model.coef_, model.intercept_

### Compute the mean share of each country within its group

In [None]:
# Share columns for 2000 through 2019, averaged per country in the next cell
# (same value as the list defined before `create_chart_for_group`).
columns_to_calculate_std = [f'percentage_gdp_{year}' for year in range(2000, 2020)]


In [None]:
# One row per country: its name, its group and its mean share of the group's GDP
# over the columns listed in `columns_to_calculate_std`.
mean_share = merged_df[columns_to_calculate_std].mean(axis=1)
df_country_percentage = merged_df[['country_name', 'group']].assign(
    mean_percentage=mean_share
)

In [None]:
# Inspect the per-country mean shares.
df_country_percentage

In [None]:
# export dataframe if needed 
