In [None]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from sklearn import preprocessing
from functools import reduce
import warnings; warnings.simplefilter('ignore')

In [None]:
# Data cleaning function

def Country_Swap(target, acquirer):
    """Harmonise the 'Country' names of ``target`` with those of ``acquirer``.

    Two passes are made:

    1. A fixed list of literal substring swaps (e.g. 'Czechia' -> 'Czech Rep.')
       is applied to the 'Country' column of *both* frames.
    2. Every ``target`` name still absent from ``acquirer`` is fuzzy-matched
       against the acquirer names. The first acquirer name (in order of first
       appearance) that satisfies ``fuzz.ratio >= 80``,
       ``fuzz.partial_ratio >= 90`` or ``fuzz.token_set_ratio >= 90`` replaces
       the target spelling.

    Both frames are modified in place (their 'Country' column is reassigned)
    and are also returned, so callers may either rebind the result or rely on
    the mutation.

    Parameters
    ----------
    target : pandas.DataFrame
        Frame whose 'Country' values are rewritten towards ``acquirer``'s
        spellings.
    acquirer : pandas.DataFrame
        Frame supplying the reference spellings; only the literal swaps of
        pass 1 are applied to it.

    Returns
    -------
    tuple
        ``(target, acquirer)`` -- the same objects that were passed in.
    """
    # Ordered (old, new) pairs. Order matters: e.g. 'Republic' -> 'Rep.' runs
    # before 'Syrian Arab Rep.' -> 'Syria', so both spellings end up as 'Syria'.
    swaps = (
        ('Korea (Rep. of)', 'Korea'),
        ('Democratic', 'Dem.'),
        ('Republic', 'Rep.'),
        ('Czechia', 'Czech Rep.'),
        ('Slovakia', 'Slovak Rep.'),
        ('West Bank and Gaza', 'Palestine'),
        ('Kyrgyzstan', 'Kyrgyz Rep.'),
        ('T.F.Y.R. Macedonia', 'North Macedonia'),
        ('Brunei Darussalam', 'Brunei'),
        ('Syrian Arab Rep.', 'Syria'),
    )
    for old, new in swaps:
        # regex=False: the patterns contain '(', ')' and '.', which must be
        # matched literally rather than interpreted as regex syntax.
        acquirer['Country'] = acquirer['Country'].str.replace(old, new, regex=False)
        target['Country'] = target['Country'].str.replace(old, new, regex=False)

    # Names that never take part in fuzzy matching, on either side --
    # presumably because they sit too close to other country names.
    banned_list = {'Australia', 'Austria', 'Iceland', 'Ireland', 'Congo', 'China'}

    # Missing values are dropped so they are never handed to the string matcher.
    target_list = list(target['Country'].dropna().unique())
    acquirer_list = list(acquirer['Country'].dropna().unique())
    acquirer_names = set(acquirer_list)
    candidates = [name for name in acquirer_list if name not in banned_list]

    for i in target_list:
        # Already aligned, or explicitly excluded: nothing to do for this name.
        if i in acquirer_names or i in banned_list:
            continue
        for j in candidates:
            if (fuzz.ratio(i, j) >= 80
                    or fuzz.partial_ratio(i, j) >= 90
                    or fuzz.token_set_ratio(i, j) >= 90):
                target['Country'] = target['Country'].replace(to_replace=i, value=j)
                # Once rewritten, `i` no longer occurs in target, so any later
                # candidate could not change anything: first match wins.
                break
    return target, acquirer

In [None]:
# Load the raw inputs: GDP / internet use, CO2 emissions and population.

gdp_and_internet_use = pd.read_csv(
    '../data/gdp_and_internet_use_to_be_imported_and_merged.csv'
)

# header=0 together with names= replaces the file's own header row with the
# column names listed here.
emissions = pd.read_csv(
    "../data/CO2 Emissions.csv",
    header=0,
    names=['Country', 'Year', 'CO2 (kts) Emitted'],
)

population = pd.read_csv("../data/Population 1990 - 2020.csv")

In [None]:
# Areas removed from the population table before it is merged with the
# country-level data.
excluded_population_areas = [
    "United States Virgin Islands",
    "Australia/New Zealand",
    "Oceania (excluding Australia and New Zealand)",
]

# One chained, non-mutating pipeline: cast the counts to int, align the column
# names with the other tables, and discard the unused 'Variant' column.
population = (
    population
    .astype({'Value': int})
    .rename(columns={"Year(s)": "Year", "Country or Area": "Country"})
    .drop(columns='Variant')
)

# .copy() makes the filtered result an independent frame, so assigning to its
# columns later does not trigger SettingWithCopyWarning.
population = population[~population['Country'].isin(excluded_population_areas)].copy()

In [None]:
# Align the emissions country names with the GDP / internet-use spellings.
emissions, gdp_and_internet_use = Country_Swap(emissions, gdp_and_internet_use)

# No join keys are passed, so pandas joins on every column name the two frames
# have in common; the inner join keeps only rows present in both.
gaiu_column_labels = {
    'GDP_Per_Capita': 'GDP Per Capita',
    'Internet_Use_Pct': '% Internet Use',
}
gaiu_emissions = gdp_and_internet_use.merge(emissions, how='inner')
gaiu_emissions = gaiu_emissions.rename(columns=gaiu_column_labels)

In [None]:
# Align the population country names with those of the merged frame.
population, gaiu_emissions = Country_Swap(population, gaiu_emissions)

# Keep only the country/year pairs that appear in both tables.
gaiu_emissions = gaiu_emissions.merge(population, how='inner', on=["Country", "Year"])

# Emissions divided by the 'Value' column (presumably the population figure
# carried in by the merge above -- confirm).
emissions_per_value = gaiu_emissions["CO2 (kts) Emitted"].div(gaiu_emissions["Value"])
gaiu_emissions["CO2 (kts) Emitted Per 1000"] = emissions_per_value

In [None]:
# Single-year snapshots of the merged table. .copy() makes each snapshot an
# independent frame, so assigning to its columns later does not trigger
# SettingWithCopyWarning.
gaiu_emissions_2014 = gaiu_emissions[gaiu_emissions['Year'] == 2014].copy()
gaiu_emissions_1990 = gaiu_emissions[gaiu_emissions['Year'] == 1990].copy()

In [None]:
relevant_variables = ['GDP Per Capita', '% Internet Use', 'CO2 (kts) Emitted', 'CO2 (kts) Emitted Per 1000']

# Min-max scale every relevant variable separately within each yearly frame.
# A fresh scaler is created per frame and re-fitted for every column, so no
# column or year shares its min/max with another.
for yearly_frame in (gaiu_emissions_2014, gaiu_emissions_1990):
    min_max_scaler = preprocessing.MinMaxScaler()
    for column in relevant_variables:
        yearly_frame[column] = min_max_scaler.fit_transform(yearly_frame[[column]])

# Stack the two scaled years, 1990 rows first.
gaiu_emissions_1990_2014 = pd.concat([gaiu_emissions_1990, gaiu_emissions_2014])

In [None]:
# Write the full merged table and the scaled 1990/2014 subset to disk,
# without the index column.
for frame_to_export, export_path in (
    (gaiu_emissions, '../data/gaiu_emissions_to_be_imported.csv'),
    (gaiu_emissions_1990_2014, '../data/gaiu_emissions_1990_2014_to_be_imported.csv'),
):
    frame_to_export.to_csv(export_path, index=False)

In [None]:
# Adding income inequality measures to GDP per capita dataframe

def Take_Out_the_NAs_On_Read(values):
    """Replace a missing CSV cell with 0; return any other value unchanged.

    Used as a ``converters=`` callback for ``pandas.read_csv``, which hands
    the converter each raw cell -- normally a string -- so a missing value
    usually arrives as an empty string or an NA marker rather than a float
    NaN.

    Parameters
    ----------
    values : object
        A single cell value.

    Returns
    -------
    object
        ``0`` if ``values`` is ``None``, a float NaN, or a blank / NA-marker
        string; otherwise ``values`` itself, untouched.
    """
    # Lower-cased spellings treated as "missing" when they make up the whole cell.
    na_tokens = ('', 'na', 'n/a', '#n/a', 'nan', 'null', 'none')

    if values is None:
        return 0
    if isinstance(values, str):
        return 0 if values.strip().lower() in na_tokens else values
    # NaN never compares equal to anything (including itself), so an `==`
    # test cannot detect it; np.isnan must be used instead.
    if isinstance(values, (float, np.floating)) and np.isnan(values):
        return 0
    return values

gdp_df = pd.read_csv('../data/gdp_df_to_be_imported_and_merged.csv')

# For the three indicator files only the first three columns are read and a
# fixed number of trailing lines is skipped (presumably footnote/metadata rows
# -- confirm the counts if the source files are refreshed).
# skipfooter is only supported by the Python parser, so the engine is named
# explicitly instead of relying on pandas' silent fallback from the C engine.
merged_df_1 = pd.read_csv("../data/Top Ten Income Share.csv",
                          skipfooter=253,
                          usecols=[0, 1, 2],
                          engine='python')
merged_df_2 = pd.read_csv("../data/Poverty Headcount $5.50.csv",
                          skipfooter=260,
                          usecols=[0, 1, 2],
                          converters={"Value": Take_Out_the_NAs_On_Read},
                          engine='python')
merged_df_3 = pd.read_csv("../data/Poverty Headcount $1.90.csv",
                          skipfooter=261,
                          usecols=[0, 1, 2],
                          converters={"Value": Take_Out_the_NAs_On_Read},
                          engine='python')

In [None]:
to_be_merged_dataframes = [gdp_df, merged_df_1, merged_df_2, merged_df_3]

# gdp_df (first in the list) is the reference frame and is left untouched; it
# is skipped by position rather than with a full DataFrame.equals comparison.
# The remaining frames are normalised IN PLACE, because the list and the
# merged_df_* names must keep pointing at the same, updated objects.
for indicator_df in to_be_merged_dataframes[1:]:
    indicator_df.rename(columns={"Country or Area": "Country"}, inplace=True)
    indicator_df['Year'] = indicator_df['Year'].astype(int)
    # Built-in float: the np.float alias was deprecated and later removed
    # from NumPy.
    indicator_df['Value'] = indicator_df['Value'].astype(float)
    # Return value deliberately ignored: Country_Swap rewrites the 'Country'
    # columns of both frames in place.
    Country_Swap(indicator_df, gdp_df)

In [None]:
# Give each indicator's generic "Value" column a descriptive name. The rename
# is done in place so that every existing reference to these frames sees it.
indicator_value_labels = (
    (merged_df_1, "% Income Held by Top 10%"),
    (merged_df_2, "% Population Living on <$5.50/Day"),
    (merged_df_3, "% Population Living on <$1.90/Day"),
)
for indicator_frame, value_label in indicator_value_labels:
    indicator_frame.rename(columns={"Value": value_label}, inplace=True)

In [None]:
def _inner_join_on_country_year(left, right):
    """Inner-join two frames on their shared 'Country' and 'Year' columns."""
    return pd.merge(left, right, on=['Country', 'Year'], how='inner')


# Fold the list left to right, so only country/year pairs present in every
# frame survive.
fully_merged = reduce(_inner_join_on_country_year, to_be_merged_dataframes)

In [None]:
# Country names flagged as sanctioned in the 'US Sanctions' column.
# NOTE(review): matching is by exact string, so an entry only takes effect if
# it is spelled exactly as in fully_merged['Country'] -- verify spellings.
US_Sanctions_List = ['North Macedonia',
                     'Yugoslavia',
                     'Serbia',
                     'Belarus',
                     'Burma',
                     'Burundi',
                     'Central African Rep.',
                     'Cuba',
                     'Dem. Rep. of Congo',
                     'Hong Kong',
                     'Iran',
                     'Iraq',
                     'Lebanon',
                     'Libya',
                     'Mali',
                     'Nicaragua',
                     'North Korea',
                     'Somalia',
                     'Sudan',
                     'South Sudan',
                     'Syria',
                     'Crimea',
                     'Russia',
                     'Venezuela',
                     'Yemen',
                     'Zimbabwe']

# Assign the boolean mask directly. Writing through
# fully_merged['US Sanctions'].loc[...] is chained assignment, which may update
# a temporary copy and leave the column all False.
fully_merged['US Sanctions'] = fully_merged['Country'].isin(US_Sanctions_List)

In [None]:
# Persist the merged GDP / inequality / sanctions table, without the index column.
fully_merged_output_path = '../data/fully_merged_to_be_imported.csv'
fully_merged.to_csv(fully_merged_output_path, index=False)