In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import re

In [107]:
# Load every socio-economic source table from Datasets/SocioEconomic.

# Political-stability indicator: only the first 214 data rows are read.
pop_stab = pd.read_csv(
    '../Datasets/SocioEconomic/political_stability_1996_2021.csv',
    nrows=214,
)

# Per-country HDI / life-expectancy tables.
china_hdi_le = pd.read_csv(
    '../Datasets/SocioEconomic/China_HDI_LE1990_2021.csv'
)
s_korea_hdi_le = pd.read_csv(
    '../Datasets/SocioEconomic/SKorea_HDI_LE_1990_2021.csv'
)
us_hdi_le = pd.read_csv(
    '../Datasets/SocioEconomic/US_HDI_LE_1990_2021.csv'
)

# Population, urban population, GDP and crude-oil price tables.
pop = pd.read_csv(
    '../Datasets/SocioEconomic/population_1960_2022.csv'
)
urban_pop = pd.read_csv(
    '../Datasets/SocioEconomic/Urban_pop_WorldBank_1960_2021.csv'
)
gdp = pd.read_csv(
    '../Datasets/SocioEconomic/GDP_1960_2020.csv'
)
oil_prices = pd.read_csv(
    '../Datasets/SocioEconomic/crude_oil_price_1983_2023.csv'
)

# The inflation file is decoded as ISO-8859-1 rather than the default encoding.
inflation = pd.read_csv(
    '../Datasets/SocioEconomic/Inflation_data_1970_2022.csv',
    encoding='ISO-8859-1',
)

In [109]:
# Country codes whose rows are kept for the analysis
countries_of_interest = ['USA', 'CHN', 'KOR']

# Restrict the table to the rows of those countries
pop_stab = pop_stab.loc[pop_stab['Country Code'].isin(countries_of_interest)]

# Drop the "[YR...]" tag from every column label after the first four columns
pop_stab = pop_stab.rename(
    columns={
        tagged: re.sub(r'\s*\[YR.*\]', '', tagged)
        for tagged in pop_stab.columns[4:]
    }
)

# Cast all of those columns to float in a single call
pop_stab = pop_stab.astype({label: float for label in pop_stab.columns[4:]})

In [95]:
# Display the column labels to check which year columns the table contains
pop_stab.columns

Index(['Country Name', 'Country Code', 'Series Name', 'Series Code', '1996',
       '1998', '2000', '2002', '2003', '2004', '2005', '2006', '2007', '2008',
       '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021'],
      dtype='object')

In [110]:
# The column listing above goes 1996, 1998, 2000, 2002 and is yearly from
# 2002 on, so the years in between have no column; they are listed here.
missing_years = [1997, 1999, 2001]

In [111]:
def insert_missing_years(df, missing_years):
    """Add a column for each missing year as the mean of its neighbouring years.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table whose year columns are labelled with digit-only strings
        such as ``'1996'``.
    missing_years : iterable of int
        Years to insert. A year is filled only when both ``year - 1`` and
        ``year + 1`` are already columns; otherwise a message is printed and
        that year is skipped. Years are processed in the given order, so a
        year inserted earlier can serve as a neighbour for a later one.

    Returns
    -------
    pandas.DataFrame
        A new frame whose columns are "Country Name", "Country Code",
        "Series Name", "Series Code" followed by every digit-labelled column
        in ascending numeric order. Because the result is built with
        ``reindex``, any other column of ``df`` is not carried over, and an
        id column absent from ``df`` appears filled with NaN.
    """
    # Work on a copy so the caller's frame neither gains columns nor has its
    # neighbouring year columns overwritten by the numeric coercion below.
    result = df.copy()

    for year in missing_years:
        prev_year = str(year - 1)
        next_year = str(year + 1)

        # Both neighbours must exist to compute the midpoint.
        if prev_year in result.columns and next_year in result.columns:
            # Non-numeric entries become NaN so the average does not raise.
            result[prev_year] = pd.to_numeric(result[prev_year], errors='coerce')
            result[next_year] = pd.to_numeric(result[next_year], errors='coerce')
            result[str(year)] = (result[prev_year] + result[next_year]) / 2
        else:
            print(f"Cannot compute for year {year}, adjacent years data are not available.")

    # Order the columns: fixed id columns first, then years ascending.
    non_year_cols = ["Country Name", "Country Code", "Series Name", "Series Code"]
    year_cols = [str(i) for i in sorted(int(col) for col in result.columns if col.isdigit())]
    cols = non_year_cols + year_cols

    return result.reindex(columns=cols)

# Insert the years listed in missing_years and rebind pop_stab to the
# returned frame, whose year columns are sorted in ascending order.
pop_stab = insert_missing_years(pop_stab, missing_years)


In [112]:
# Summarise columns, non-null counts and dtypes to confirm the inserted
# year columns are present alongside the original ones.
pop_stab.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 40 to 203
Data columns (total 30 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country Name  3 non-null      object 
 1   Country Code  3 non-null      object 
 2   Series Name   3 non-null      object 
 3   Series Code   3 non-null      object 
 4   1996          3 non-null      float64
 5   1997          3 non-null      float64
 6   1998          3 non-null      float64
 7   1999          3 non-null      float64
 8   2000          3 non-null      float64
 9   2001          3 non-null      float64
 10  2002          3 non-null      float64
 11  2003          3 non-null      float64
 12  2004          3 non-null      float64
 13  2005          3 non-null      float64
 14  2006          3 non-null      float64
 15  2007          3 non-null      float64
 16  2008          3 non-null      float64
 17  2009          3 non-null      float64
 18  2010          3 non-null      f