In [1]:
import pandas as pd
import numpy as np 
import os
from tqdm import tqdm

In [2]:
# Load the integrated dataset and discard the rural/urban breakdown columns,
# keeping only the "total_*" water and sanitation service columns.
raw_df = pd.read_csv("data/integrated/integrated_data.csv").drop(
    columns=[
        'rural_safe_water_service',
        'urban_safe_water_service',
        'rural_safe_sanitation_service',
        'urban_safe_sanitation_service',
    ]
)

# Make sure per-capita health expenditure is stored as a float column.
raw_df = raw_df.astype({'health_expenditure_per_capita': float})

# Peek at the last rows as a quick sanity check of the load.
raw_df.tail()

Unnamed: 0,Country,location_code,country_code,Type,Year,total_population,male_population,female_population,population_density,life_expectancy_at_birth,...,gdp_growth,gdp_per_capita,inflation_change,total_safe_water_service,total_safe_sanitation_service,health_expenditure_over_gdp,health_expenditure_per_capita,Region,Subregion,Status
4065,Zimbabwe,716,ZWE,Country/Area,2017,14751.101,6940.631,7810.471,38.131,60.709,...,5.2,1481.241,0.9,36.94,36.94,6.36,92.25,AFRICA,Eastern Africa,Developing
4066,Zimbabwe,716,ZWE,Country/Area,2018,15052.184,7086.002,7966.181,38.91,61.414,...,5.0,2529.401,10.6,36.36,36.36,4.67,114.6,AFRICA,Eastern Africa,Developing
4067,Zimbabwe,716,ZWE,Country/Area,2019,15354.608,7231.989,8122.618,39.691,61.292,...,-6.3,1747.346,255.3,35.77,35.77,3.23,54.81,AFRICA,Eastern Africa,Developing
4068,Zimbabwe,716,ZWE,Country/Area,2020,15669.666,7385.22,8284.447,40.506,61.124,...,-7.8,1771.29,557.2,35.19,35.19,2.95,50.68,AFRICA,Eastern Africa,Developing
4069,Zimbabwe,716,ZWE,Country/Area,2021,15993.524,7543.69,8449.834,41.343,59.253,...,8.4,2323.09,98.5,34.61,34.61,2.79,62.74,AFRICA,Eastern Africa,Developing


In [3]:
def _find_fill_value(df, indicator, year, country_mask, cell_mask):
    """Return the replacement value for one missing cell, or None if no source has data.

    Sources are tried in priority order: the country's nearest known value in
    the following three years, then in the preceding three years, then the
    mean over rows sharing Subregion and Status, then the mean over rows
    sharing Region.
    """
    known = df[indicator].notnull()
    country_known = df.loc[country_mask & known, ['Year', indicator]]

    # Step 1: nearest known value in the next 3 years.
    later = country_known[country_known['Year'].between(year + 1, year + 3)]
    if not later.empty:
        return later.loc[later['Year'] == later['Year'].min(), indicator].iloc[0]

    # Step 2: nearest known value in the previous 3 years.
    earlier = country_known[country_known['Year'].between(year - 3, year - 1)]
    if not earlier.empty:
        return earlier.loc[earlier['Year'] == earlier['Year'].max(), indicator].iloc[0]

    # Step 3: mean over all rows with the same Subregion and Status.
    # The mean is not restricted to the same year and includes values that
    # were filled earlier in the run, because the frame is mutated in place.
    subregion = df.loc[cell_mask, 'Subregion'].iloc[0]
    status = df.loc[cell_mask, 'Status'].iloc[0]
    peers = df.loc[(df['Subregion'] == subregion) & (df['Status'] == status) & known, indicator]
    if not peers.empty:
        return peers.mean()

    # Step 4: mean over all rows with the same Region.
    region = df.loc[cell_mask, 'Region'].iloc[0]
    region_peers = df.loc[(df['Region'] == region) & known, indicator]
    if not region_peers.empty:
        return region_peers.mean()

    return None


def fill_missing_values(df):
    """Fill missing indicator values in ``df`` using a fixed priority of sources.

    For every country, years are visited from latest to earliest, and for each
    indicator a missing cell is filled from the first source that has data:

    1. the country's nearest non-null value within the next 3 years;
    2. the country's nearest non-null value within the previous 3 years;
    3. the mean of the indicator over rows with the same Subregion and Status;
    4. the mean of the indicator over rows with the same Region.

    Because filling happens in place and years are visited in descending
    order, a value filled for a later year can itself serve as the source for
    an earlier year.

    Args:
        df: DataFrame with 'Country', 'Year', 'Region', 'Subregion', 'Status'
            and the indicator columns listed below. It is modified in place.

    Returns:
        The same DataFrame object, with missing indicator cells filled where
        any source had data. Cells with no source at all stay missing.
    """
    indicators = ['gdp', 'inflation_change', 'total_safe_water_service', 'total_safe_sanitation_service',
                  'health_expenditure_over_gdp', 'health_expenditure_per_capita']
    years = sorted(df['Year'].unique(), reverse=True)  # latest year first

    # 'Year' and 'Country' are never modified below, so these masks are loop-invariant.
    year_masks = {year: df['Year'] == year for year in years}

    for country in tqdm(df['Country'].unique()):
        country_mask = df['Country'] == country
        for year in years:
            row_mask = country_mask & year_masks[year]
            # The country may have no row for this year (unbalanced panel);
            # there is nothing to fill and no row to read Subregion/Region from.
            if not row_mask.any():
                continue

            for indicator in indicators:
                cell_mask = row_mask & df[indicator].isnull()
                # Skip the lookups entirely when the value is already present.
                if not cell_mask.any():
                    continue

                fill_value = _find_fill_value(df, indicator, year, country_mask, cell_mask)
                if fill_value is not None:
                    df.loc[cell_mask, indicator] = fill_value

    return df

# Run the filling on a copy so that raw_df keeps the values as loaded.
df = fill_missing_values(df=raw_df.copy())

  0%|          | 0/185 [00:00<?, ?it/s]

100%|██████████| 185/185 [00:57<00:00,  3.23it/s]


In [4]:
# Derive GDP per capita where it is missing.
# NOTE(review): the 1e6 factor presumably reconciles the units of 'gdp' and
# 'total_population' -- confirm against the data dictionary.
df['gdp_per_capita'] = np.where(df['gdp_per_capita'].isnull(),
                                df['gdp'] / df['total_population'] * 1e6,
                                df['gdp_per_capita'])

# Derive missing GDP growth as the percentage change from the previous row of
# the SAME country. Shifting within each country keeps a country's first row
# from being compared against the last GDP value of the preceding country.
# Rows are assumed to be ordered by Year within each Country, as in the
# displayed frame.
previous_gdp = df.groupby('Country', sort=False)['gdp'].shift(1)
df['gdp_growth'] = np.where(df['gdp_growth'].isnull(),
                            100 * (df['gdp'] - previous_gdp) / previous_gdp,
                            df['gdp_growth'])

# Anything still missing (including a country's first-year growth, which has
# no previous row) is set to 0.
df.fillna(0, inplace=True)

In [5]:
# Persist the filled dataset without writing the row index as a column.
df.to_csv(path_or_buf="data/integrated/integrated_data_fillna.csv", index=False)

In [6]:
# Shape of the array of distinct country names in the filled frame.
pd.unique(df['Country']).shape

(185,)

In [7]:
# Show the filled DataFrame for a final visual check.
df

Unnamed: 0,Country,location_code,country_code,Type,Year,total_population,male_population,female_population,population_density,life_expectancy_at_birth,...,gdp_growth,gdp_per_capita,inflation_change,total_safe_water_service,total_safe_sanitation_service,health_expenditure_over_gdp,health_expenditure_per_capita,Region,Subregion,Status
0,Afghanistan,4,AFG,Country/Area,2000,19542.982,9815.442,9727.541,30.099,55.298,...,,223.456175,5.1,20.97,20.97,9.44,17.01,ASIA,Southern Asia,Developing
1,Afghanistan,4,AFG,Country/Area,2001,19688.632,9895.467,9793.166,30.323,55.798,...,0.0,221.803120,5.1,20.98,20.98,9.44,17.01,ASIA,Southern Asia,Developing
2,Afghanistan,4,AFG,Country/Area,2002,21000.256,10562.202,10438.055,32.343,56.454,...,0.0,233.433000,5.1,22.54,22.54,9.44,17.01,ASIA,Southern Asia,Developing
3,Afghanistan,4,AFG,Country/Area,2003,22645.130,11397.483,11247.647,34.876,57.344,...,8.7,233.755000,35.7,24.10,24.10,8.94,17.81,ASIA,Southern Asia,Developing
4,Afghanistan,4,AFG,Country/Area,2004,23553.551,11862.726,11690.825,36.276,57.944,...,0.7,254.259000,16.4,25.67,25.67,9.81,21.43,ASIA,Southern Asia,Developing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4065,Zimbabwe,716,ZWE,Country/Area,2017,14751.101,6940.631,7810.471,38.131,60.709,...,5.2,1481.241000,0.9,36.94,36.94,6.36,92.25,AFRICA,Eastern Africa,Developing
4066,Zimbabwe,716,ZWE,Country/Area,2018,15052.184,7086.002,7966.181,38.910,61.414,...,5.0,2529.401000,10.6,36.36,36.36,4.67,114.60,AFRICA,Eastern Africa,Developing
4067,Zimbabwe,716,ZWE,Country/Area,2019,15354.608,7231.989,8122.618,39.691,61.292,...,-6.3,1747.346000,255.3,35.77,35.77,3.23,54.81,AFRICA,Eastern Africa,Developing
4068,Zimbabwe,716,ZWE,Country/Area,2020,15669.666,7385.220,8284.447,40.506,61.124,...,-7.8,1771.290000,557.2,35.19,35.19,2.95,50.68,AFRICA,Eastern Africa,Developing
