In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('../datasets/merged_df_mon.csv')
df.head()

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
0,Afghanistan,AFG,74.75,99.169998,58.06758,11.792727,-314602.0,0.1,638.0,64.8,...,65.929134,43.448181,0.945227,27.710843,8.0,9.0,,91.89,55.59555,0.509
1,Angola,AGO,35.161,103.419998,45.237371,3.930376,32066.0,1.9,241.0,80.6,...,44.686107,10.799292,2.507985,30.454545,36.0,36.0,,55.5,23.916555,0.582
2,Albania,ALB,40.617,103.980003,42.670839,22.149635,-69998.0,0.1,15.0,9.4,...,125.710352,443.020376,1.108696,27.857143,5.0,5.0,16.0,149.54,104.870693,0.792
3,Andorra,AND,11.85,,40.042553,1.744681,,,,3.0,...,104.332411,3415.717495,,32.142857,,,,2.01,163.823404,0.867
4,United Arab Emirates,ARE,13.752,104.18,5.404112,0.626584,200000.0,0.1,3.0,7.2,...,208.9786,1285.099111,,22.5,8.0,9.0,63.0,2899.67,133.58499,0.889


### Checking Null Values

In [3]:
df.isnull().sum()

Country Name                    0
Country Code                    0
rural_pop_percent_17            1
food_production_index_17        2
ag_land_area_17                 1
arable_land_percent_2017        2
net_migration_2017              6
hiv_prevalence_2017            55
mat_mortality_ratio_2017        7
under5_mortality_ratio_2017     2
tubercul_incidence_2017         1
elec_access_2017                1
ren_energy_percent_2017         0
co2_emissions_2017              2
pop_air_pollution_2017          4
foreign_dir_inv_2017            4
atm_access_2017                17
adol_fertility_rate_2017        6
fem_labor_part_rate_2017       11
male_labor_part_rate_2017      11
fertility_rate_2017             5
dpt_immuniz_rate_2017           2
undernourished_rate_2017       33
cell_subscriptions_per100       1
internet_per_mil_2017           0
military_exp_2017              37
women_seats_percent_2017        5
male_bus_start_2017             4
female_bus_start_2017           4
patent_apps_20

## Data Cleaning

#### 1. Rural Population %

In [4]:
df['rural_pop_percent_17'].describe()

count    186.000000
mean      41.836919
std       23.193779
min        0.000000
25%       22.436000
50%       41.837500
75%       60.423250
max       87.294000
Name: rural_pop_percent_17, dtype: float64

In [5]:
df[df['rural_pop_percent_17'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
50,Eritrea,ERI,,101.370003,75.168317,6.831683,-199290.0,0.6,480.0,43.3,...,20.363955,0.0,,22.0,84.0,84.0,,13.65,33.79103,0.456


Eritrea is the only country where data is unavailable. Based on geographic peers and our own estimations, we are going to assign Eritrea a value in line with the 75% value.

In [6]:
df.loc[df['Country Name'] == 'Eritrea', ['rural_pop_percent_17']] = df['rural_pop_percent_17'].quantile(0.75)

#### 2. Food Production Index

In [7]:
df['food_production_index_17'].describe()

count    185.000000
mean     103.801459
std        8.490092
min       79.320000
25%       99.000000
50%      102.830002
75%      107.400002
max      139.860001
Name: food_production_index_17, dtype: float64

In [8]:
df[df['food_production_index_17'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
3,Andorra,AND,11.85,,40.042553,1.744681,,,,3.0,...,104.332411,3415.717495,,32.142857,,,,2.01,163.823404,0.867
135,Palau,PLW,20.635,,9.347826,0.652174,,,,18.6,...,,224.605537,,12.5,28.0,28.0,,5.96,38.715217,0.822


Palau and Andorra are the countries where data is unavailable. Based on geographic peers and our own estimations, we are going to assign both countries the median value of the metric as we do not believe that either countries will have a particularly extreme value 

In [9]:
df.loc[df['Country Name'] == 'Andorra', ['food_production_index_17']] = df['food_production_index_17'].median()
df.loc[df['Country Name'] == 'Palau', ['food_production_index_17']] = df['food_production_index_17'].median()

#### 3. Agricultural Land Area %

In [10]:
df['ag_land_area_17'].describe()

count    186.000000
mean      38.996556
std       21.473358
min        0.538462
25%       22.643333
50%       39.969657
75%       54.616763
max       80.769413
Name: ag_land_area_17, dtype: float64

In [11]:
df[df['ag_land_area_17'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
140,West Bank and Gaza,PSE,24.106,105.870003,,,-52816.0,,27.0,18.1,...,84.179796,547.723189,,,43.0,44.0,,324.96,740.000831,0.708


West Bank and Gaza is the only country where data is unavailable. Based on our own estimations, we are going to assign West Bank and Gaza a value in line with the 25% value.

In [12]:
df.loc[df['Country Name'] == 'West Bank and Gaza', ['ag_land_area_17']] = df['ag_land_area_17'].quantile(0.25)

#### 4. Arable Land Area %

In [13]:
df['arable_land_percent_2017'].describe()

count    185.000000
mean      14.672965
std       13.633534
min        0.086281
25%        4.129606
50%       10.985332
75%       21.929825
max       59.593839
Name: arable_land_percent_2017, dtype: float64

In [14]:
df[df['arable_land_percent_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
140,West Bank and Gaza,PSE,24.106,105.870003,22.643333,,-52816.0,,27.0,18.1,...,84.179796,547.723189,,,43.0,44.0,,324.96,740.000831,0.708
153,South Sudan,SSD,80.654,102.809998,45.151463,,-870998.0,2.2,1150.0,97.9,...,25.561301,0.183305,2.354538,28.45953,13.0,13.0,,6.4,17.265411,0.429


West Bank and Gaza and South Sudan are the only countries where data is unavailable. Based on our own estimations, we are going to assign West Bank and Gaza a value in line with the 25% value; while South Sudan will be assigned a value in line with the median value

In [15]:
df.loc[df['Country Name'] == 'West Bank and Gaza', ['arable_land_percent_2017']] = df['arable_land_percent_2017'].quantile(0.25)
df.loc[df['Country Name'] == 'South Sudan', ['arable_land_percent_2017']] = df['arable_land_percent_2017'].median()

#### 5. Net Migration

In [16]:
df['net_migration_2017'].describe()

count    1.810000e+02
mean     4.013564e+03
std      6.483328e+05
min     -3.266243e+06
25%     -5.665800e+04
50%     -4.999000e+03
75%      5.600000e+04
max      4.774029e+06
Name: net_migration_2017, dtype: float64

In [17]:
df[df['net_migration_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
3,Andorra,AND,11.85,102.830002,40.042553,1.744681,,,,3.0,...,104.332411,3415.717495,,32.142857,,,,2.01,163.823404,0.867
45,Dominica,DMA,29.819,99.660004,33.333333,8.0,,,,33.5,...,105.278625,6968.933669,,25.0,12.0,12.0,,12.5,95.28,0.738
91,St. Kitts and Nevis,KNA,69.227,94.040001,23.076923,19.230769,,,,16.2,...,147.714478,3113.229303,,13.333333,18.5,18.5,,28.32,200.138462,0.773
99,Liechtenstein,LIE,85.685,99.01,32.25,13.5,,,,,...,122.756614,23912.180928,,12.0,,,,33.89,236.28125,0.919
110,Marshall Islands,MHL,23.366,126.669998,47.777778,11.111111,,,,33.5,...,27.558648,103.353832,,9.090909,17.0,17.0,,0.72,322.516667,0.702
135,Palau,PLW,20.635,102.830002,9.347826,0.652174,,,,18.6,...,,224.605537,,12.5,28.0,28.0,,5.96,38.715217,0.822


Andorra, Dominica, St Kitts and Nevis, Liechtenstein, Marshall Islands and Palau are the countries where data is unavailable. Based on our own estimations, we are going to assign al countries a value in line with the median value. We do not see any reason why any of the countries would be seeing higher or lower than expected net migration.

In [18]:
df.loc[df['Country Name'] == 'Andorra', ['net_migration_2017']] = df['net_migration_2017'].median()
df.loc[df['Country Name'] == 'Dominica', ['net_migration_2017']] = df['net_migration_2017'].median()
df.loc[df['Country Name'] == 'St. Kitts and Nevis', ['net_migration_2017']] = df['net_migration_2017'].median()
df.loc[df['Country Name'] == 'Liechtenstein', ['net_migration_2017']] = df['net_migration_2017'].median()
df.loc[df['Country Name'] == 'Marshall Islands', ['net_migration_2017']] = df['net_migration_2017'].median()
df.loc[df['Country Name'] == 'Palau', ['net_migration_2017']] = df['net_migration_2017'].median()

#### 6. HIV Prevalence

In [19]:
df['hiv_prevalence_2017'].describe()

count    132.000000
mean       1.935606
std        4.489515
min        0.100000
25%        0.175000
50%        0.400000
75%        1.300000
max       28.500000
Name: hiv_prevalence_2017, dtype: float64

In [20]:
df[df['hiv_prevalence_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
3,Andorra,AND,11.85,102.830002,40.042553,1.744681,-4999.0,,,3.0,...,104.332411,3415.717495,,32.142857,,,,2.01,163.823404,0.867
7,Antigua and Barbuda,ATG,75.287,101.589996,20.454545,9.090909,0.0,,42.0,7.1,...,192.819567,890.751899,,11.111111,22.0,22.0,,5.2,216.875,0.772
9,Austria,AUT,41.906,98.519997,32.169413,16.10446,324998.0,,5.0,3.6,...,123.119296,7431.714636,0.757321,34.42623,21.0,21.0,2073.0,12850.61,106.611318,0.921
12,Belgium,BEL,2.039,101.910004,43.824306,27.608983,240000.0,,5.0,4.1,...,99.453035,8300.544045,0.883681,38.0,4.5,4.5,1001.0,16278.27,375.66572,0.93
15,Bangladesh,BGD,64.142,107.860001,70.579473,59.593839,-1847503.0,,173.0,34.2,...,94.526473,65.20946,1.240256,20.285714,19.5,19.5,61.0,2990.72,1226.745187,0.625
16,Bulgaria,BGR,25.331,106.230003,46.329403,32.139646,-24001.0,,10.0,7.3,...,120.140447,32308.325656,1.226244,23.75,23.0,23.0,202.0,2808.03,65.180057,0.813
17,Bahrain,BHR,10.814,122.360001,11.025641,2.051282,239000.0,,14.0,7.3,...,158.256809,359.419227,4.338775,7.5,8.0,9.0,8.0,259.34,1915.483333,0.852
19,Bosnia and Herzegovina,BIH,52.124,94.699997,43.515625,20.703125,-107926.0,,10.0,6.1,...,102.642379,1499.015078,0.864183,21.428571,80.0,80.0,87.0,724.8,65.459648,0.777
24,Barbados,BRB,68.841,103.260002,23.255814,16.27907,-397.0,,27.0,13.4,...,115.139118,768.615339,,16.666667,16.0,16.0,,48.84,665.648837,0.81
25,Brunei Darussalam,BRN,22.688,98.529999,2.732448,0.948767,0.0,,31.0,11.0,...,128.331366,1620.802816,2.86517,9.090909,12.0,13.0,8.0,214.24,80.546679,0.836


The countries above are the countries where data is unavailable. Based on our own estimations, we are going to assign all countries a value in line with the median value.

In [21]:
df['hiv_prevalence_2017'].fillna(df['hiv_prevalence_2017'].median(), inplace=True)

#### 7. Maternal Mortality Rate

In [22]:
df['mat_mortality_ratio_2017'].describe()

count     180.000000
mean      158.272222
std       229.808733
min         2.000000
25%        12.000000
50%        52.500000
75%       188.250000
max      1150.000000
Name: mat_mortality_ratio_2017, dtype: float64

In [23]:
df[df['mat_mortality_ratio_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
3,Andorra,AND,11.85,102.830002,40.042553,1.744681,-4999.0,0.4,,3.0,...,104.332411,3415.717495,,32.142857,,,,2.01,163.823404,0.867
45,Dominica,DMA,29.819,99.660004,33.333333,8.0,-4999.0,0.4,,33.5,...,105.278625,6968.933669,,25.0,12.0,12.0,,12.5,95.28,0.738
70,"Hong Kong SAR, China",HKG,0.0,98.709999,4.857143,2.952381,146542.0,0.4,,,...,251.765006,10482.740897,,,1.5,1.5,324.0,,7041.142857,0.946
91,St. Kitts and Nevis,KNA,69.227,94.040001,23.076923,19.230769,-4999.0,0.4,,16.2,...,147.714478,3113.229303,,13.333333,18.5,18.5,,28.32,200.138462,0.773
99,Liechtenstein,LIE,85.685,99.01,32.25,13.5,-4999.0,0.4,,,...,122.756614,23912.180928,,12.0,,,,33.89,236.28125,0.919
110,Marshall Islands,MHL,23.366,126.669998,47.777778,11.111111,-4999.0,0.4,,33.5,...,27.558648,103.353832,,9.090909,17.0,17.0,,0.72,322.516667,0.702
135,Palau,PLW,20.635,102.830002,9.347826,0.652174,-4999.0,0.4,,18.6,...,,224.605537,,12.5,28.0,28.0,,5.96,38.715217,0.822


The countries above are the set of countries where data is unavailable. Based on our own estimations, we are going to assign Dominica, St Kitts and Nevis, Marshall Islands and Palau values in line with the median value; while Andorra, Hong Kong and Liechtenstein will be assigned a values in line with the 25% value.

In [24]:
df['mat_mortality_ratio_2017'].fillna(df['mat_mortality_ratio_2017'].median(), inplace=True)
df.loc[df['Country Name'] == 'Andorra', ['mat_mortality_ratio_2017']] = df['mat_mortality_ratio_2017'].quantile(0.25)
df.loc[df['Country Name'] == 'Hong Kong SAR, China', ['mat_mortality_ratio_2017']] = df['mat_mortality_ratio_2017'].quantile(0.25)
df.loc[df['Country Name'] == 'Liechtenstein', ['mat_mortality_ratio_2017']] = df['mat_mortality_ratio_2017'].quantile(0.25)

#### 8. Mortality Rate for Under 5 year olds

In [25]:
df['under5_mortality_ratio_2017'].describe()

count    185.000000
mean      29.263243
std       29.366660
min        2.100000
25%        7.200000
50%       16.800000
75%       45.900000
max      122.500000
Name: under5_mortality_ratio_2017, dtype: float64

In [26]:
df[df['under5_mortality_ratio_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
70,"Hong Kong SAR, China",HKG,0.0,98.709999,4.857143,2.952381,146542.0,0.4,13.25,,...,251.765006,10482.740897,,,1.5,1.5,324.0,,7041.142857,0.946
99,Liechtenstein,LIE,85.685,99.01,32.25,13.5,-4999.0,0.4,13.125,,...,122.756614,23912.180928,,12.0,,,,33.89,236.28125,0.919


Hong Kong and Liechtenstein are the two countries where data is unavailable. Based on our own estimations, we are going to assign both countries a value in line with the 25th percentile value.

In [27]:
df.loc[df['Country Name'] == 'Hong Kong SAR, China', ['under5_mortality_ratio_2017']] = df['under5_mortality_ratio_2017'].quantile(0.25)
df.loc[df['Country Name'] == 'Liechtenstein', ['under5_mortality_ratio_2017']] = df['under5_mortality_ratio_2017'].quantile(0.25)

#### 9. Incidence of Tuberculosis

In [28]:
df['tubercul_incidence_2017'].describe()

count    186.000000
mean     112.058387
std      147.450609
min        0.000000
25%       12.000000
50%       46.500000
75%      156.000000
max      738.000000
Name: tubercul_incidence_2017, dtype: float64

In [29]:
df[df['tubercul_incidence_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
99,Liechtenstein,LIE,85.685,99.01,32.25,13.5,-4999.0,0.4,13.125,7.2,...,122.756614,23912.180928,,12.0,,,,33.89,236.28125,0.919


Liechtenstein is the countries where data is unavailable. Based on our own estimations, we are going to assign Liechtenstein a value in line with the 25th percentile value.

In [30]:
df.loc[df['Country Name'] == 'Liechtenstein', ['tubercul_incidence_2017']] = df['tubercul_incidence_2017'].quantile(0.25)

#### 9. Access to Electricity

In [31]:
df['elec_access_2017'].describe()

count    186.000000
mean      41.836919
std       23.193779
min        0.000000
25%       22.436000
50%       41.837500
75%       60.423250
max       87.294000
Name: elec_access_2017, dtype: float64

In [32]:
df[df['elec_access_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
50,Eritrea,ERI,60.42325,101.370003,75.168317,6.831683,-199290.0,0.6,480.0,43.3,...,20.363955,0.0,,22.0,84.0,84.0,,13.65,33.79103,0.456


Eritrea is the only country where data is unavailable. Based on geographic peers and our own estimations, we are going to assign Eritrea a value in line with the 25% value.

In [33]:
df.loc[df['Country Name'] == 'Eritrea', ['elec_access_2017']] = df['elec_access_2017'].quantile(0.25)

#### 11. Carbon Dioxide Emissions

In [34]:
df['co2_emissions_2017'].describe()

count    185.000000
mean       4.264187
std        4.920289
min        0.035013
25%        0.791246
50%        2.620384
75%        5.921351
max       32.127990
Name: co2_emissions_2017, dtype: float64

In [35]:
df[df['co2_emissions_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
70,"Hong Kong SAR, China",HKG,0.0,98.709999,4.857143,2.952381,146542.0,0.4,13.25,7.2,...,251.765006,10482.740897,,,1.5,1.5,324.0,,7041.142857,0.946
140,West Bank and Gaza,PSE,24.106,105.870003,22.643333,4.129606,-52816.0,0.4,27.0,18.1,...,84.179796,547.723189,,,43.0,44.0,,324.96,740.000831,0.708


Hong Kong and West Bank and Gaza are the two countries where data is unavailable. Based on our own estimations, we are going to assign both countries a value in line with the 75th percentile value.

In [36]:
df.loc[df['Country Name'] == 'Hong Kong SAR, China', ['co2_emissions_2017']] = df['co2_emissions_2017'].quantile(0.75)
df.loc[df['Country Name'] == 'West Bank and Gaza', ['co2_emissions_2017']] = df['co2_emissions_2017'].quantile(0.75)

#### 12. Air Polution

In [37]:
df['pop_air_pollution_2017'].describe()

count    183.000000
mean      27.281160
std       18.581114
min        5.861331
25%       14.183685
50%       21.767721
75%       35.644394
max       99.734374
Name: pop_air_pollution_2017, dtype: float64

In [38]:
df[df['pop_air_pollution_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
70,"Hong Kong SAR, China",HKG,0.0,98.709999,4.857143,2.952381,146542.0,0.4,13.25,7.2,...,251.765006,10482.740897,,,1.5,1.5,324.0,,7041.142857,0.946
91,St. Kitts and Nevis,KNA,69.227,94.040001,23.076923,19.230769,-4999.0,0.4,52.5,16.2,...,147.714478,3113.229303,,13.333333,18.5,18.5,,28.32,200.138462,0.773
99,Liechtenstein,LIE,85.685,99.01,32.25,13.5,-4999.0,0.4,13.125,7.2,...,122.756614,23912.180928,,12.0,,,,33.89,236.28125,0.919
135,Palau,PLW,20.635,102.830002,9.347826,0.652174,-4999.0,0.4,52.5,18.6,...,,224.605537,,12.5,28.0,28.0,,5.96,38.715217,0.822


The countries above are those countries where data is unavailable. Based on our own estimations, we are going to assign all countries a value in line with the 25th percentile value.

In [39]:
df['pop_air_pollution_2017'].fillna(df['pop_air_pollution_2017'].quantile(.25), inplace=True)

#### 13. Foreign Direct Investment

In [40]:
df['foreign_dir_inv_2017'].describe()

count    1.830000e+02
mean     1.171300e+10
std      4.004695e+10
min     -3.728884e+10
25%      1.641025e+08
50%      9.983445e+08
75%      4.321268e+09
max      3.808230e+11
Name: foreign_dir_inv_2017, dtype: float64

In [41]:
df[df['foreign_dir_inv_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
3,Andorra,AND,11.85,102.830002,40.042553,1.744681,-4999.0,0.4,13.5,3.0,...,104.332411,3415.717495,,32.142857,,,,2.01,163.823404,0.867
40,Cuba,CUB,23.023,103.900002,60.693642,28.021195,-72000.0,0.4,36.0,5.4,...,40.688585,8.466165,2.872403,48.856209,,,29.0,1117.6,109.241378,0.781
57,"Micronesia, Fed. Sts.",FSM,77.392,101.050003,31.428571,2.857143,-2999.0,0.4,88.0,27.0,...,20.73767,161.491463,,0.0,16.0,16.0,,6.58,159.23,0.618
161,Syrian Arab Republic,SYR,46.5,107.620003,75.810053,25.388008,-2136954.0,0.1,31.0,23.5,...,89.999355,13.278217,,13.2,15.0,16.0,120.0,263.85,93.098453,0.563


The countries above are those countries where data is unavailable. Based on our own estimations, we are going to assign all countries a value in line with the 25th percentile value.

In [42]:
df['foreign_dir_inv_2017'].fillna(df['foreign_dir_inv_2017'].quantile(.25), inplace=True)

#### 14. ATM Access

In [43]:
df['atm_access_2017'].describe()

count    170.000000
mean      49.053235
std       44.216273
min        0.510000
25%       12.017500
50%       39.555000
75%       67.512500
max      272.590000
Name: atm_access_2017, dtype: float64

In [44]:
df[df['atm_access_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
3,Andorra,AND,11.85,102.830002,40.042553,1.744681,-4999.0,0.4,13.5,3.0,...,104.332411,3415.717495,,32.142857,,,,2.01,163.823404,0.867
11,Burundi,BDI,87.294,111.919998,79.166667,46.728972,10003.0,1.2,548.0,61.5,...,54.683676,4.61808,1.870652,36.363636,4.0,4.0,3.0,13.19,421.612539,0.431
17,Bahrain,BHR,10.814,122.360001,11.025641,2.051282,239000.0,0.4,14.0,7.3,...,158.256809,359.419227,4.338775,7.5,8.0,9.0,8.0,259.34,1915.483333,0.852
40,Cuba,CUB,23.023,103.900002,60.693642,28.021195,-72000.0,0.4,36.0,5.4,...,40.688585,8.466165,2.872403,48.856209,,,29.0,1117.6,109.241378,0.781
50,Eritrea,ERI,60.42325,101.370003,75.168317,6.831683,-199290.0,0.6,480.0,43.3,...,20.363955,0.0,,22.0,84.0,84.0,,13.65,33.79103,0.456
53,Ethiopia,ETH,79.69,105.970001,33.242509,14.141046,150002.0,1.0,401.0,55.9,...,37.218072,0.714286,0.666369,38.756856,33.0,33.0,14.0,1657.99,94.21926,0.478
58,Gabon,GAB,11.024,103.809998,8.541002,1.261303,16301.0,3.4,252.0,45.5,...,128.981661,28.089724,1.81049,17.094017,33.0,33.0,,62.92,8.013397,0.697
90,Kiribati,KIR,46.738,101.050003,41.975309,2.469136,-3999.0,0.4,92.0,54.5,...,40.402775,26.280518,,6.521739,31.0,31.0,,2.54,140.92963,0.628
99,Liechtenstein,LIE,85.685,99.01,32.25,13.5,-4999.0,0.4,13.125,7.2,...,122.756614,23912.180928,,12.0,,,,33.89,236.28125,0.919
100,Sri Lanka,LKA,81.616,83.18,44.596319,21.33036,-489932.0,0.1,36.0,7.8,...,133.467627,305.21358,2.119697,5.777778,9.0,9.0,277.0,1181.27,346.468906,0.779


The countries above are those countries where data is unavailable. Based on our own estimations, we are going to assign all countries a value in line with the 25th percentile value. However, the following countries will have values in line with the 75% value: Andorra, Liechtenstein and the United States.

In [45]:
df['atm_access_2017'].fillna(df['atm_access_2017'].quantile(.25), inplace=True)
df.loc[df['Country Name'] == 'Andorra', ['atm_access_2017']] = df['atm_access_2017'].quantile(0.75)
df.loc[df['Country Name'] == 'Liechtenstein', ['atm_access_2017']] = df['atm_access_2017'].quantile(0.75)
df.loc[df['Country Name'] == 'United States', ['atm_access_2017']] = df['atm_access_2017'].quantile(0.75)

#### 15. Adolescent Fertility Rate

In [46]:
df['adol_fertility_rate_2017'].describe()

count    181.000000
mean      48.117586
std       40.590363
min        1.379000
25%       13.177000
50%       39.860000
75%       69.459000
max      186.538000
Name: adol_fertility_rate_2017, dtype: float64

In [47]:
df[df['adol_fertility_rate_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
3,Andorra,AND,11.85,102.830002,40.042553,1.744681,-4999.0,0.4,13.5,3.0,...,104.332411,3415.717495,,32.142857,,,,2.01,163.823404,0.867
45,Dominica,DMA,29.819,99.660004,33.333333,8.0,-4999.0,0.4,52.5,33.5,...,105.278625,6968.933669,,25.0,12.0,12.0,,12.5,95.28,0.738
91,St. Kitts and Nevis,KNA,69.227,94.040001,23.076923,19.230769,-4999.0,0.4,52.5,16.2,...,147.714478,3113.229303,,13.333333,18.5,18.5,,28.32,200.138462,0.773
99,Liechtenstein,LIE,85.685,99.01,32.25,13.5,-4999.0,0.4,13.125,7.2,...,122.756614,23912.180928,,12.0,,,,33.89,236.28125,0.919
110,Marshall Islands,MHL,23.366,126.669998,47.777778,11.111111,-4999.0,0.4,52.5,33.5,...,27.558648,103.353832,,9.090909,17.0,17.0,,0.72,322.516667,0.702
135,Palau,PLW,20.635,102.830002,9.347826,0.652174,-4999.0,0.4,52.5,18.6,...,,224.605537,,12.5,28.0,28.0,,5.96,38.715217,0.822


The countries above are those countries where data is unavailable. Based on our own estimations, we are going to assign all countries a value in line with median. However, the following countries will have values in line with the 25% value: Andorra and Liechtenstein

In [48]:
df['adol_fertility_rate_2017'].fillna(df['adol_fertility_rate_2017'].median(), inplace=True)
df.loc[df['Country Name'] == 'Andorra', ['adol_fertility_rate_2017']] = df['adol_fertility_rate_2017'].quantile(0.25)
df.loc[df['Country Name'] == 'Liechtenstein', ['adol_fertility_rate_2017']] = df['adol_fertility_rate_2017'].quantile(0.25)

#### 16. Female Labor Participation

In [49]:
df['fem_labor_part_rate_2017'].describe()

count    176.000000
mean      51.506790
std       15.547539
min        6.095000
25%       44.714500
50%       52.663500
75%       60.628500
max       83.930000
Name: fem_labor_part_rate_2017, dtype: float64

In [50]:
df[df['fem_labor_part_rate_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
3,Andorra,AND,11.85,102.830002,40.042553,1.744681,-4999.0,0.4,13.5,3.0,...,104.332411,3415.717495,,32.142857,,,,2.01,163.823404,0.867
7,Antigua and Barbuda,ATG,75.287,101.589996,20.454545,9.090909,0.0,0.4,42.0,7.1,...,192.819567,890.751899,,11.111111,22.0,22.0,,5.2,216.875,0.772
45,Dominica,DMA,29.819,99.660004,33.333333,8.0,-4999.0,0.4,52.5,33.5,...,105.278625,6968.933669,,25.0,12.0,12.0,,12.5,95.28,0.738
57,"Micronesia, Fed. Sts.",FSM,77.392,101.050003,31.428571,2.857143,-2999.0,0.4,88.0,27.0,...,20.73767,161.491463,,0.0,16.0,16.0,,6.58,159.23,0.618
67,Grenada,GRD,63.836,100.709999,23.529412,8.823529,-1000.0,0.4,25.0,16.6,...,102.077133,234.500424,,33.333333,15.0,15.0,,53.06,326.1,0.773
90,Kiribati,KIR,46.738,101.050003,41.975309,2.469136,-3999.0,0.4,92.0,54.5,...,40.402775,26.280518,,6.521739,31.0,31.0,,2.54,140.92963,0.628
91,St. Kitts and Nevis,KNA,69.227,94.040001,23.076923,19.230769,-4999.0,0.4,52.5,16.2,...,147.714478,3113.229303,,13.333333,18.5,18.5,,28.32,200.138462,0.773
99,Liechtenstein,LIE,85.685,99.01,32.25,13.5,-4999.0,0.4,13.125,7.2,...,122.756614,23912.180928,,12.0,,,,33.89,236.28125,0.919
110,Marshall Islands,MHL,23.366,126.669998,47.777778,11.111111,-4999.0,0.4,52.5,33.5,...,27.558648,103.353832,,9.090909,17.0,17.0,,0.72,322.516667,0.702
135,Palau,PLW,20.635,102.830002,9.347826,0.652174,-4999.0,0.4,52.5,18.6,...,,224.605537,,12.5,28.0,28.0,,5.96,38.715217,0.822


The countries above are those countries where data is unavailable. Based on our own estimations, we are going to assign all countries a value in line with median.

In [51]:
df['fem_labor_part_rate_2017'].fillna(df['fem_labor_part_rate_2017'].median(), inplace=True)

#### 17. Male Labor Participation

In [52]:
df['male_labor_part_rate_2017'].describe()

count    176.000000
mean      71.394449
std        9.121060
min       45.301998
25%       65.920498
50%       71.307499
75%       77.194248
max       95.829002
Name: male_labor_part_rate_2017, dtype: float64

In [53]:
df[df['male_labor_part_rate_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
3,Andorra,AND,11.85,102.830002,40.042553,1.744681,-4999.0,0.4,13.5,3.0,...,104.332411,3415.717495,,32.142857,,,,2.01,163.823404,0.867
7,Antigua and Barbuda,ATG,75.287,101.589996,20.454545,9.090909,0.0,0.4,42.0,7.1,...,192.819567,890.751899,,11.111111,22.0,22.0,,5.2,216.875,0.772
45,Dominica,DMA,29.819,99.660004,33.333333,8.0,-4999.0,0.4,52.5,33.5,...,105.278625,6968.933669,,25.0,12.0,12.0,,12.5,95.28,0.738
57,"Micronesia, Fed. Sts.",FSM,77.392,101.050003,31.428571,2.857143,-2999.0,0.4,88.0,27.0,...,20.73767,161.491463,,0.0,16.0,16.0,,6.58,159.23,0.618
67,Grenada,GRD,63.836,100.709999,23.529412,8.823529,-1000.0,0.4,25.0,16.6,...,102.077133,234.500424,,33.333333,15.0,15.0,,53.06,326.1,0.773
90,Kiribati,KIR,46.738,101.050003,41.975309,2.469136,-3999.0,0.4,92.0,54.5,...,40.402775,26.280518,,6.521739,31.0,31.0,,2.54,140.92963,0.628
91,St. Kitts and Nevis,KNA,69.227,94.040001,23.076923,19.230769,-4999.0,0.4,52.5,16.2,...,147.714478,3113.229303,,13.333333,18.5,18.5,,28.32,200.138462,0.773
99,Liechtenstein,LIE,85.685,99.01,32.25,13.5,-4999.0,0.4,13.125,7.2,...,122.756614,23912.180928,,12.0,,,,33.89,236.28125,0.919
110,Marshall Islands,MHL,23.366,126.669998,47.777778,11.111111,-4999.0,0.4,52.5,33.5,...,27.558648,103.353832,,9.090909,17.0,17.0,,0.72,322.516667,0.702
135,Palau,PLW,20.635,102.830002,9.347826,0.652174,-4999.0,0.4,52.5,18.6,...,,224.605537,,12.5,28.0,28.0,,5.96,38.715217,0.822


The countries above are those countries where data is unavailable. Based on our own estimations, we are going to assign all countries a value in line with median.

In [54]:
df['male_labor_part_rate_2017'].fillna(df['male_labor_part_rate_2017'].median(), inplace=True)

#### 18. Fertility Rate

In [55]:
df['fertility_rate_2017'].describe()

count    182.000000
mean       2.727431
std        1.293386
min        1.052000
25%        1.695000
50%        2.276000
75%        3.682750
max        7.001000
Name: fertility_rate_2017, dtype: float64

In [56]:
df[df['fertility_rate_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
3,Andorra,AND,11.85,102.830002,40.042553,1.744681,-4999.0,0.4,13.5,3.0,...,104.332411,3415.717495,,32.142857,,,,2.01,163.823404,0.867
45,Dominica,DMA,29.819,99.660004,33.333333,8.0,-4999.0,0.4,52.5,33.5,...,105.278625,6968.933669,,25.0,12.0,12.0,,12.5,95.28,0.738
91,St. Kitts and Nevis,KNA,69.227,94.040001,23.076923,19.230769,-4999.0,0.4,52.5,16.2,...,147.714478,3113.229303,,13.333333,18.5,18.5,,28.32,200.138462,0.773
110,Marshall Islands,MHL,23.366,126.669998,47.777778,11.111111,-4999.0,0.4,52.5,33.5,...,27.558648,103.353832,,9.090909,17.0,17.0,,0.72,322.516667,0.702
135,Palau,PLW,20.635,102.830002,9.347826,0.652174,-4999.0,0.4,52.5,18.6,...,,224.605537,,12.5,28.0,28.0,,5.96,38.715217,0.822


The countries above are those countries where data is unavailable. Based on our own estimations, we are going to assign all countries a value in line with median.

In [57]:
df['fertility_rate_2017'].fillna(df['fertility_rate_2017'].median(), inplace=True)

#### 19. Diphtheria, Tetanus, and Pertussis (DPT) Immunization Rate

In [58]:
df['dpt_immuniz_rate_2017'].describe()

count    185.000000
mean      88.000000
std       13.035904
min       36.000000
25%       84.000000
50%       92.000000
75%       97.000000
max       99.000000
Name: dpt_immuniz_rate_2017, dtype: float64

In [59]:
df[df['dpt_immuniz_rate_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
70,"Hong Kong SAR, China",HKG,0.0,98.709999,4.857143,2.952381,146542.0,0.4,13.25,7.2,...,251.765006,10482.740897,,,1.5,1.5,324.0,,7041.142857,0.946
99,Liechtenstein,LIE,85.685,99.01,32.25,13.5,-4999.0,0.4,13.125,7.2,...,122.756614,23912.180928,,12.0,,,,33.89,236.28125,0.919


The countries above are those countries where data is unavailable. Based on our own estimations, we are going to assign all countries a value in line with the 75% value.

In [60]:
df['dpt_immuniz_rate_2017'].fillna(df['dpt_immuniz_rate_2017'].quantile(0.75), inplace=True)

#### 20. Undernourishment Rate

In [61]:
df['undernourished_rate_2017'].describe()

count    154.000000
mean       9.501299
std       10.354520
min        2.500000
25%        2.500000
50%        5.600000
75%       11.875000
max       48.000000
Name: undernourished_rate_2017, dtype: float64

In [62]:
df[df['undernourished_rate_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
3,Andorra,AND,11.85,102.830002,40.042553,1.744681,-4999.0,0.4,13.5,3.0,...,104.332411,3415.717495,,32.142857,,,,2.01,163.823404,0.867
7,Antigua and Barbuda,ATG,75.287,101.589996,20.454545,9.090909,0.0,0.4,42.0,7.1,...,192.819567,890.751899,,11.111111,22.0,22.0,,5.2,216.875,0.772
11,Burundi,BDI,87.294,111.919998,79.166667,46.728972,10003.0,1.2,548.0,61.5,...,54.683676,4.61808,1.870652,36.363636,4.0,4.0,3.0,13.19,421.612539,0.431
17,Bahrain,BHR,10.814,122.360001,11.025641,2.051282,239000.0,0.4,14.0,7.3,...,158.256809,359.419227,4.338775,7.5,8.0,9.0,8.0,259.34,1915.483333,0.852
18,"Bahamas, The",BHS,17.075,100.32,1.398601,0.799201,4999.0,1.3,70.0,13.3,...,92.609134,1508.844817,,12.820513,21.5,21.5,4.0,20.4,38.136763,0.812
26,Bhutan,BTN,59.833,105.489998,13.449035,2.477454,1600.0,0.2,183.0,30.7,...,97.996145,118.031608,,8.510638,12.0,12.0,,44.3,19.54601,0.649
38,Comoros,COM,71.216,99.139999,70.392262,35.464804,-10000.0,0.1,273.0,67.9,...,57.613787,4.914669,,6.060606,16.0,16.0,,3.84,437.34014,0.552
50,Eritrea,ERI,60.42325,101.370003,75.168317,6.831683,-199290.0,0.6,480.0,43.3,...,20.363955,0.0,,22.0,84.0,84.0,,13.65,33.79103,0.456
57,"Micronesia, Fed. Sts.",FSM,77.392,101.050003,31.428571,2.857143,-2999.0,0.4,88.0,27.0,...,20.73767,161.491463,,0.0,16.0,16.0,,6.58,159.23,0.618
62,Guinea,GIN,64.207,110.610001,59.010256,12.615986,-20000.0,1.5,576.0,102.6,...,96.954478,1.574475,1.768123,21.929825,15.0,15.0,,38.02,49.110842,0.473


The countries above are those countries where data is unavailable. Based on our own estimations, we are going to assign the values to the countries laid out below:
- 25% Value:
    - Andorra
    - Liechtenstein
- 75% Value:
    - Zimbabwe
    - Zambia
    - Burundi
    - Eritrea
    - Guinea
    - Guinea-Bissau
    - South Sudan
    - Libya
- Median Value:
    - All Others

In [63]:
df['undernourished_rate_2017'].fillna(df['undernourished_rate_2017'].median(), inplace=True)
df.loc[df['Country Name'] == 'Andorra', ['undernourished_rate_2017']] = df['undernourished_rate_2017'].quantile(0.25)
df.loc[df['Country Name'] == 'Liechtenstein', ['undernourished_rate_2017']] = df['undernourished_rate_2017'].quantile(0.25)
df.loc[df['Country Name'] == 'Zimbabwe', ['undernourished_rate_2017']] = df['undernourished_rate_2017'].quantile(0.75)
df.loc[df['Country Name'] == 'Zambia', ['undernourished_rate_2017']] = df['undernourished_rate_2017'].quantile(0.75)
df.loc[df['Country Name'] == 'Burundi', ['undernourished_rate_2017']] = df['undernourished_rate_2017'].quantile(0.75)
df.loc[df['Country Name'] == 'Eritrea', ['undernourished_rate_2017']] = df['undernourished_rate_2017'].quantile(0.75)
df.loc[df['Country Name'] == 'Guinea', ['undernourished_rate_2017']] = df['undernourished_rate_2017'].quantile(0.75)
df.loc[df['Country Name'] == 'Guinea-Bissau', ['undernourished_rate_2017']] = df['undernourished_rate_2017'].quantile(0.75)
df.loc[df['Country Name'] == 'South Sudan', ['undernourished_rate_2017']] = df['undernourished_rate_2017'].quantile(0.75)
df.loc[df['Country Name'] == 'Libya', ['undernourished_rate_2017']] = df['undernourished_rate_2017'].quantile(0.75)

#### 21. Cell Subscription Rate

In [64]:
df['cell_subscriptions_per100'].describe()

count    186.000000
mean     107.016056
std       37.756422
min       20.363955
25%       84.448136
50%      109.761144
75%      129.679498
max      251.765006
Name: cell_subscriptions_per100, dtype: float64

In [65]:
df[df['cell_subscriptions_per100'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
135,Palau,PLW,20.635,102.830002,9.347826,0.652174,-4999.0,0.4,52.5,18.6,...,,224.605537,,12.5,28.0,28.0,,5.96,38.715217,0.822


Palau is the only country where data is unavailable. Based on geographic peers and our own estimations, we are going to assign Palau a value in line with the 25% value.

In [66]:
df.loc[df['Country Name'] == 'Palau', ['cell_subscriptions_per100']] = df['cell_subscriptions_per100'].quantile(0.25)

#### 22. Military Expenditure as a % of GDP

In [67]:
df['military_exp_2017'].describe()

count    150.000000
mean       1.843933
std        1.476371
min        0.000000
25%        0.983364
50%        1.430273
75%        2.138151
max       10.223848
Name: military_exp_2017, dtype: float64

In [68]:
df[df['military_exp_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
3,Andorra,AND,11.85,102.830002,40.042553,1.744681,-4999.0,0.4,13.5,3.0,...,104.332411,3415.717495,,32.142857,,,,2.01,163.823404,0.867
4,United Arab Emirates,ARE,13.752,104.18,5.404112,0.626584,200000.0,0.1,3.0,7.2,...,208.9786,1285.099111,,22.5,8.0,9.0,63.0,2899.67,133.58499,0.889
7,Antigua and Barbuda,ATG,75.287,101.589996,20.454545,9.090909,0.0,0.4,42.0,7.1,...,192.819567,890.751899,,11.111111,22.0,22.0,,5.2,216.875,0.772
18,"Bahamas, The",BHS,17.075,100.32,1.398601,0.799201,4999.0,1.3,70.0,13.3,...,92.609134,1508.844817,,12.820513,21.5,21.5,4.0,20.4,38.136763,0.812
24,Barbados,BRB,68.841,103.260002,23.255814,16.27907,-397.0,0.4,27.0,13.4,...,115.139118,768.615339,,16.666667,16.0,16.0,,48.84,665.648837,0.81
26,Bhutan,BTN,59.833,105.489998,13.449035,2.477454,1600.0,0.2,183.0,30.7,...,97.996145,118.031608,,8.510638,12.0,12.0,,44.3,19.54601,0.649
38,Comoros,COM,71.216,99.139999,70.392262,35.464804,-10000.0,0.1,273.0,67.9,...,57.613787,4.914669,,6.060606,16.0,16.0,,3.84,437.34014,0.552
44,Djibouti,DJI,22.352,122.849998,73.425367,0.086281,4501.0,1.0,248.0,61.8,...,39.514076,22.243406,,10.769231,14.0,14.0,,5.13,40.729077,0.518
45,Dominica,DMA,29.819,99.660004,33.333333,8.0,-4999.0,0.4,52.5,33.5,...,105.278625,6968.933669,,25.0,12.0,12.0,,12.5,95.28,0.738
50,Eritrea,ERI,60.42325,101.370003,75.168317,6.831683,-199290.0,0.6,480.0,43.3,...,20.363955,0.0,,22.0,84.0,84.0,,13.65,33.79103,0.456


The countries above are those countries where data is unavailable. Based on our own estimations, we are going to assign all countries a value in line with the median.

In [69]:
df['military_exp_2017'].fillna(df['military_exp_2017'].median(), inplace=True)

#### 23. Proportion of seats held by women in National Parliaments

In [70]:
df['women_seats_percent_2017'].describe()

count    182.000000
mean      21.489788
std       11.827659
min        0.000000
25%       12.073286
50%       20.101010
75%       30.500960
max       61.250000
Name: women_seats_percent_2017, dtype: float64

In [71]:
df[df['women_seats_percent_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
31,Chile,CHL,12.51,99.110001,21.192094,1.724203,558539.0,0.5,13.0,7.4,...,124.594478,7258.193973,1.935829,,7.5,7.5,425.0,6791.32,24.84148,0.849
65,Equatorial Guinea,GNQ,28.354,106.150002,10.124777,4.278075,79998.0,7.1,301.0,86.9,...,44.858328,0.0,1.100369,,33.0,33.0,,3.33,44.991373,0.582
70,"Hong Kong SAR, China",HKG,0.0,98.709999,4.857143,2.952381,146542.0,0.4,13.25,7.2,...,251.765006,10482.740897,1.430273,,1.5,1.5,324.0,,7041.142857,0.946
71,Honduras,HND,43.543,105.900002,29.94012,9.116096,-34000.0,0.3,65.0,18.0,...,87.3209,79.541704,1.724405,,42.0,42.0,4.0,43.62,84.270408,0.633
140,West Bank and Gaza,PSE,24.106,105.870003,22.643333,4.129606,-52816.0,0.4,27.0,18.1,...,84.179796,547.723189,1.430273,,43.0,44.0,,324.96,740.000831,0.708


The countries above are those countries where data is unavailable. Based on our own estimations, we are going to assign all countries a value in line with the median.

In [72]:
df['women_seats_percent_2017'].fillna(df['women_seats_percent_2017'].median(), inplace=True)

#### 24. Male-Owned Business Starts

In [73]:
df['male_bus_start_2017'].describe()

count    183.000000
mean      21.510137
std       26.361012
min        0.500000
25%        8.000000
50%       13.000000
75%       24.950000
max      230.000000
Name: male_bus_start_2017, dtype: float64

In [74]:
df[df['male_bus_start_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
3,Andorra,AND,11.85,102.830002,40.042553,1.744681,-4999.0,0.4,13.5,3.0,...,104.332411,3415.717495,1.430273,32.142857,,,,2.01,163.823404,0.867
40,Cuba,CUB,23.023,103.900002,60.693642,28.021195,-72000.0,0.4,36.0,5.4,...,40.688585,8.466165,2.872403,48.856209,,,29.0,1117.6,109.241378,0.781
99,Liechtenstein,LIE,85.685,99.01,32.25,13.5,-4999.0,0.4,13.125,7.2,...,122.756614,23912.180928,1.430273,12.0,,,,33.89,236.28125,0.919
166,Turkmenistan,TKM,48.847,96.129997,72.006469,4.128274,-25001.0,0.4,7.0,42.9,...,162.861103,4.515718,1.430273,25.806452,,,,7.93,12.25218,0.71


The countries above are those countries where data is unavailable. Based on our own estimations, we are going to assign all countries a value in line with the median.

In [75]:
df['male_bus_start_2017'].fillna(df['male_bus_start_2017'].median(), inplace=True)

#### 24. Female-Owned Business Starts

In [76]:
df['female_bus_start_2017'].describe()

count    183.000000
mean      21.630355
std       26.363504
min        0.500000
25%        8.000000
50%       13.000000
75%       24.950000
max      230.000000
Name: female_bus_start_2017, dtype: float64

In [77]:
df[df['female_bus_start_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
3,Andorra,AND,11.85,102.830002,40.042553,1.744681,-4999.0,0.4,13.5,3.0,...,104.332411,3415.717495,1.430273,32.142857,13.0,,,2.01,163.823404,0.867
40,Cuba,CUB,23.023,103.900002,60.693642,28.021195,-72000.0,0.4,36.0,5.4,...,40.688585,8.466165,2.872403,48.856209,13.0,,29.0,1117.6,109.241378,0.781
99,Liechtenstein,LIE,85.685,99.01,32.25,13.5,-4999.0,0.4,13.125,7.2,...,122.756614,23912.180928,1.430273,12.0,13.0,,,33.89,236.28125,0.919
166,Turkmenistan,TKM,48.847,96.129997,72.006469,4.128274,-25001.0,0.4,7.0,42.9,...,162.861103,4.515718,1.430273,25.806452,13.0,,,7.93,12.25218,0.71


The countries above are those countries where data is unavailable. Based on our own estimations, we are going to assign all countries a value in line with the median.

In [78]:
df['female_bus_start_2017'].fillna(df['female_bus_start_2017'].median(), inplace=True)

#### 25. Patent Applications

In [79]:
df['patent_apps_2017'].describe()

count    1.150000e+02
mean     1.879897e+04
std      1.219180e+05
min      1.000000e+00
25%      2.350000e+01
50%      1.720000e+02
75%      1.159000e+03
max      1.245709e+06
Name: patent_apps_2017, dtype: float64

In [80]:
df[df['patent_apps_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
0,Afghanistan,AFG,74.750,99.169998,58.067580,11.792727,-314602.0,0.1,638.0,64.8,...,65.929134,43.448181,0.945227,27.710843,8.0,9.0,,91.89,55.595550,0.509
1,Angola,AGO,35.161,103.419998,45.237371,3.930376,32066.0,1.9,241.0,80.6,...,44.686107,10.799292,2.507985,30.454545,36.0,36.0,,55.50,23.916555,0.582
3,Andorra,AND,11.850,102.830002,40.042553,1.744681,-4999.0,0.4,13.5,3.0,...,104.332411,3415.717495,1.430273,32.142857,13.0,13.0,,2.01,163.823404,0.867
7,Antigua and Barbuda,ATG,75.287,101.589996,20.454545,9.090909,0.0,0.4,42.0,7.1,...,192.819567,890.751899,1.430273,11.111111,22.0,22.0,,5.20,216.875000,0.772
13,Benin,BEN,53.232,105.330002,35.030153,24.831501,-10000.0,1.0,397.0,93.2,...,78.504596,4.205744,0.920613,7.228916,8.0,9.0,,209.42,99.105995,0.541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,Tanzania,TZA,66.947,96.970001,44.761797,15.240461,-200381.0,5.0,524.0,54.1,...,73.094790,22.703845,1.055796,37.179487,29.0,29.0,,610.37,61.707321,0.524
178,St. Vincent and the Grenadines,VCT,48.216,100.930000,17.948718,5.128205,-1000.0,0.4,68.0,15.6,...,105.478616,163.895617,1.430273,13.043478,10.0,10.0,,3.71,281.605128,0.736
181,Vanuatu,VUT,74.837,98.970001,15.340443,1.640689,600.0,0.4,72.0,26.9,...,79.862702,178.634601,1.430273,0.000000,18.0,18.0,,10.46,23.420755,0.603
182,Samoa,WSM,81.548,102.620003,26.749117,11.484099,-14013.0,0.4,43.0,18.1,...,63.583173,296.890836,1.430273,10.000000,9.0,9.0,,10.22,69.031095,0.709


The countries above are those countries where data is unavailable. Based on our own estimations, we are going to assign all countries a value in line with the 25% value.

In [81]:
df['patent_apps_2017'].fillna(df['patent_apps_2017'].quantile(.25), inplace=True)

#### 26. Scientific Articles

In [82]:
df['sci_articles_2017'].describe()

count       186.000000
mean      13018.392151
std       50321.111313
min           0.720000
25%          53.310000
50%         354.370000
75%        5640.742500
max      473438.510000
Name: sci_articles_2017, dtype: float64

In [83]:
df[df['sci_articles_2017'].isna()]

Unnamed: 0,Country Name,Country Code,rural_pop_percent_17,food_production_index_17,ag_land_area_17,arable_land_percent_2017,net_migration_2017,hiv_prevalence_2017,mat_mortality_ratio_2017,under5_mortality_ratio_2017,...,cell_subscriptions_per100,internet_per_mil_2017,military_exp_2017,women_seats_percent_2017,male_bus_start_2017,female_bus_start_2017,patent_apps_2017,sci_articles_2017,pop_density_2017,HDI
70,"Hong Kong SAR, China",HKG,0.0,98.709999,4.857143,2.952381,146542.0,0.4,13.25,7.2,...,251.765006,10482.740897,1.430273,20.10101,1.5,1.5,324.0,,7041.142857,0.946


Hong Kong is the only country where data is unavailable. Based on our own estimations, we are going to assign Hong Kong a value in line with the 75% value.

In [84]:
df['sci_articles_2017'].fillna(df['sci_articles_2017'].quantile(.75), inplace=True)

### Check for Nulls

In [85]:
df.isnull().sum()

Country Name                   0
Country Code                   0
rural_pop_percent_17           0
food_production_index_17       0
ag_land_area_17                0
arable_land_percent_2017       0
net_migration_2017             0
hiv_prevalence_2017            0
mat_mortality_ratio_2017       0
under5_mortality_ratio_2017    0
tubercul_incidence_2017        0
elec_access_2017               0
ren_energy_percent_2017        0
co2_emissions_2017             0
pop_air_pollution_2017         0
foreign_dir_inv_2017           0
atm_access_2017                0
adol_fertility_rate_2017       0
fem_labor_part_rate_2017       0
male_labor_part_rate_2017      0
fertility_rate_2017            0
dpt_immuniz_rate_2017          0
undernourished_rate_2017       0
cell_subscriptions_per100      0
internet_per_mil_2017          0
military_exp_2017              0
women_seats_percent_2017       0
male_bus_start_2017            0
female_bus_start_2017          0
patent_apps_2017               0
sci_articl

### Save cleaned DataFrame

In [86]:
df.to_csv('../datasets/cleaned_data.csv', index=False)