In [1]:
import pandas as pd

## Do the ETL process on the WHO data
1. Get the World Happiness for the ML model process

In [2]:
# Load the WHO "World health statistics 2024" workbook from the local Resources folder
who_df = pd.read_excel("Resources/World health statistics 2024.xlsx")
# Preview the first rows to inspect the available columns
who_df.head()

Unnamed: 0,IND_NAME,DIM_GEO_NAME,IND_CODE,DIM_GEO_CODE,DIM_TIME_YEAR,DIM_1_CODE,VALUE_NUMERIC,VALUE_STRING,VALUE_COMMENTS
0,Adolescent birth rate (per 1000 women),Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS15-19,62.0,62.0,Afghanistan 2022-2023 Multiple Indicator Clust...
1,Adolescent birth rate (per 1000 women),Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS10-14,18.0,18.0,Afghanistan 2022-2023 Multiple Indicator Clust...
2,Age-standardized mortality rate attributed to ...,Afghanistan,SDGAIRBODA,AFG,2019,SEX_BTSX,265.66452,265.7,
3,Age-standardized prevalence of hypertension am...,Afghanistan,NCD_HYP_PREVALENCE_A,AFG,2019,SEX_BTSX,40.200001,40.2,
4,Age-standardized prevalence of obesity among a...,Afghanistan,NCD_BMI_30A,AFG,2022,SEX_BTSX,19.222589,19.2,


In [3]:
# VALUE_STRING and VALUE_COMMENTS are not used downstream, so remove both columns
who_df = who_df.drop(["VALUE_STRING", "VALUE_COMMENTS"], axis="columns")
who_df.head()

Unnamed: 0,IND_NAME,DIM_GEO_NAME,IND_CODE,DIM_GEO_CODE,DIM_TIME_YEAR,DIM_1_CODE,VALUE_NUMERIC
0,Adolescent birth rate (per 1000 women),Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS15-19,62.0
1,Adolescent birth rate (per 1000 women),Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS10-14,18.0
2,Age-standardized mortality rate attributed to ...,Afghanistan,SDGAIRBODA,AFG,2019,SEX_BTSX,265.66452
3,Age-standardized prevalence of hypertension am...,Afghanistan,NCD_HYP_PREVALENCE_A,AFG,2019,SEX_BTSX,40.200001
4,Age-standardized prevalence of obesity among a...,Afghanistan,NCD_BMI_30A,AFG,2022,SEX_BTSX,19.222589


In [4]:
# For ML: the indicator is already codified in IND_CODE, so the IND_NAME column is removed
who_ml_df = who_df.drop("IND_NAME", axis="columns")
who_ml_df.head()

Unnamed: 0,DIM_GEO_NAME,IND_CODE,DIM_GEO_CODE,DIM_TIME_YEAR,DIM_1_CODE,VALUE_NUMERIC
0,Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS15-19,62.0
1,Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS10-14,18.0
2,Afghanistan,SDGAIRBODA,AFG,2019,SEX_BTSX,265.66452
3,Afghanistan,NCD_HYP_PREVALENCE_A,AFG,2019,SEX_BTSX,40.200001
4,Afghanistan,NCD_BMI_30A,AFG,2022,SEX_BTSX,19.222589


In [5]:
# Rows with IND_CODE "WHOSIS_0001" / "WHOSIS_0002" are the targets; this part of the df needs to be pivoted.
# Keep only the rows whose IND_CODE is one of those two values.
filtered_df = who_ml_df[who_ml_df["IND_CODE"].isin(["WHOSIS_0001", "WHOSIS_0002"])]
filtered_df.head()

Unnamed: 0,DIM_GEO_NAME,IND_CODE,DIM_GEO_CODE,DIM_TIME_YEAR,DIM_1_CODE,VALUE_NUMERIC
15,Afghanistan,WHOSIS_0002,AFG,2021,SEX_FMLE,51.312912
16,Afghanistan,WHOSIS_0002,AFG,2021,SEX_MLE,49.64695
17,Afghanistan,WHOSIS_0002,AFG,2021,SEX_BTSX,50.446243
19,Afghanistan,WHOSIS_0001,AFG,2021,SEX_BTSX,59.126904
20,Afghanistan,WHOSIS_0001,AFG,2021,SEX_FMLE,60.986038


In [6]:
# List the distinct DIM_1_CODE values present in the target rows
filtered_df["DIM_1_CODE"].unique()

array(['SEX_FMLE', 'SEX_MLE', 'SEX_BTSX'], dtype=object)

In [7]:
# Collapse the DIM_1_CODE breakdown (SEX_MLE, SEX_FMLE, SEX_BTSX) into a single mean
# per country, year and indicator.
# NOTE(review): SEX_BTSX presumably is already the both-sexes figure, so this mean mixes it
# with the per-sex rows — confirm that this is the intended target value.
aggregated_df = (
    filtered_df
    .groupby(["DIM_GEO_NAME", "DIM_TIME_YEAR", "IND_CODE"], as_index=False)["VALUE_NUMERIC"]
    .mean()
)
aggregated_df.head()

Unnamed: 0,DIM_GEO_NAME,DIM_TIME_YEAR,IND_CODE,VALUE_NUMERIC
0,Afghanistan,2021,WHOSIS_0001,59.172241
1,Afghanistan,2021,WHOSIS_0002,50.468702
2,African Region,2021,WHOSIS_0001,63.555041
3,African Region,2021,WHOSIS_0002,55.157518
4,Albania,2021,WHOSIS_0001,76.454702


In [8]:
# Reshape to wide format: one row per (country, year) and one column per IND_CODE,
# then turn the index levels back into regular columns.
pivot_df = aggregated_df.pivot_table(
    values="VALUE_NUMERIC",
    index=["DIM_GEO_NAME", "DIM_TIME_YEAR"],
    columns="IND_CODE",
).reset_index()

pivot_df.head()

IND_CODE,DIM_GEO_NAME,DIM_TIME_YEAR,WHOSIS_0001,WHOSIS_0002
0,Afghanistan,2021,59.172241,50.468702
1,African Region,2021,63.555041,55.157518
2,Albania,2021,76.454702,66.72742
3,Algeria,2021,76.040944,65.572347
4,Angola,2021,62.131477,53.765409


In [9]:
# For ML: remove the target rows (IND_CODE "WHOSIS_0001" / "WHOSIS_0002") so only feature rows remain
who_ml_df = who_ml_df[~who_ml_df["IND_CODE"].isin(["WHOSIS_0001", "WHOSIS_0002"])]

who_ml_df.head()

Unnamed: 0,DIM_GEO_NAME,IND_CODE,DIM_GEO_CODE,DIM_TIME_YEAR,DIM_1_CODE,VALUE_NUMERIC
0,Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS15-19,62.0
1,Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS10-14,18.0
2,Afghanistan,SDGAIRBODA,AFG,2019,SEX_BTSX,265.66452
3,Afghanistan,NCD_HYP_PREVALENCE_A,AFG,2019,SEX_BTSX,40.200001
4,Afghanistan,NCD_BMI_30A,AFG,2022,SEX_BTSX,19.222589


In [10]:
# List the distinct indicator codes that remain as features
who_ml_df["IND_CODE"].unique()

array(['MDG_0000000003', 'SDGAIRBODA', 'NCD_HYP_PREVALENCE_A',
       'NCD_BMI_30A', 'M_Est_tob_curr_std', 'SDGODAWS', 'SDGPM25',
       'SDGIHR2021', 'HWF_0010', 'HWF_0001', 'HWF_0006', 'HWF_0014',
       'WHS4_100', 'GHED_GGHE-DGGE_SHA2011', 'SDGHEPHBSAGPRV',
       'MALARIA_EST_INCIDENCE', 'MDG_0000000026', 'MCV2', 'SDGWSHBOD',
       'VIOLENCE_HOMICIDERATE', 'SDGPOISON', 'WHOSIS_000003', 'SDGHIV',
       'VACCINEPREVENTABLE_WILDPOLIO', 'PCV3',
       'FINPROTECTION_CATA_TOT_10_POP', 'FINPROTECTION_CATA_TOT_25_POP',
       'NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV', 'NCD_BMI_PLUS2C',
       'NUTOVERWEIGHTPREV', 'NUTSTUNTINGPREV', 'NUTRITION_WH_2',
       'NCDMORT3070', 'MDG_0000000025', 'SDGIPV12M', 'SDGIPVLT',
       'WSH_HYGIENE_BASIC', 'WSH_WATER_SAFELY_MANAGED',
       'PHE_HHAIR_PROP_POP_CLEAN_FUELS', 'SDGFPALL', 'SDGNTDTREATMENT',
       'RS_198', 'SDGSUICIDE', 'SA_0000001688', 'SDGODA01',
       'MDG_0000000020', 'UHC_INDEX_REPORTED', 'MDG_0000000007',
       'SDGHPVRECEIVED', 

In [11]:
# Number of feature rows before merging in the target columns
len(who_ml_df)

9351

In [12]:
# Inner-join the wide target columns onto the feature rows by country name and year
who_ml_merged_df = who_ml_df.merge(pivot_df, how="inner", on=["DIM_GEO_NAME", "DIM_TIME_YEAR"])
who_ml_merged_df.head()

Unnamed: 0,DIM_GEO_NAME,IND_CODE,DIM_GEO_CODE,DIM_TIME_YEAR,DIM_1_CODE,VALUE_NUMERIC,WHOSIS_0001,WHOSIS_0002
0,Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS15-19,62.0,59.172241,50.468702
1,Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS10-14,18.0,59.172241,50.468702
2,Afghanistan,GHED_GGHE-DGGE_SHA2011,AFG,2021,,4.05,59.172241,50.468702
3,Afghanistan,VIOLENCE_HOMICIDERATE,AFG,2021,,8.37823,59.172241,50.468702
4,Afghanistan,SDGPOISON,AFG,2021,,2.503224,59.172241,50.468702


In [13]:
len(who_ml_merged_df)

# The "Healthy life expectancy at birth (years)" and "Life expectancy at birth (years)" metrics, which according to
# the World health statistics 2024 report are the targets, are only available in this data set for 2021.
# Instead, we will merge the "Healthy life expectancy at birth" column from the World Happiness Report as the
# target of this WHO dataset, and keep the rows with IND_CODE = ["WHOSIS_0001", "WHOSIS_0002"] removed.

1726

In [14]:
# df to be used from here on: the feature rows without the merged WHO target columns
who_ml_df.head()

Unnamed: 0,DIM_GEO_NAME,IND_CODE,DIM_GEO_CODE,DIM_TIME_YEAR,DIM_1_CODE,VALUE_NUMERIC
0,Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS15-19,62.0
1,Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS10-14,18.0
2,Afghanistan,SDGAIRBODA,AFG,2019,SEX_BTSX,265.66452
3,Afghanistan,NCD_HYP_PREVALENCE_A,AFG,2019,SEX_BTSX,40.200001
4,Afghanistan,NCD_BMI_30A,AFG,2022,SEX_BTSX,19.222589


## Merge the WHO df with the Country - Regions file
Instead of creating dummy columns for the countries, use the ISO code for country, region, and intermediate region

In [15]:
# Load the CSV file with the countries and their regions from the local Resources folder
country_regions_df = pd.read_csv("Resources/Countries and regions.csv")
# Preview the first rows to inspect the available columns
country_regions_df.head()

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


In [16]:
# Keep only the columns needed for the merges: country name, ISO alpha-3 code, region and sub-region.
# Selecting the kept columns (instead of dropping the others) keeps this cell re-runnable:
# a repeated drop would raise a KeyError because the dropped columns are already gone.
country_regions_df = country_regions_df[["name", "alpha-3", "region", "sub-region"]]
country_regions_df.head()

Unnamed: 0,name,alpha-3,region,sub-region
0,Afghanistan,AFG,Asia,Southern Asia
1,Åland Islands,ALA,Europe,Northern Europe
2,Albania,ALB,Europe,Southern Europe
3,Algeria,DZA,Africa,Northern Africa
4,American Samoa,ASM,Oceania,Polynesia


In [17]:
# Check that all countries are in both files => potential spelling/code differences.
# Start by collecting the distinct geo codes used in the WHO data.
who_countries = who_ml_df["DIM_GEO_CODE"].unique()
who_countries

array(['AFG', 'AFR', 'ALB', 'DZA', 'AND', 'AGO', 'ATG', 'ARG', 'ARM',
       'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL',
       'BLZ', 'BEN', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'BRN', 'BGR',
       'BFA', 'BDI', 'CPV', 'KHM', 'CMR', 'CAN', 'CAF', 'TCD', 'CHL',
       'CHN', 'COL', 'COM', 'COG', 'COK', 'CRI', 'CIV', 'HRV', 'CUB',
       'CYP', 'CZE', 'PRK', 'COD', 'DNK', 'DJI', 'DMA', 'DOM', 'EMR',
       'ECU', 'EGY', 'SLV', 'GNQ', 'ERI', 'EST', 'SWZ', 'ETH', 'EUR',
       'FJI', 'FIN', 'FRA', 'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GLOBAL',
       'GRC', 'GRD', 'GTM', 'GIN', 'GNB', 'GUY', 'HTI', 'HND', 'HUN',
       'ISL', 'IND', 'IDN', 'IRN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM',
       'JPN', 'JOR', 'KAZ', 'KEN', 'KIR', 'KWT', 'KGZ', 'LAO', 'LVA',
       'LBN', 'LSO', 'LBR', 'LBY', 'LTU', 'LUX', 'MDG', 'MWI', 'MYS',
       'MDV', 'MLI', 'MLT', 'MHL', 'MRT', 'MUS', 'MEX', 'FSM', 'MCO',
       'MNG', 'MNE', 'MAR', 'MOZ', 'MMR', 'NAM', 'NRU', 'NPL', 'NLD',
       'NZL', 'NI

In [18]:
# Wrap the WHO geo codes in a one-column frame named like the matching
# column of country_regions_df ("alpha-3") so the two can be merged
who_countries_df = pd.DataFrame(who_countries, columns=["alpha-3"])
who_countries_df.head()

Unnamed: 0,alpha-3
0,AFG
1,AFR
2,ALB
3,DZA
4,AND


In [19]:
# Print the number of records in each df
print(
    f'Unique countries in the WHO df: {len(who_countries_df)}',
    f'Unique countries in the Country_Region df: {len(country_regions_df)}',
    sep="\n",
)

Unique countries in the WHO df: 203
Unique countries in the Country_Region df: 249


In [20]:
# Left-join the region info onto the WHO codes: every WHO code is kept,
# and codes without a match get NaN in the joined columns
merge_countries_df = who_countries_df.merge(country_regions_df, how="left", on="alpha-3")
merge_countries_df

Unnamed: 0,alpha-3,name,region,sub-region
0,AFG,Afghanistan,Asia,Southern Asia
1,AFR,,,
2,ALB,Albania,Europe,Southern Europe
3,DZA,Algeria,Africa,Northern Africa
4,AND,Andorra,Europe,Southern Europe
...,...,...,...,...
198,VNM,Vietnam,Asia,South-eastern Asia
199,WPR,,,
200,YEM,Yemen,Asia,Western Asia
201,ZMB,Zambia,Africa,Sub-Saharan Africa


In [21]:
# Show the WHO codes that found no match (NaN "name") to decide how to handle them
merge_countries_df[merge_countries_df["name"].isna()]

Unnamed: 0,alpha-3,name,region,sub-region
1,AFR,,,
53,EMR,,,
62,EUR,,,
71,GLOBAL,,,
146,AMR,,,
170,SEAR,,,
199,WPR,,,


## AFR
AFR is a region, not a country => Algeria, Angola, Benin, Botswana, Burkina Faso, Burundi, Cameroon, Cape Verde, Central African Republic, Chad, Comoros, Ivory Coast, Democratic Republic of the Congo, Equatorial Guinea, Eritrea, Ethiopia, Gabon, Gambia, Ghana, Guinea, Guinea-Bissau, Kenya, Lesotho, Liberia, Madagascar, Malawi, Mali, Mauritania, Mauritius, Mozambique, Namibia, Niger, Nigeria, Republic of the Congo, Rwanda, São Tomé and Príncipe, Senegal, Seychelles, Sierra Leone, South Africa, South Sudan, Eswatini, Togo, Uganda, Tanzania, Zambia, Zimbabwe.

- Region: Africa
- Sub Region: Sub-Saharan Africa (except Algeria, which is Northern Africa)

## EMR
Eastern Mediterranean Region does not have a single dedicated ISO code as it is a geographical region encompassing multiple countries, each with their own ISO code.

Afghanistan, Bahrain, Djibouti, Egypt, Iran, Iraq, Jordan, Kuwait, Lebanon, Libya, Morocco, Oman, Pakistan, Qatar, Saudi Arabia, Somalia, Sudan, Syria, Tunisia, United Arab Emirates, Yemen.

Afghanistan, Iran, Pakistan
- Region: Asia
- Sub Region: Southern Asia

Bahrain, Iraq, Jordan, Kuwait, Lebanon, Oman, Qatar, Saudi Arabia, Syrian Arab Republic, United Arab Emirates, Yemen
- Region: Asia
- Sub Region: Western Asia

Egypt, Libya, Morocco, Sudan, Tunisia
- Region: Africa
- Sub Region: Northern Africa

Djibouti, Somalia
- Region: Africa
- Sub Region: Eastern Africa


## EUR
EUR is a region, not a country => Albania, Andorra, Armenia, Austria, Azerbaijan, Belarus, Belgium, Bosnia and Herzegovina, Bulgaria, Croatia, Cyprus, Czech Republic, Denmark, Estonia, Finland, France, Georgia, Germany, Greece, Hungary, Iceland, Ireland, Israel, Italy, Kazakhstan, Kyrgyzstan, Latvia, Lithuania, Luxembourg, Malta, Moldova, Monaco, Montenegro, Netherlands, North Macedonia, Norway, Poland, Portugal, Romania, Russia, San Marino, Serbia, Slovakia, Slovenia, Spain, Sweden, Switzerland, Tajikistan, Turkey, Turkmenistan, Ukraine, United Kingdom, Uzbekistan.

- Region: Europe
- Sub Region: Northern Europe / Southern Europe / Eastern Europe / Western Europe

## AMR
AMR is a region, not a country => Antigua and Barbuda, Argentina, Bahamas, Barbados, Belize, Bolivia, Brazil, Canada, Chile, Colombia, Costa Rica, Cuba, Dominica, Dominican Republic, Ecuador, El Salvador, Grenada, Guatemala, Guyana, Haiti, Honduras, Jamaica, Mexico, Nicaragua, Panama, Paraguay, Peru, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Suriname, Trinidad and Tobago, United States, Uruguay, Venezuela

- Region: Americas
- Sub Region: Latin America and the Caribbean / Northern America

## SEAR
SEAR is a region, not a country => Bangladesh, Bhutan, Democratic People's Republic of Korea, India, Indonesia, Maldives, Myanmar, Nepal, Sri Lanka, Cambodia, Thailand, Timor-Leste.

- Region: Asia
- Sub Region: Southern Asia / South-eastern Asia / Eastern Asia

## WPR
WPR is a region, not a country => Australia, Brunei, China, Cook Islands, Fiji, Japan, Kiribati, Laos, Malaysia, Marshall Islands, Micronesia, Mongolia, Nauru, New Zealand, Niue, Palau, Papua New Guinea, Philippines, Samoa, Singapore, Solomon Islands, South Korea, Taiwan, Tonga, Tuvalu, Vanuatu, Vietnam.

Australia, New Zealand
- Region: Oceania
- Sub Region: Australia and New Zealand

Fiji, Papua New Guinea, Solomon Islands, Vanuatu
- Region: Oceania
- Sub Region: Melanesia

Kiribati, Marshall Islands, Micronesia Federated States of, Nauru, Palau
- Region: Oceania
- Sub Region: Micronesia

Cook Islands, Niue, Samoa, Tonga, Tuvalu
- Region: Oceania
- Sub Region: Polynesia

China, Japan, South Korea (KR), Mongolia
- Region: Asia
- Sub Region: Eastern Asia

Brunei Darussalam, Lao People's Democratic Republic, Malaysia, Philippines, Singapore, Viet Nam
- Region: Asia
- Sub Region: South-eastern Asia

## GLOBAL
When a WHO report uses the term "GLOBAL," it refers to data or information encompassing the entire world, meaning it aggregates health statistics and trends across all countries monitored by the organization, providing a comprehensive picture of health issues on a worldwide scale => These rows will need to be dropped

In [22]:
# Collapse the DIM_1_CODE breakdown: one mean VALUE_NUMERIC per country, year and indicator
who_aggregated_df = (
    who_ml_df
    .groupby(["DIM_GEO_NAME", "DIM_GEO_CODE", "DIM_TIME_YEAR", "IND_CODE"], as_index=False)["VALUE_NUMERIC"]
    .mean()
)
who_aggregated_df

Unnamed: 0,DIM_GEO_NAME,DIM_GEO_CODE,DIM_TIME_YEAR,IND_CODE,VALUE_NUMERIC
0,Afghanistan,AFG,2016,SDGFPALL,42.099998
1,Afghanistan,AFG,2018,HWF_0006,4.520000
2,Afghanistan,AFG,2018,HWF_0014,0.292000
3,Afghanistan,AFG,2018,SDGIPV12M,35.000000
4,Afghanistan,AFG,2018,SDGIPVLT,46.000000
...,...,...,...,...,...
9201,"occupied Palestinian territory, including east...",PSE,2022,WHOSIS_000003,8.928316
9202,"occupied Palestinian territory, including east...",PSE,2022,WHS4_100,98.000000
9203,"occupied Palestinian territory, including east...",PSE,2022,WSH_HYGIENE_BASIC,94.917770
9204,"occupied Palestinian territory, including east...",PSE,2022,WSH_SANITATION_SAFELY_MANAGED,70.108719


In [23]:
# Reshape who_aggregated_df to wide format: one row per (country, code, year) and one
# column per IND_CODE, then turn the index levels back into regular columns
who_pivot_df = who_aggregated_df.pivot_table(
    values="VALUE_NUMERIC",
    index=["DIM_GEO_NAME", "DIM_GEO_CODE", "DIM_TIME_YEAR"],
    columns="IND_CODE",
).reset_index()

who_pivot_df.head()

IND_CODE,DIM_GEO_NAME,DIM_GEO_CODE,DIM_TIME_YEAR,AMR_INFECT_ECOLI,AMR_INFECT_MRSA,FINPROTECTION_CATA_TOT_10_POP,FINPROTECTION_CATA_TOT_25_POP,GHED_GGHE-DGGE_SHA2011,GLASSAMC_AWARE,HWF_0001,...,SUD_TREATMENTSERVICES_COVERAGE,UHC_INDEX_REPORTED,VACCINEPREVENTABLE_WILDPOLIO,VIOLENCE_HOMICIDERATE,WHOSIS_000003,WHS4_100,WSH_DOMESTIC_WASTE_SAFELY_TREATED,WSH_HYGIENE_BASIC,WSH_SANITATION_SAFELY_MANAGED,WSH_WATER_SAFELY_MANAGED
0,Afghanistan,AFG,2016,,,,,,,,...,,,,,,,,,,
1,Afghanistan,AFG,2018,,,,,,,,...,,,,,,,,,,
2,Afghanistan,AFG,2019,,,,,,,,...,,,,,,,,,,
3,Afghanistan,AFG,2020,,,26.08,8.03,,,2.535,...,,,,,,,,,,
4,Afghanistan,AFG,2021,,,,,4.05,,,...,,40.884609,,8.37823,,,,,,


In [24]:
# Build the per-country WHO frame: attach name, alpha-3, region and sub-region by ISO code
# (used instead of creating dummy columns per country); unmatched codes get NaN
who_country_ml_df = who_pivot_df.merge(
    country_regions_df,
    how="left",
    left_on="DIM_GEO_CODE",
    right_on="alpha-3",
)
who_country_ml_df.head()

Unnamed: 0,DIM_GEO_NAME,DIM_GEO_CODE,DIM_TIME_YEAR,AMR_INFECT_ECOLI,AMR_INFECT_MRSA,FINPROTECTION_CATA_TOT_10_POP,FINPROTECTION_CATA_TOT_25_POP,GHED_GGHE-DGGE_SHA2011,GLASSAMC_AWARE,HWF_0001,...,WHOSIS_000003,WHS4_100,WSH_DOMESTIC_WASTE_SAFELY_TREATED,WSH_HYGIENE_BASIC,WSH_SANITATION_SAFELY_MANAGED,WSH_WATER_SAFELY_MANAGED,name,alpha-3,region,sub-region
0,Afghanistan,AFG,2016,,,,,,,,...,,,,,,,Afghanistan,AFG,Asia,Southern Asia
1,Afghanistan,AFG,2018,,,,,,,,...,,,,,,,Afghanistan,AFG,Asia,Southern Asia
2,Afghanistan,AFG,2019,,,,,,,,...,,,,,,,Afghanistan,AFG,Asia,Southern Asia
3,Afghanistan,AFG,2020,,,26.08,8.03,,,2.535,...,,,,,,,Afghanistan,AFG,Asia,Southern Asia
4,Afghanistan,AFG,2021,,,,,4.05,,,...,,,,,,,Afghanistan,AFG,Asia,Southern Asia


In [25]:
# Select the rows with NaN in the "name" column to confirm they are the same codes found earlier
nan = who_country_ml_df[who_country_ml_df["name"].isna()]
print(
    f'Number of records in the who_country_ml_df: {len(who_country_ml_df)}',
    f'Unique NaN DIM_GEO_CODE: {nan["DIM_GEO_CODE"].unique()}',
    f'Number of NaN records: {len(nan)}',
    sep="\n",
)

Number of records in the who_country_ml_df: 1415
Unique NaN DIM_GEO_CODE: ['AFR' 'EMR' 'EUR' 'GLOBAL' 'AMR' 'SEAR' 'WPR']
Number of NaN records: 46


In [26]:
# Keep only the rows that matched a country, i.e. those with a non-NaN "name"
who_country_ml_df = who_country_ml_df[who_country_ml_df["name"].notna()]
who_country_ml_df

Unnamed: 0,DIM_GEO_NAME,DIM_GEO_CODE,DIM_TIME_YEAR,AMR_INFECT_ECOLI,AMR_INFECT_MRSA,FINPROTECTION_CATA_TOT_10_POP,FINPROTECTION_CATA_TOT_25_POP,GHED_GGHE-DGGE_SHA2011,GLASSAMC_AWARE,HWF_0001,...,WHOSIS_000003,WHS4_100,WSH_DOMESTIC_WASTE_SAFELY_TREATED,WSH_HYGIENE_BASIC,WSH_SANITATION_SAFELY_MANAGED,WSH_WATER_SAFELY_MANAGED,name,alpha-3,region,sub-region
0,Afghanistan,AFG,2016,,,,,,,,...,,,,,,,Afghanistan,AFG,Asia,Southern Asia
1,Afghanistan,AFG,2018,,,,,,,,...,,,,,,,Afghanistan,AFG,Asia,Southern Asia
2,Afghanistan,AFG,2019,,,,,,,,...,,,,,,,Afghanistan,AFG,Asia,Southern Asia
3,Afghanistan,AFG,2020,,,26.08,8.03,,,2.535,...,,,,,,,Afghanistan,AFG,Asia,Southern Asia
4,Afghanistan,AFG,2021,,,,,4.05,,,...,,,,,,,Afghanistan,AFG,Asia,Southern Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1410,"occupied Palestinian territory, including east...",PSE,2018,,,,,,,21.677,...,,,,,,,"Palestine, State of",PSE,Asia,Western Asia
1411,"occupied Palestinian territory, including east...",PSE,2019,,,,,,,,...,,,,,,,"Palestine, State of",PSE,Asia,Western Asia
1412,"occupied Palestinian territory, including east...",PSE,2020,,,,,,,,...,,,,,,,"Palestine, State of",PSE,Asia,Western Asia
1413,"occupied Palestinian territory, including east...",PSE,2021,46.23,73.300003,,,,,,...,,,,,,,"Palestine, State of",PSE,Asia,Western Asia


In [27]:
# Count the non-NaN entries per column to see how sparsely each indicator is populated
who_country_ml_df.count()

DIM_GEO_NAME                1369
DIM_GEO_CODE                1369
DIM_TIME_YEAR               1369
AMR_INFECT_ECOLI              76
AMR_INFECT_MRSA               77
                            ... 
WSH_WATER_SAFELY_MANAGED     121
name                        1369
alpha-3                     1369
region                      1369
sub-region                  1369
Length: 63, dtype: int64

In [28]:
# "name" and "alpha-3" duplicate DIM_GEO_NAME / DIM_GEO_CODE after the merge, so remove them
who_country_ml_df = who_country_ml_df.drop(["name", "alpha-3"], axis="columns")
who_country_ml_df.count()

DIM_GEO_NAME                     1369
DIM_GEO_CODE                     1369
DIM_TIME_YEAR                    1369
AMR_INFECT_ECOLI                   76
AMR_INFECT_MRSA                    77
                                 ... 
WSH_HYGIENE_BASIC                  82
WSH_SANITATION_SAFELY_MANAGED     125
WSH_WATER_SAFELY_MANAGED          121
region                           1369
sub-region                       1369
Length: 61, dtype: int64

## Merge the WHO_country df with the World Happiness Report Data to get the target column "Healthy life expectancy at birth"

In [29]:
# Load the World Happiness Report workbook from the local Resources folder
world_happiness_df = pd.read_excel("Resources/World Happiness Report Data.xlsx")
# Preview the first rows to inspect the available columns
world_happiness_df.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.164055,0.881686,0.414297,0.258195
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.187297,0.850035,0.481421,0.237092
2,Afghanistan,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.117861,0.706766,0.516907,0.275324
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.160098,0.731109,0.479835,0.267175
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.234157,0.77562,0.613513,0.267919


In [30]:
# Reduce the World Happiness data to the join keys (country, year) plus the target column
world_happiness_target_df = world_happiness_df.loc[
    :, ["Country name", "year", "Healthy life expectancy at birth"]
]
world_happiness_target_df.head()


Unnamed: 0,Country name,year,Healthy life expectancy at birth
0,Afghanistan,2008,50.5
1,Afghanistan,2009,50.799999
2,Afghanistan,2010,51.099998
3,Afghanistan,2011,51.400002
4,Afghanistan,2012,51.700001


In [31]:
# Attach alpha-3 / region info to the target rows by country name; names that do not
# match country_regions_df get NaN in the joined columns
world_happiness_target_alpha3_df = world_happiness_target_df.merge(
    country_regions_df,
    how="left",
    left_on="Country name",
    right_on="name",
)
world_happiness_target_alpha3_df

Unnamed: 0,Country name,year,Healthy life expectancy at birth,name,alpha-3,region,sub-region
0,Afghanistan,2008,50.500000,Afghanistan,AFG,Asia,Southern Asia
1,Afghanistan,2009,50.799999,Afghanistan,AFG,Asia,Southern Asia
2,Afghanistan,2010,51.099998,Afghanistan,AFG,Asia,Southern Asia
3,Afghanistan,2011,51.400002,Afghanistan,AFG,Asia,Southern Asia
4,Afghanistan,2012,51.700001,Afghanistan,AFG,Asia,Southern Asia
...,...,...,...,...,...,...,...
2358,Zimbabwe,2019,53.099998,Zimbabwe,ZWE,Africa,Sub-Saharan Africa
2359,Zimbabwe,2020,53.575001,Zimbabwe,ZWE,Africa,Sub-Saharan Africa
2360,Zimbabwe,2021,54.049999,Zimbabwe,ZWE,Africa,Sub-Saharan Africa
2361,Zimbabwe,2022,54.525002,Zimbabwe,ZWE,Africa,Sub-Saharan Africa


In [32]:
# Rows whose joined "name" is NaN reveal the "Country name" values with a spelling mismatch
nan2 = world_happiness_target_alpha3_df[world_happiness_target_alpha3_df["name"].isna()]
print(f'Unique NaN Country name: {nan2["Country name"].unique()}')

Unique NaN Country name: ['Bolivia' 'Congo (Brazzaville)' 'Congo (Kinshasa)'
 'Hong Kong S.A.R. of China' 'Iran' 'Kosovo' 'Laos' 'Moldova'
 'Netherlands' 'Russia' 'Somaliland region' 'South Korea'
 'State of Palestine' 'Syria' 'Taiwan Province of China' 'Tanzania'
 'United Kingdom' 'United States' 'Venezuela']


In [33]:
# Rename the World Happiness country names to the spellings used in country_regions_df,
# so that both frames can be joined on the country name.
# One mapping passed to a single replace() call stands in for a chain of per-country
# replace statements; values are matched as whole cell values, so unlisted names are untouched.
# NOTE(review): 'Somaliland region' is folded into 'Somalia' — confirm this is intended.
country_name_fixes = {
    'Bolivia': 'Bolivia, Plurinational State of',
    'Congo (Brazzaville)': 'Congo',
    'Congo (Kinshasa)': 'Congo, Democratic Republic of the',
    'Hong Kong S.A.R. of China': 'Hong Kong',
    'Iran': 'Iran, Islamic Republic of',
    'Laos': "Lao People's Democratic Republic",
    'Moldova': 'Moldova, Republic of',
    'Netherlands': 'Netherlands, Kingdom of the',
    'Russia': 'Russian Federation',
    'Somaliland region': 'Somalia',
    'South Korea': 'Korea, Republic of',
    'State of Palestine': 'Palestine, State of',
    'Syria': 'Syrian Arab Republic',
    'Taiwan Province of China': 'Taiwan, Province of China',
    'Tanzania': 'Tanzania, United Republic of',
    'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
    'United States': 'United States of America',
    'Venezuela': 'Venezuela, Bolivarian Republic of',
}
world_happiness_df['Country name'] = world_happiness_df['Country name'].replace(country_name_fixes)

In [34]:
# Kosovo has no row in country_regions_df (it showed up among the unmatched names),
# so build a one-row frame for it with the same columns
new_country = pd.DataFrame({"name": ["Kosovo"],
                            "alpha-3": ["XKX"],
                            "region": ["Europe"],
                            "sub-region": ["Southern Europe"]})

# Append the new country only when its code is not already present: re-running this cell
# would otherwise add a duplicate Kosovo row, which in turn duplicates rows in later merges
if not country_regions_df["alpha-3"].isin(new_country["alpha-3"]).any():
    country_regions_df = pd.concat([country_regions_df, new_country], ignore_index=True)

In [35]:
# Rebuild the target frame so that it picks up the corrected country names
world_happiness_target_df = world_happiness_df.loc[
    :, ["Country name", "year", "Healthy life expectancy at birth"]
]
world_happiness_target_df.head()

Unnamed: 0,Country name,year,Healthy life expectancy at birth
0,Afghanistan,2008,50.5
1,Afghanistan,2009,50.799999
2,Afghanistan,2010,51.099998
3,Afghanistan,2011,51.400002
4,Afghanistan,2012,51.700001


In [36]:
# Redo the name-based left join now that the country names and country_regions_df were fixed
world_happiness_target_alpha3_df = world_happiness_target_df.merge(
    country_regions_df,
    how="left",
    left_on="Country name",
    right_on="name",
)
world_happiness_target_alpha3_df

Unnamed: 0,Country name,year,Healthy life expectancy at birth,name,alpha-3,region,sub-region
0,Afghanistan,2008,50.500000,Afghanistan,AFG,Asia,Southern Asia
1,Afghanistan,2009,50.799999,Afghanistan,AFG,Asia,Southern Asia
2,Afghanistan,2010,51.099998,Afghanistan,AFG,Asia,Southern Asia
3,Afghanistan,2011,51.400002,Afghanistan,AFG,Asia,Southern Asia
4,Afghanistan,2012,51.700001,Afghanistan,AFG,Asia,Southern Asia
...,...,...,...,...,...,...,...
2358,Zimbabwe,2019,53.099998,Zimbabwe,ZWE,Africa,Sub-Saharan Africa
2359,Zimbabwe,2020,53.575001,Zimbabwe,ZWE,Africa,Sub-Saharan Africa
2360,Zimbabwe,2021,54.049999,Zimbabwe,ZWE,Africa,Sub-Saharan Africa
2361,Zimbabwe,2022,54.525002,Zimbabwe,ZWE,Africa,Sub-Saharan Africa


In [37]:
# Confirm that no "Country name" is left without a match (NaN in the joined "name" column)
nan3 = world_happiness_target_alpha3_df.loc[world_happiness_target_alpha3_df["name"].isna(), :]
print(f'Unique NaN Country name: {nan3["Country name"].unique()}')
# Fail loudly instead of relying on someone reading the printed list: unmatched names
# would otherwise silently end up with a NaN alpha-3 code in the later merges
assert nan3.empty, "Some World Happiness country names still have no match in country_regions_df"

Unique NaN Country name: []


In [38]:
# Only year, target and alpha-3 are needed for the join with the WHO frame; remove the rest
world_happiness_target_alpha3_df = world_happiness_target_alpha3_df.drop(
    ["Country name", "name", "region", "sub-region"],
    axis="columns",
)
world_happiness_target_alpha3_df

Unnamed: 0,year,Healthy life expectancy at birth,alpha-3
0,2008,50.500000,AFG
1,2009,50.799999,AFG
2,2010,51.099998,AFG
3,2011,51.400002,AFG
4,2012,51.700001,AFG
...,...,...,...
2358,2019,53.099998,ZWE
2359,2020,53.575001,ZWE
2360,2021,54.049999,ZWE
2361,2022,54.525002,ZWE


In [39]:
# Define a separate DataFrame for the who_country_ml features with NaN.
# .copy() makes it an independent object; a plain assignment would only bind a second
# name to the same DataFrame, so an in-place change to one would show up in the other.
who_country_nan_ml_df = who_country_ml_df.copy()


In [40]:
# Merge the target onto the WHO features (with NaN) based on alpha-3 code and year.
# The left side is who_country_ml_df rather than the result frame itself, so re-running this
# cell always starts from the un-merged features instead of merging the target in twice.
who_country_nan_ml_df = pd.merge(who_country_ml_df, world_happiness_target_alpha3_df, left_on=["DIM_GEO_CODE", "DIM_TIME_YEAR"], right_on=["alpha-3", "year"], how="left")
# A left join must not add rows; more rows would mean duplicated (alpha-3, year) keys in the target frame
assert len(who_country_nan_ml_df) == len(who_country_ml_df), "Target merge duplicated feature rows"
who_country_nan_ml_df.head()

Unnamed: 0,DIM_GEO_NAME,DIM_GEO_CODE,DIM_TIME_YEAR,AMR_INFECT_ECOLI,AMR_INFECT_MRSA,FINPROTECTION_CATA_TOT_10_POP,FINPROTECTION_CATA_TOT_25_POP,GHED_GGHE-DGGE_SHA2011,GLASSAMC_AWARE,HWF_0001,...,WHS4_100,WSH_DOMESTIC_WASTE_SAFELY_TREATED,WSH_HYGIENE_BASIC,WSH_SANITATION_SAFELY_MANAGED,WSH_WATER_SAFELY_MANAGED,region,sub-region,year,Healthy life expectancy at birth,alpha-3
0,Afghanistan,AFG,2016,,,,,,,,...,,,,,,Asia,Southern Asia,2016.0,52.924999,AFG
1,Afghanistan,AFG,2018,,,,,,,,...,,,,,,Asia,Southern Asia,2018.0,53.575001,AFG
2,Afghanistan,AFG,2019,,,,,,,,...,,,,,,Asia,Southern Asia,2019.0,53.900002,AFG
3,Afghanistan,AFG,2020,,,26.08,8.03,,,2.535,...,,,,,,Asia,Southern Asia,,,
4,Afghanistan,AFG,2021,,,,,4.05,,,...,,,,,,Asia,Southern Asia,2021.0,54.549999,AFG


In [41]:
# "year" and "alpha-3" were only needed as join keys; they duplicate DIM_TIME_YEAR / DIM_GEO_CODE
who_country_nan_ml_df = who_country_nan_ml_df.drop(["year", "alpha-3"], axis="columns")
who_country_nan_ml_df.head()

Unnamed: 0,DIM_GEO_NAME,DIM_GEO_CODE,DIM_TIME_YEAR,AMR_INFECT_ECOLI,AMR_INFECT_MRSA,FINPROTECTION_CATA_TOT_10_POP,FINPROTECTION_CATA_TOT_25_POP,GHED_GGHE-DGGE_SHA2011,GLASSAMC_AWARE,HWF_0001,...,VIOLENCE_HOMICIDERATE,WHOSIS_000003,WHS4_100,WSH_DOMESTIC_WASTE_SAFELY_TREATED,WSH_HYGIENE_BASIC,WSH_SANITATION_SAFELY_MANAGED,WSH_WATER_SAFELY_MANAGED,region,sub-region,Healthy life expectancy at birth
0,Afghanistan,AFG,2016,,,,,,,,...,,,,,,,,Asia,Southern Asia,52.924999
1,Afghanistan,AFG,2018,,,,,,,,...,,,,,,,,Asia,Southern Asia,53.575001
2,Afghanistan,AFG,2019,,,,,,,,...,,,,,,,,Asia,Southern Asia,53.900002
3,Afghanistan,AFG,2020,,,26.08,8.03,,,2.535,...,,,,,,,,Asia,Southern Asia,
4,Afghanistan,AFG,2021,,,,,4.05,,,...,8.37823,,,,,,,Asia,Southern Asia,54.549999


In [42]:
# Count how many records have no value in the target column "Healthy life expectancy at birth"
target_nan = who_country_nan_ml_df[who_country_nan_ml_df["Healthy life expectancy at birth"].isna()]
print(
    f'Total amount of records: {who_country_nan_ml_df["DIM_GEO_NAME"].count()}',
    f'Count target value NaN: {target_nan["DIM_GEO_NAME"].count()}',
    sep="\n",
)

Total amount of records: 1369
Count target value NaN: 442


**Unfortunately, we will lose 32.3% of the records for the ML model**

In [43]:
# Keep only the records that have a value in the target column "Healthy life expectancy at birth"
who_country_nan_ml_df = who_country_nan_ml_df[
    who_country_nan_ml_df["Healthy life expectancy at birth"].notna()
]
who_country_nan_ml_df.head()

Unnamed: 0,DIM_GEO_NAME,DIM_GEO_CODE,DIM_TIME_YEAR,AMR_INFECT_ECOLI,AMR_INFECT_MRSA,FINPROTECTION_CATA_TOT_10_POP,FINPROTECTION_CATA_TOT_25_POP,GHED_GGHE-DGGE_SHA2011,GLASSAMC_AWARE,HWF_0001,...,VIOLENCE_HOMICIDERATE,WHOSIS_000003,WHS4_100,WSH_DOMESTIC_WASTE_SAFELY_TREATED,WSH_HYGIENE_BASIC,WSH_SANITATION_SAFELY_MANAGED,WSH_WATER_SAFELY_MANAGED,region,sub-region,Healthy life expectancy at birth
0,Afghanistan,AFG,2016,,,,,,,,...,,,,,,,,Asia,Southern Asia,52.924999
1,Afghanistan,AFG,2018,,,,,,,,...,,,,,,,,Asia,Southern Asia,53.575001
2,Afghanistan,AFG,2019,,,,,,,,...,,,,,,,,Asia,Southern Asia,53.900002
4,Afghanistan,AFG,2021,,,,,4.05,,,...,8.37823,,,,,,,Asia,Southern Asia,54.549999
5,Afghanistan,AFG,2022,,,,,,,,...,,35.489811,69.0,,48.214699,,30.0341,Asia,Southern Asia,54.875


In [44]:
# Show the current column names (reference for the hard-coded column order in the next cell)
who_country_nan_ml_df.columns

Index(['DIM_GEO_NAME', 'DIM_GEO_CODE', 'DIM_TIME_YEAR', 'AMR_INFECT_ECOLI',
       'AMR_INFECT_MRSA', 'FINPROTECTION_CATA_TOT_10_POP',
       'FINPROTECTION_CATA_TOT_25_POP', 'GHED_GGHE-DGGE_SHA2011',
       'GLASSAMC_AWARE', 'HWF_0001', 'HWF_0006', 'HWF_0010', 'HWF_0014',
       'MALARIA_EST_INCIDENCE', 'MCV2', 'MDG_0000000003', 'MDG_0000000007',
       'MDG_0000000020', 'MDG_0000000025', 'MDG_0000000026',
       'M_Est_tob_curr_std', 'NCDMORT3070', 'NCD_BMI_30A', 'NCD_BMI_PLUS2C',
       'NCD_HYP_PREVALENCE_A', 'NUTOVERWEIGHTPREV',
       'NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV', 'NUTRITION_WH_2',
       'NUTSTUNTINGPREV', 'PCV3', 'PHE_HHAIR_PROP_POP_CLEAN_FUELS', 'RS_198',
       'SA_0000001688', 'SDGAIRBODA', 'SDGFPALL',
       'SDGHEALTHFACILITIESESSENTIALMEDS', 'SDGHEPHBSAGPRV', 'SDGHIV',
       'SDGHPVRECEIVED', 'SDGIHR2021', 'SDGIPV12M', 'SDGIPVLT',
       'SDGNTDTREATMENT', 'SDGODA01', 'SDGODAWS', 'SDGPM25', 'SDGPOISON',
       'SDGSUICIDE', 'SDGWSHBOD', 'SUD_TREATMENTSERVICES

In [45]:
# Put the columns in the order: geographic info, year, features, and target.
# The three groups are listed separately and concatenated for the selection.
who_geo_year_cols = ['DIM_GEO_NAME', 'DIM_GEO_CODE', 'region', 'sub-region', 'DIM_TIME_YEAR']

who_feature_cols = [
    'AMR_INFECT_ECOLI', 'AMR_INFECT_MRSA',
    'FINPROTECTION_CATA_TOT_10_POP', 'FINPROTECTION_CATA_TOT_25_POP',
    'GHED_GGHE-DGGE_SHA2011', 'GLASSAMC_AWARE',
    'HWF_0001', 'HWF_0006', 'HWF_0010', 'HWF_0014',
    'MALARIA_EST_INCIDENCE', 'MCV2',
    'MDG_0000000003', 'MDG_0000000007', 'MDG_0000000020', 'MDG_0000000025', 'MDG_0000000026',
    'M_Est_tob_curr_std', 'NCDMORT3070', 'NCD_BMI_30A', 'NCD_BMI_PLUS2C', 'NCD_HYP_PREVALENCE_A',
    'NUTOVERWEIGHTPREV', 'NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV', 'NUTRITION_WH_2', 'NUTSTUNTINGPREV',
    'PCV3', 'PHE_HHAIR_PROP_POP_CLEAN_FUELS', 'RS_198', 'SA_0000001688',
    'SDGAIRBODA', 'SDGFPALL', 'SDGHEALTHFACILITIESESSENTIALMEDS', 'SDGHEPHBSAGPRV', 'SDGHIV',
    'SDGHPVRECEIVED', 'SDGIHR2021', 'SDGIPV12M', 'SDGIPVLT', 'SDGNTDTREATMENT',
    'SDGODA01', 'SDGODAWS', 'SDGPM25', 'SDGPOISON', 'SDGSUICIDE', 'SDGWSHBOD',
    'SUD_TREATMENTSERVICES_COVERAGE', 'UHC_INDEX_REPORTED', 'VACCINEPREVENTABLE_WILDPOLIO',
    'VIOLENCE_HOMICIDERATE', 'WHOSIS_000003', 'WHS4_100',
    'WSH_DOMESTIC_WASTE_SAFELY_TREATED', 'WSH_HYGIENE_BASIC',
    'WSH_SANITATION_SAFELY_MANAGED', 'WSH_WATER_SAFELY_MANAGED',
]

who_target_col = 'Healthy life expectancy at birth'

who_country_nan_ml_df = who_country_nan_ml_df[who_geo_year_cols + who_feature_cols + [who_target_col]]
who_country_nan_ml_df

Unnamed: 0,DIM_GEO_NAME,DIM_GEO_CODE,region,sub-region,DIM_TIME_YEAR,AMR_INFECT_ECOLI,AMR_INFECT_MRSA,FINPROTECTION_CATA_TOT_10_POP,FINPROTECTION_CATA_TOT_25_POP,GHED_GGHE-DGGE_SHA2011,...,UHC_INDEX_REPORTED,VACCINEPREVENTABLE_WILDPOLIO,VIOLENCE_HOMICIDERATE,WHOSIS_000003,WHS4_100,WSH_DOMESTIC_WASTE_SAFELY_TREATED,WSH_HYGIENE_BASIC,WSH_SANITATION_SAFELY_MANAGED,WSH_WATER_SAFELY_MANAGED,Healthy life expectancy at birth
0,Afghanistan,AFG,Asia,Southern Asia,2016,,,,,,...,,,,,,,,,,52.924999
1,Afghanistan,AFG,Asia,Southern Asia,2018,,,,,,...,,,,,,,,,,53.575001
2,Afghanistan,AFG,Asia,Southern Asia,2019,,,,,,...,,,,,,,,,,53.900002
4,Afghanistan,AFG,Asia,Southern Asia,2021,,,,,4.05,...,40.884609,,8.378230,,,,,,,54.549999
5,Afghanistan,AFG,Asia,Southern Asia,2022,,,,,,...,,,,35.489811,69.0,,48.214699,,30.03410,54.875000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1358,Zimbabwe,ZWE,Africa,Sub-Saharan Africa,2019,,,,,,...,,,,,,,,,,53.099998
1359,Zimbabwe,ZWE,Africa,Sub-Saharan Africa,2020,,,,,,...,,,,,,,,,,53.575001
1360,Zimbabwe,ZWE,Africa,Sub-Saharan Africa,2021,,,,,5.21,...,55.042839,,12.219641,,,,,,,54.049999
1361,Zimbabwe,ZWE,Africa,Sub-Saharan Africa,2022,,,,,,...,,,,24.264170,90.0,54.782791,42.456089,31.806431,26.51643,54.525002


In [46]:
# Save the WHO ML dataset that still contains NaN values to CSV
# (column header row is written, the DataFrame index is not)
who_country_nan_ml_df.to_csv(
    "Datasets_ML/WHO_country_ML_wNaN.csv",
    header=True,
    index=False,
)

In [47]:
# Build the variant of the dataset WITHOUT NaN: every missing value in the frame
# is replaced with -1. A new DataFrame is returned; the NaN version is left as is.
who_country_no_nan_ml_df = who_country_nan_ml_df.fillna(value=-1)

In [48]:
# Save the WHO ML dataset WITHOUT NaN (missing values already replaced) to CSV
# (column header row is written, the DataFrame index is not)
who_country_no_nan_ml_df.to_csv(
    "Datasets_ML/WHO_country_ML.csv",
    header=True,
    index=False,
)

## Do the ETL process on the World Happiness Report Data
1. Get the World Happiness DF for the ML model process including region and subregion

In [49]:
# Preview the World Happiness DF that was defined in an earlier cell
# (nothing is re-read from disk here; this only displays the first rows)
world_happiness_df.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.164055,0.881686,0.414297,0.258195
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.187297,0.850035,0.481421,0.237092
2,Afghanistan,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.117861,0.706766,0.516907,0.275324
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.160098,0.731109,0.479835,0.267175
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.234157,0.77562,0.613513,0.267919


In [50]:
# Attach the country_regions_df columns to every World Happiness row.
# Left join: all happiness rows are kept, matched on "Country name" == "name".
world_happiness_ml_df = world_happiness_df.merge(
    country_regions_df,
    how="left",
    left_on="Country name",
    right_on="name",
)
world_happiness_ml_df

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,name,alpha-3,region,sub-region
0,Afghanistan,2008,3.723590,7.350416,0.450662,50.500000,0.718114,0.164055,0.881686,0.414297,0.258195,Afghanistan,AFG,Asia,Southern Asia
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.187297,0.850035,0.481421,0.237092,Afghanistan,AFG,Asia,Southern Asia
2,Afghanistan,2010,4.758381,7.613900,0.539075,51.099998,0.600127,0.117861,0.706766,0.516907,0.275324,Afghanistan,AFG,Asia,Southern Asia
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.160098,0.731109,0.479835,0.267175,Afghanistan,AFG,Asia,Southern Asia
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.234157,0.775620,0.613513,0.267919,Afghanistan,AFG,Asia,Southern Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2358,Zimbabwe,2019,2.693523,7.697755,0.759162,53.099998,0.631908,-0.050874,0.830652,0.658434,0.235354,Zimbabwe,ZWE,Africa,Sub-Saharan Africa
2359,Zimbabwe,2020,3.159802,7.596050,0.717243,53.575001,0.643303,0.002848,0.788523,0.660658,0.345736,Zimbabwe,ZWE,Africa,Sub-Saharan Africa
2360,Zimbabwe,2021,3.154578,7.656878,0.685151,54.049999,0.667636,-0.079007,0.756945,0.609917,0.241682,Zimbabwe,ZWE,Africa,Sub-Saharan Africa
2361,Zimbabwe,2022,3.296220,7.670073,0.666172,54.525002,0.651987,-0.072935,0.752632,0.640609,0.191350,Zimbabwe,ZWE,Africa,Sub-Saharan Africa


In [51]:
# Check for rows whose "name" is NaN, i.e. rows with no "name" value after the merge.
# Listing their "Country name" values reveals countries with a spelling mismatch;
# an empty list means every country was matched.
nan4 = world_happiness_ml_df[world_happiness_ml_df["name"].isna()]
print(f'Unique NaN Country name: {nan4["Country name"].unique()}')

Unique NaN Country name: []


In [52]:
# Remove the "name" column, which repeats the country already held in "Country name"
world_happiness_ml_df = world_happiness_ml_df.drop(labels="name", axis="columns")
world_happiness_ml_df

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,alpha-3,region,sub-region
0,Afghanistan,2008,3.723590,7.350416,0.450662,50.500000,0.718114,0.164055,0.881686,0.414297,0.258195,AFG,Asia,Southern Asia
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.187297,0.850035,0.481421,0.237092,AFG,Asia,Southern Asia
2,Afghanistan,2010,4.758381,7.613900,0.539075,51.099998,0.600127,0.117861,0.706766,0.516907,0.275324,AFG,Asia,Southern Asia
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.160098,0.731109,0.479835,0.267175,AFG,Asia,Southern Asia
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.234157,0.775620,0.613513,0.267919,AFG,Asia,Southern Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2358,Zimbabwe,2019,2.693523,7.697755,0.759162,53.099998,0.631908,-0.050874,0.830652,0.658434,0.235354,ZWE,Africa,Sub-Saharan Africa
2359,Zimbabwe,2020,3.159802,7.596050,0.717243,53.575001,0.643303,0.002848,0.788523,0.660658,0.345736,ZWE,Africa,Sub-Saharan Africa
2360,Zimbabwe,2021,3.154578,7.656878,0.685151,54.049999,0.667636,-0.079007,0.756945,0.609917,0.241682,ZWE,Africa,Sub-Saharan Africa
2361,Zimbabwe,2022,3.296220,7.670073,0.666172,54.525002,0.651987,-0.072935,0.752632,0.640609,0.191350,ZWE,Africa,Sub-Saharan Africa


In [53]:
# Show the current column names (reference for the hard-coded column order in the next cell)
world_happiness_ml_df.columns

Index(['Country name', 'year', 'Life Ladder', 'Log GDP per capita',
       'Social support', 'Healthy life expectancy at birth',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Positive affect', 'Negative affect',
       'alpha-3', 'region', 'sub-region'],
      dtype='object')

In [54]:
# Put the columns in the order: geographic info, year, features, and target.
# The three groups are listed separately and concatenated for the selection.
happiness_geo_year_cols = ['Country name', 'alpha-3', 'region', 'sub-region', 'year']

happiness_feature_cols = [
    'Log GDP per capita',
    'Social support',
    'Healthy life expectancy at birth',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'Positive affect',
    'Negative affect',
]

happiness_target_col = 'Life Ladder'

world_happiness_ml_df = world_happiness_ml_df[
    happiness_geo_year_cols + happiness_feature_cols + [happiness_target_col]
]
world_happiness_ml_df

Unnamed: 0,Country name,alpha-3,region,sub-region,year,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Life Ladder
0,Afghanistan,AFG,Asia,Southern Asia,2008,7.350416,0.450662,50.500000,0.718114,0.164055,0.881686,0.414297,0.258195,3.723590
1,Afghanistan,AFG,Asia,Southern Asia,2009,7.508646,0.552308,50.799999,0.678896,0.187297,0.850035,0.481421,0.237092,4.401778
2,Afghanistan,AFG,Asia,Southern Asia,2010,7.613900,0.539075,51.099998,0.600127,0.117861,0.706766,0.516907,0.275324,4.758381
3,Afghanistan,AFG,Asia,Southern Asia,2011,7.581259,0.521104,51.400002,0.495901,0.160098,0.731109,0.479835,0.267175,3.831719
4,Afghanistan,AFG,Asia,Southern Asia,2012,7.660506,0.520637,51.700001,0.530935,0.234157,0.775620,0.613513,0.267919,3.782938
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2358,Zimbabwe,ZWE,Africa,Sub-Saharan Africa,2019,7.697755,0.759162,53.099998,0.631908,-0.050874,0.830652,0.658434,0.235354,2.693523
2359,Zimbabwe,ZWE,Africa,Sub-Saharan Africa,2020,7.596050,0.717243,53.575001,0.643303,0.002848,0.788523,0.660658,0.345736,3.159802
2360,Zimbabwe,ZWE,Africa,Sub-Saharan Africa,2021,7.656878,0.685151,54.049999,0.667636,-0.079007,0.756945,0.609917,0.241682,3.154578
2361,Zimbabwe,ZWE,Africa,Sub-Saharan Africa,2022,7.670073,0.666172,54.525002,0.651987,-0.072935,0.752632,0.640609,0.191350,3.296220


In [55]:
# Save the World Happiness ML dataset to CSV
# (column header row is written, the DataFrame index is not)
world_happiness_ml_df.to_csv(
    "Datasets_ML/World_Happiness_ML.csv",
    header=True,
    index=False,
)