# Data Cleaning

## 1. Loading the Data

In [1362]:
import pandas as pd
import numpy as np

In [1363]:
forest_area = pd.read_csv("dataset/Forest Area.csv")
freshwater = pd.read_csv("dataset/Freshwater.csv", skiprows=3)
gdp_per_capita = pd.read_csv("dataset/gdp_per_capita.csv")
governance = pd.read_csv("dataset/Governance.csv")
human_development_index = pd.read_csv("dataset/human-development-index.csv")
sustainable_energy = pd.read_csv("dataset/Sustainable energy for all.csv", skiprows=3)
emissions = pd.read_csv("dataset/trends_in_greenhouse_gas_emissions.csv", skiprows=3)

## 2. Choosing Relevant Variables

This is done based on the theoretical framework. As the datasets I have mostly encompasses a wide range of different variables, I will be omitting the ones that are not used and also take the most recent data only.

### Forest Area - 2020 Forest Area

In [1364]:
# Forest area
forest_area_filtered = forest_area[['Country and Area', 'Forest Area, 2020 (1000 ha)']]
forest_area_filtered = forest_area_filtered.rename(columns={'Country and Area': 'Country'})
forest_area_filtered.head()

Unnamed: 0,Country,"Forest Area, 2020 (1000 ha)"
0,WORLD,4058930.81
1,Afghanistan,1208.44
2,Albania,788.9
3,Algeria,1949.0
4,American Samoa,17.13


### Freshwater

In [1365]:
freshwater_filtered = freshwater[['Country', 'Internal renewable freshwater resources flows 2020']]
freshwater_filtered.head()

Unnamed: 0,Country,Internal renewable freshwater resources flows 2020
0,Afghanistan,47.2
1,Albania,26.9
2,Algeria,11.2
3,American Samoa,..
4,Andorra,0.3


### GDP per Capita

In [1366]:
gdp_per_capita_filtered = gdp_per_capita[['Country Name', '2021 [YR2021]']]
gdp_per_capita_filtered = gdp_per_capita_filtered.rename(columns={'Country Name': 'Country'})
gdp_per_capita_filtered = gdp_per_capita_filtered.rename(columns={'2021 [YR2021]': 'GDP per Capita 2021'})
gdp_per_capita_filtered.head()

Unnamed: 0,Country,GDP per Capita 2021
0,Afghanistan,355.777826392648
1,Albania,6377.20309553753
2,Algeria,3700.31469728198
3,American Samoa,16653.7137781725
4,Andorra,42072.3194231234


### Governance

In [1367]:
governance_filtered = governance[['Country and area', 'Paris Agreement', 'UN Framework Convention on Climate Change']]
governance_filtered = governance_filtered.rename(columns={'Country and area': 'Country'})
governance_filtered.head()

Unnamed: 0,Country,Paris Agreement,UN Framework Convention on Climate Change
0,Afghanistan,2017,2002
1,Albania,2016,1994
2,Algeria,2016,1993
3,Andorra,2017,2011
4,Angola,...,2000


### Human Development Index

In [1368]:
human_development_index_filtered = human_development_index.loc[human_development_index['Year'] == 2022]
human_development_index_filtered = human_development_index_filtered[['Entity', 'Human Development Index']]
human_development_index_filtered = human_development_index_filtered.rename(columns={'Entity': 'Country'})
human_development_index_filtered.head()

Unnamed: 0,Country,Human Development Index
32,Afghanistan,0.462
65,Albania,0.789
98,Algeria,0.745
121,Andorra,0.884
145,Angola,0.591


### Sustainable Energy

In [1369]:
sustainable_energy_filtered = sustainable_energy[['Country', 'Access to clean fuels and technologies for cooking 2021', 'Renewable energy consumption 2020']]
sustainable_energy_filtered.head()

Unnamed: 0,Country,Access to clean fuels and technologies for cooking 2021,Renewable energy consumption 2020
0,Afghanistan,35.4,17.6
1,Albania,83.7,44.6
2,Algeria,99.7,0.2
3,American Samoa,..,0.5
4,Andorra,100,21.9


### Emissions

In [1370]:
emissions_filtered = emissions[['Country', 'Carbon dioxide emissions 2020', 'Methane emissions 2020', 'Nitrous oxide emissions 2020']]
emissions_filtered.head()

Unnamed: 0,Country,Carbon dioxide emissions 2020,Methane emissions 2020,Nitrous oxide emissions 2020
0,Afghanistan,8709.47,16222.03596,4863.386801
1,Albania,4383.2,2692.195886,1015.186729
2,Algeria,161563,86543.92362,12578.7476
3,American Samoa,..,..,..
4,Andorra,448.884399,53.600959,1.758811


## 3. Merge Datasets

Remove unnecessary rows after last country (Zimbabwe) for Freshwater, Sustainable Energy, and Emissions dataset

In [1371]:
freshwater_filtered = freshwater_filtered.loc[:freshwater_filtered[freshwater_filtered['Country'] == 'Zimbabwe'].index[0]]
freshwater_filtered

Unnamed: 0,Country,Internal renewable freshwater resources flows 2020
0,Afghanistan,47.2
1,Albania,26.9
2,Algeria,11.2
3,American Samoa,..
4,Andorra,0.3
...,...,...
209,Virgin Islands (U.S.),..
210,West Bank and Gaza,0.8
211,"Yemen, Rep.",2.1
212,Zambia,80.2


In [1372]:
sustainable_energy_filtered = sustainable_energy_filtered.loc[:sustainable_energy_filtered[sustainable_energy_filtered['Country'] == 'Zimbabwe'].index[0]]
sustainable_energy_filtered

Unnamed: 0,Country,Access to clean fuels and technologies for cooking 2021,Renewable energy consumption 2020
0,Afghanistan,35.4,17.6
1,Albania,83.7,44.6
2,Algeria,99.7,0.2
3,American Samoa,..,0.5
4,Andorra,100,21.9
...,...,...,...
209,Virgin Islands (U.S.),..,5.1
210,West Bank and Gaza,..,15
211,"Yemen, Rep.",61.3,3.5
212,Zambia,10.2,81.8


In [1373]:
emissions_filtered = emissions_filtered.loc[:emissions_filtered[emissions_filtered['Country'] == 'Zimbabwe'].index[0]]
emissions_filtered

Unnamed: 0,Country,Carbon dioxide emissions 2020,Methane emissions 2020,Nitrous oxide emissions 2020
0,Afghanistan,8709.47,16222.03596,4863.386801
1,Albania,4383.2,2692.195886,1015.186729
2,Algeria,161563,86543.92362,12578.7476
3,American Samoa,..,..,..
4,Andorra,448.884399,53.600959,1.758811
...,...,...,...,...
209,Virgin Islands (U.S.),..,..,..
210,West Bank and Gaza,..,..,..
211,"Yemen, Rep.",9960.1,10542.15376,3364.858401
212,Zambia,7607.1,15448.75233,12363.49464


Get standardised country names map to merge data into one CSV file as each datasets might have different conventions on naming the countries. 

I did this with the help of GPT-4, but it was not entirely accurate so I also had to input some of the key-value pairs manually.

In [1374]:
country_name_mapping = {
    "Bahamas, The": "Bahamas",
    "Bolivia": "Bolivia (Plurinational State of)",
    "Brunei": "Brunei Darussalam",
    "Congo, Dem. Rep.": "Democratic Republic of the Congo",
    "Congo, Democratic Republic of the": "Democratic Republic of the Congo",
    "The Democratic Republic of the Congo": "Democratic Republic of the Congo",
    "Democratic Republic of Congo": "Democratic Republic of the Congo",
    "Congo, Rep.": "Congo",
    "Congo, Republic of": "Congo",
    "Cape Verde": "Cabo Verde",
    "Cote d'Ivoire": "Côte d’Ivoire",
    "Ivory Coast": "Côte d’Ivoire",
    "Czech Republic": "Czechia",
    "Gambia, The": "Gambia",
    "Korea, Rep.": "Republic of Korea",
    "Korea, Republic of": "Republic of Korea",
    "South Korea": "Republic of Korea",
    "Korea, Dem. People's Rep.": "Democratic People's Republic of Korea",
    "North Korea": "Democratic People's Republic of Korea",
    "Iran, Islamic Rep.": "Iran (Islamic Republic of)",
    "Iran": "Iran (Islamic Republic of)",
    "Lao PDR": "Lao People's Democratic Republic",
    "Lao": "Lao People's Democratic Republic",
    "Laos": "Lao People's Democratic Republic",
    "Micronesia, Fed. Sts.": "Micronesia (Federated States of)",
    "Micronesia": "Micronesia (Federated States of)",
    "Micronesia (country)": "Micronesia (Federated States of)",
    "Moldova": "Republic of Moldova",
    "Palestine, State of": "State of Palestine",
    "St. Kitts and Nevis": "Saint Kitts and Nevis",
    "St. Lucia": "Saint Lucia",
    "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines",
    "Turkiye": "Turkey",
    "Tanzania": "United Republic of Tanzania",
    "Venezuela, RB": "Venezuela",
    "Yemen, Rep.": "Yemen",
    "Syria": "Syrian Arab Republic",
    "Egypt, Arab Rep.": "Egypt",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Slovak Republic": "Slovakia",
    "Russia": "Russian Federation",
    "United Kingdom": "United Kingdom of Great Britain and Northern Ireland",
    "United States": "United States of America",
    "East Timor": "Timor-Leste",
    "Vietnam": "Viet Nam",
    "Venezuela": "Venezuela (Bolivarian Republic of)",
    "Venezuela, RB": "Venezuela (Bolivarian Republic of)",
    "West Bank and Gaza": "State of Palestine",
    "Palestine": "State of Palestine",
}

In [1375]:
print("Forest Area: ", len(forest_area_filtered))
print("Freshwater: ", len(freshwater_filtered))
print("GDP per Capita: ", len(gdp_per_capita_filtered))
print("Governance: ", len(governance_filtered))
print("HDI: ", len(human_development_index_filtered))
print("Sustainable Energy: ", len(sustainable_energy_filtered))
print("Emissions: ", len(emissions_filtered))

Forest Area:  237
Freshwater:  214
GDP per Capita:  271
Governance:  194
HDI:  204
Sustainable Energy:  214
Emissions:  214


As the Governance dataset has the least amount of countries, I'm going to use countries from the Governance dataset as the base country list for the merged data.

In [1376]:
countries_list = governance_filtered['Country'].unique()
countries_list

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo', 'Costa Rica', 'Côte d’Ivoire',
       'Croatia', 'Cuba', 'Cyprus', 'Czechia',
       "Democratic People's Republic of Korea",
       'Democratic Republic of the Congo', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon',
       'Gambia', 'Georgia', 'G

Map the country names from other datasets to match the country name in the Governance dataset.

In [1377]:
forest_area_filtered.loc[:, 'Country'] = forest_area_filtered['Country'].replace(country_name_mapping)
forest_area_filtered = forest_area_filtered[forest_area_filtered['Country'].isin(countries_list)]
len(forest_area_filtered)

194

In [1378]:
freshwater_filtered.loc[:, 'Country'] = freshwater_filtered['Country'].replace(country_name_mapping)
freshwater_filtered = freshwater_filtered[freshwater_filtered['Country'].isin(countries_list)]
len(freshwater_filtered)

193

In [1379]:
gdp_per_capita_filtered.loc[:, 'Country'] = gdp_per_capita_filtered['Country'].replace(country_name_mapping)
gdp_per_capita_filtered = gdp_per_capita_filtered[gdp_per_capita_filtered['Country'].isin(countries_list)]
len(gdp_per_capita_filtered)

194

In [1380]:
human_development_index_filtered.loc[:, 'Country'] = human_development_index_filtered['Country'].replace(country_name_mapping)
human_development_index_filtered = human_development_index_filtered[human_development_index_filtered['Country'].isin(countries_list)]
len(human_development_index_filtered)

192

In [1381]:
sustainable_energy_filtered.loc[:, 'Country'] = sustainable_energy_filtered['Country'].replace(country_name_mapping)
sustainable_energy_filtered = sustainable_energy_filtered[sustainable_energy_filtered['Country'].isin(countries_list)]
len(sustainable_energy_filtered)

193

In [1382]:
emissions_filtered.loc[:, 'Country'] = emissions_filtered['Country'].replace(country_name_mapping)
emissions_filtered = emissions_filtered[emissions_filtered['Country'].isin(countries_list)]
len(emissions_filtered)

193

In [1383]:
freshwater_countries = freshwater_filtered['Country'].unique()
missing_countries = list(set(countries_list) - set(freshwater_countries))
missing_countries

['Nauru']

Input missing country rows with null value to the dataset

In [1384]:
nauru_df = pd.DataFrame([['Nauru', np.nan]], columns=['Country', 'Internal renewable freshwater resources flows 2020'])
freshwater_filtered = pd.concat([freshwater_filtered, nauru_df], ignore_index=True)

In [1385]:
sustainable_energy_countries = sustainable_energy_filtered['Country'].unique()
missing_countries = list(set(countries_list) - set(sustainable_energy_countries))
missing_countries

['Nauru']

In [1386]:
nauru_df = pd.DataFrame([['Nauru', np.nan, np.nan]], columns=['Country', 'Access to clean fuels and technologies for cooking 2021', 'Renewable energy consumption 2020'])
sustainable_energy_filtered = pd.concat([sustainable_energy_filtered, nauru_df], ignore_index=True)

In [1387]:
emissions_countries = emissions_filtered['Country'].unique()
missing_countries = list(set(countries_list) - set(emissions_countries))
missing_countries

['Nauru']

In [1388]:
nauru_df = pd.DataFrame([['Nauru', np.nan, np.nan, np.nan]], columns=['Country', 'Carbon dioxide emissions 2020', 'Methane emissions 2020', 'Nitrous oxide emissions 2020'])
emissions_filtered = pd.concat([emissions_filtered, nauru_df], ignore_index=True)

In [1389]:
hdi_countries = human_development_index_filtered['Country'].unique()
missing_countries = list(set(countries_list) - set(hdi_countries))
missing_countries

["Democratic People's Republic of Korea", 'Monaco']

In [1390]:
monaco_df = pd.DataFrame([['Monaco', np.nan]], columns=['Country', 'Human Development Index'])
dprk_df = pd.DataFrame([["Democratic People's Republic of Korea", np.nan]], columns=['Country', 'Human Development Index'])
human_development_index_filtered = pd.concat([human_development_index_filtered, monaco_df, dprk_df], ignore_index=True)

So in conclusion, I am going to have a total of 194 countries included in the data after inputting the missing countries for each dataset.

In [1391]:
from functools import reduce

In [1392]:
dfs = [forest_area_filtered, freshwater_filtered, gdp_per_capita_filtered, governance_filtered, human_development_index_filtered, sustainable_energy_filtered, emissions_filtered]
merge_dfs = lambda left, right: pd.merge(left, right, on='Country', how='inner')
lcdci_df = reduce(merge_dfs, dfs)
lcdci_df.head()

Unnamed: 0,Country,"Forest Area, 2020 (1000 ha)",Internal renewable freshwater resources flows 2020,GDP per Capita 2021,Paris Agreement,UN Framework Convention on Climate Change,Human Development Index,Access to clean fuels and technologies for cooking 2021,Renewable energy consumption 2020,Carbon dioxide emissions 2020,Methane emissions 2020,Nitrous oxide emissions 2020
0,Afghanistan,1208.44,47.2,355.777826392648,2017,2002,0.462,35.4,17.6,8709.47,16222.03596,4863.386801
1,Albania,788.9,26.9,6377.20309553753,2016,1994,0.789,83.7,44.6,4383.2,2692.195886,1015.186729
2,Algeria,1949.0,11.2,3700.31469728198,2016,1993,0.745,99.7,0.2,161563.0,86543.92362,12578.7476
3,Andorra,16.0,0.3,42072.3194231234,2017,2011,0.884,100.0,21.9,448.884399,53.600959,1.758811
4,Angola,66607.38,148.0,1927.47407832091,...,2000,0.591,50.0,61.0,19814.5,32703.23811,17757.86784


Rename column headings for consistency

In [1393]:
lcdci_df = lcdci_df.rename(columns={'Country': 'country'})
lcdci_df = lcdci_df.rename(columns={'Forest Area, 2020 (1000 ha)': 'forest_area'})
lcdci_df = lcdci_df.rename(columns={'Internal renewable freshwater resources flows 2020': 'freshwater'})
lcdci_df = lcdci_df.rename(columns={'GDP per Capita 2021': 'gdp_per_capita'})
lcdci_df = lcdci_df.rename(columns={'Paris Agreement': 'paris_agreement'})
lcdci_df = lcdci_df.rename(columns={'UN Framework Convention on Climate Change': 'unfccc'})
lcdci_df = lcdci_df.rename(columns={'Human Development Index': 'hdi'})
lcdci_df = lcdci_df.rename(columns={'Access to clean fuels and technologies for cooking 2021': 'cooking_clean_fuel_and_technologies_access'})
lcdci_df = lcdci_df.rename(columns={'Renewable energy consumption 2020': 'renewable_energy_consumption'})
lcdci_df = lcdci_df.rename(columns={'Carbon dioxide emissions 2020': 'co2_emissions'})
lcdci_df = lcdci_df.rename(columns={'Methane emissions 2020': 'methane_emissions'})
lcdci_df = lcdci_df.rename(columns={'Nitrous oxide emissions 2020': 'nitrous_oxide_emissions'})


## 4. Handling Missing Values

In [1394]:
lcdci_df.dtypes

country                                        object
forest_area                                   float64
freshwater                                     object
gdp_per_capita                                 object
paris_agreement                                object
unfccc                                          int64
hdi                                           float64
cooking_clean_fuel_and_technologies_access     object
renewable_energy_consumption                   object
co2_emissions                                  object
methane_emissions                              object
nitrous_oxide_emissions                        object
dtype: object

Handle missing values in columns by marking it as NaN and convert to suitable type

In [1395]:
lcdci_df["freshwater"] = lcdci_df["freshwater"].replace("..", np.nan)
lcdci_df["freshwater"] = lcdci_df["freshwater"].astype(float)

In [1396]:
lcdci_df["gdp_per_capita"] = lcdci_df["gdp_per_capita"].replace("..", np.nan)
lcdci_df["gdp_per_capita"] = lcdci_df["gdp_per_capita"].astype(float)

* One hot encode the Paris Agreement (0: not agreed, 1: agreed)

In [1397]:
lcdci_df["paris_agreement"] = lcdci_df["paris_agreement"].replace("..", 0) 
lcdci_df["paris_agreement"] = lcdci_df["paris_agreement"].apply(lambda x: 1 if x != 0 else 0) 

In [1398]:
lcdci_df["cooking_clean_fuel_and_technologies_access"] = lcdci_df["cooking_clean_fuel_and_technologies_access"].replace("..", np.nan)
lcdci_df["cooking_clean_fuel_and_technologies_access"] = lcdci_df["cooking_clean_fuel_and_technologies_access"].astype(float)

In [1399]:
lcdci_df["renewable_energy_consumption"] = lcdci_df["renewable_energy_consumption"].replace("..", np.nan)
lcdci_df["renewable_energy_consumption"] = lcdci_df["renewable_energy_consumption"].astype(float)

In [1400]:
lcdci_df["co2_emissions"] = lcdci_df["co2_emissions"].replace("..", np.nan)
lcdci_df["co2_emissions"] = lcdci_df["co2_emissions"].astype(float)

In [1401]:
lcdci_df["methane_emissions"] = lcdci_df["methane_emissions"].replace("..", np.nan)
lcdci_df["methane_emissions"] = lcdci_df["methane_emissions"].astype(float)

In [1402]:
lcdci_df["nitrous_oxide_emissions"] = lcdci_df["nitrous_oxide_emissions"].replace("..", np.nan)
lcdci_df["nitrous_oxide_emissions"] = lcdci_df["nitrous_oxide_emissions"].astype(float)

In [1403]:
lcdci_df.isnull().sum()

country                                        0
forest_area                                    0
freshwater                                    13
gdp_per_capita                                 5
paris_agreement                                0
unfccc                                         0
hdi                                            2
cooking_clean_fuel_and_technologies_access     6
renewable_energy_consumption                   3
co2_emissions                                  4
methane_emissions                              4
nitrous_oxide_emissions                        4
dtype: int64

In [1404]:
lcdci_df.dtypes

country                                        object
forest_area                                   float64
freshwater                                    float64
gdp_per_capita                                float64
paris_agreement                                 int64
unfccc                                          int64
hdi                                           float64
cooking_clean_fuel_and_technologies_access    float64
renewable_energy_consumption                  float64
co2_emissions                                 float64
methane_emissions                             float64
nitrous_oxide_emissions                       float64
dtype: object

In [1405]:
# Filter rows where freshwater is empty
freshwater_missing = lcdci_df[lcdci_df['freshwater'].isnull()]
freshwater_missing


Unnamed: 0,country,forest_area,freshwater,gdp_per_capita,paris_agreement,unfccc,hdi,cooking_clean_fuel_and_technologies_access,renewable_energy_consumption,co2_emissions,methane_emissions,nitrous_oxide_emissions
89,Kiribati,1.18,,1766.144289,1,1995,0.628,12.4,42.8,56.9,23.209951,5.124415
98,Liechtenstein,6.7,,197504.548936,1,1994,0.942,,55.2,141.996093,2.588257,0.870988
107,Marshall Islands,9.4,,6130.437137,1,1992,0.731,66.7,12.0,110.0,31.91103,0.699535
111,Micronesia (Federated States of),64.42,,3588.759933,1,1993,0.634,13.3,2.0,107.5,52.686594,27.549723
112,Monaco,0.0,,235132.784182,1,1992,,100.0,,,,
114,Montenegro,827.0,,9465.96153,1,2006,0.844,62.0,39.6,2527.2,783.237121,150.663183
119,Nauru,0.0,,11632.692502,1,1993,0.696,,,,,
130,Palau,41.41,,12921.827321,1,1999,0.797,43.0,0.9,158.2,19.526302,0.0
147,Samoa,161.67,,3857.689742,1,1994,0.702,37.2,37.5,206.5,267.323935,50.494426
148,San Marino,1.0,,54982.451715,1,1994,0.867,100.0,,,,


In [1406]:
hdi_missing = lcdci_df[lcdci_df['hdi'].isnull()]
hdi_missing

Unnamed: 0,country,forest_area,freshwater,gdp_per_capita,paris_agreement,unfccc,hdi,cooking_clean_fuel_and_technologies_access,renewable_energy_consumption,co2_emissions,methane_emissions,nitrous_oxide_emissions
45,Democratic People's Republic of Korea,6030.09,67.0,,1,1994,,12.5,12.7,52437.2,18145.48138,2590.047636
112,Monaco,0.0,,235132.784182,1,1992,,100.0,,,,


In [1407]:
clean_fuel_missing = lcdci_df[lcdci_df['cooking_clean_fuel_and_technologies_access'].isnull()]
clean_fuel_missing

Unnamed: 0,country,forest_area,freshwater,gdp_per_capita,paris_agreement,unfccc,hdi,cooking_clean_fuel_and_technologies_access,renewable_energy_consumption,co2_emissions,methane_emissions,nitrous_oxide_emissions
25,Bulgaria,3893.0,21.0,12219.341871,1,1995,0.799,,21.1,34138.1,6980.496622,4328.729643
94,Lebanon,143.33,4.8,4136.146575,1,1994,0.723,,6.7,21474.9,3459.013432,846.613935
97,Libya,217.0,0.7,5908.951323,1,1999,0.746,,3.1,44467.0,24033.12005,1797.652497
98,Liechtenstein,6.7,,197504.548936,1,1994,0.942,,55.2,141.996093,2.588257,0.870988
119,Nauru,0.0,,11632.692502,1,1993,0.696,,,,,
164,State of Palestine,10.14,0.8,3678.635657,1,2015,0.716,,15.0,,,


In [1408]:
renewable_energy_missing = lcdci_df[lcdci_df['renewable_energy_consumption'].isnull()]
renewable_energy_missing

Unnamed: 0,country,forest_area,freshwater,gdp_per_capita,paris_agreement,unfccc,hdi,cooking_clean_fuel_and_technologies_access,renewable_energy_consumption,co2_emissions,methane_emissions,nitrous_oxide_emissions
112,Monaco,0.0,,235132.784182,1,1992,,100.0,,,,
119,Nauru,0.0,,11632.692502,1,1993,0.696,,,,,
148,San Marino,1.0,,54982.451715,1,1994,0.867,100.0,,,,


In [1409]:
co2_missing = lcdci_df[lcdci_df['co2_emissions'].isnull()]
co2_missing

Unnamed: 0,country,forest_area,freshwater,gdp_per_capita,paris_agreement,unfccc,hdi,cooking_clean_fuel_and_technologies_access,renewable_energy_consumption,co2_emissions,methane_emissions,nitrous_oxide_emissions
112,Monaco,0.0,,235132.784182,1,1992,,100.0,,,,
119,Nauru,0.0,,11632.692502,1,1993,0.696,,,,,
148,San Marino,1.0,,54982.451715,1,1994,0.867,100.0,,,,
164,State of Palestine,10.14,0.8,3678.635657,1,2015,0.716,,15.0,,,


In [1410]:
methane_missing = lcdci_df[lcdci_df['methane_emissions'].isnull()]
methane_missing

Unnamed: 0,country,forest_area,freshwater,gdp_per_capita,paris_agreement,unfccc,hdi,cooking_clean_fuel_and_technologies_access,renewable_energy_consumption,co2_emissions,methane_emissions,nitrous_oxide_emissions
112,Monaco,0.0,,235132.784182,1,1992,,100.0,,,,
119,Nauru,0.0,,11632.692502,1,1993,0.696,,,,,
148,San Marino,1.0,,54982.451715,1,1994,0.867,100.0,,,,
164,State of Palestine,10.14,0.8,3678.635657,1,2015,0.716,,15.0,,,


In [1411]:
nitrous_oxide_missing = lcdci_df[lcdci_df['nitrous_oxide_emissions'].isnull()]
nitrous_oxide_missing

Unnamed: 0,country,forest_area,freshwater,gdp_per_capita,paris_agreement,unfccc,hdi,cooking_clean_fuel_and_technologies_access,renewable_energy_consumption,co2_emissions,methane_emissions,nitrous_oxide_emissions
112,Monaco,0.0,,235132.784182,1,1992,,100.0,,,,
119,Nauru,0.0,,11632.692502,1,1993,0.696,,,,,
148,San Marino,1.0,,54982.451715,1,1994,0.867,100.0,,,,
164,State of Palestine,10.14,0.8,3678.635657,1,2015,0.716,,15.0,,,


### Imputation of Missing Values

The initial GDP dataset obtained from the World Bank contained missing values for certain countries in the year 2021. To address this, I located the missing data through the United Nations Statistics Division. I manually imputed these values into the primary dataset to create a more complete picture for my analysis.

In [1412]:
lcdci_df.loc[lcdci_df['country'] == "Cuba", 'gdp_per_capita'] = 11255
lcdci_df.loc[lcdci_df['country'] == "Democratic People's Republic of Korea", 'gdp_per_capita'] = 654
lcdci_df.loc[lcdci_df['country'] == "Eritrea", 'gdp_per_capita'] = 611
lcdci_df.loc[lcdci_df['country'] == "South Sudan", 'gdp_per_capita'] = 399
lcdci_df.loc[lcdci_df['country'] == "Venezuela (Bolivarian Republic of)", 'gdp_per_capita'] = 3967

In [1413]:
lcdci_df.isnull().sum()

country                                        0
forest_area                                    0
freshwater                                    13
gdp_per_capita                                 0
paris_agreement                                0
unfccc                                         0
hdi                                            2
cooking_clean_fuel_and_technologies_access     6
renewable_energy_consumption                   3
co2_emissions                                  4
methane_emissions                              4
nitrous_oxide_emissions                        4
dtype: int64

Remove countries with more than 50% columns missing in crucial variables for the index, which are columns that relates to the environmental aspect

In [1414]:
environmental_related_columns = [
    "cooking_clean_fuel_and_technologies_access", 
    "renewable_energy_consumption", 
    "co2_emissions", 
    "methane_emissions", 
    "nitrous_oxide_emissions",
    "freshwater",
    "forest_area"
]

threshold = 4

missing_counts = lcdci_df[environmental_related_columns].isnull().sum(axis=1)

lcdci_df_filtered = lcdci_df[missing_counts <= threshold] 

dropped_countries = lcdci_df.loc[missing_counts > threshold, 'country'] 
print(f"Dropped countries due to excessive missing data: {list(dropped_countries)}")

lcdci_df_filtered = lcdci_df_filtered.reset_index(drop=True)

lcdci_df_filtered

Dropped countries due to excessive missing data: ['Monaco', 'Nauru', 'San Marino']


Unnamed: 0,country,forest_area,freshwater,gdp_per_capita,paris_agreement,unfccc,hdi,cooking_clean_fuel_and_technologies_access,renewable_energy_consumption,co2_emissions,methane_emissions,nitrous_oxide_emissions
0,Afghanistan,1208.44,47.2,355.777826,1,2002,0.462,35.4,17.6,8709.470000,16222.035960,4863.386801
1,Albania,788.90,26.9,6377.203096,1,1994,0.789,83.7,44.6,4383.200000,2692.195886,1015.186729
2,Algeria,1949.00,11.2,3700.314697,1,1993,0.745,99.7,0.2,161563.000000,86543.923620,12578.747600
3,Andorra,16.00,0.3,42072.319423,1,2011,0.884,100.0,21.9,448.884399,53.600959,1.758811
4,Angola,66607.38,148.0,1927.474078,1,2000,0.591,50.0,61.0,19814.500000,32703.238110,17757.867840
...,...,...,...,...,...,...,...,...,...,...,...,...
186,Venezuela (Bolivarian Republic of),46230.90,805.0,3967.000000,1,1994,0.699,95.5,23.3,72509.000000,97594.234880,13097.219340
187,Viet Nam,14643.09,359.4,3756.488901,1,1994,0.726,96.1,19.1,355323.100000,79619.238030,26832.534680
188,Yemen,549.00,2.1,543.637538,1,1996,0.424,61.3,3.5,9960.100000,10542.153760,3364.858401
189,Zambia,44814.03,80.2,1134.713454,1,1993,0.569,10.2,81.8,7607.100000,15448.752330,12363.494640


For the imputation of missing values, I decided to use the MICE algorithm due to it being a robust, informative method of dealing with missing data in datasets.

In [1415]:
from fancyimpute import IterativeImputer 

In [1416]:
mice_imputer = IterativeImputer() 
lcdci_df_imputed = mice_imputer.fit_transform(lcdci_df_filtered.iloc[:, 1:])
lcdci_df_imputed = pd.DataFrame(lcdci_df_imputed, columns=lcdci_df_filtered.columns[1:])
lcdci_df_imputed.insert(0, "country", lcdci_df_filtered["country"])
lcdci_df_imputed

Unnamed: 0,country,forest_area,freshwater,gdp_per_capita,paris_agreement,unfccc,hdi,cooking_clean_fuel_and_technologies_access,renewable_energy_consumption,co2_emissions,methane_emissions,nitrous_oxide_emissions
0,Afghanistan,1208.44,47.2,355.777826,1.0,2002.0,0.462,35.4,17.6,8709.470000,16222.035960,4863.386801
1,Albania,788.90,26.9,6377.203096,1.0,1994.0,0.789,83.7,44.6,4383.200000,2692.195886,1015.186729
2,Algeria,1949.00,11.2,3700.314697,1.0,1993.0,0.745,99.7,0.2,161563.000000,86543.923620,12578.747600
3,Andorra,16.00,0.3,42072.319423,1.0,2011.0,0.884,100.0,21.9,448.884399,53.600959,1.758811
4,Angola,66607.38,148.0,1927.474078,1.0,2000.0,0.591,50.0,61.0,19814.500000,32703.238110,17757.867840
...,...,...,...,...,...,...,...,...,...,...,...,...
186,Venezuela (Bolivarian Republic of),46230.90,805.0,3967.000000,1.0,1994.0,0.699,95.5,23.3,72509.000000,97594.234880,13097.219340
187,Viet Nam,14643.09,359.4,3756.488901,1.0,1994.0,0.726,96.1,19.1,355323.100000,79619.238030,26832.534680
188,Yemen,549.00,2.1,543.637538,1.0,1996.0,0.424,61.3,3.5,9960.100000,10542.153760,3364.858401
189,Zambia,44814.03,80.2,1134.713454,1.0,1993.0,0.569,10.2,81.8,7607.100000,15448.752330,12363.494640


In [1417]:
differences = lcdci_df_filtered.compare(lcdci_df_imputed)
differences['country'] = lcdci_df_filtered['country']

differences

Unnamed: 0_level_0,freshwater,freshwater,hdi,hdi,cooking_clean_fuel_and_technologies_access,cooking_clean_fuel_and_technologies_access,co2_emissions,co2_emissions,methane_emissions,methane_emissions,nitrous_oxide_emissions,nitrous_oxide_emissions,country
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,self,other,Unnamed: 13_level_1
25,,,,,,73.378171,,,,,,,Bulgaria
45,,,,0.572501,,,,,,,,,Democratic People's Republic of Korea
89,,15.233749,,,,,,,,,,,Kiribati
94,,,,,,79.966644,,,,,,,Lebanon
97,,,,,,82.951056,,,,,,,Libya
98,,36.495599,,,,161.441481,,,,,,,Liechtenstein
107,,15.710683,,,,,,,,,,,Marshall Islands
111,,16.016091,,,,,,,,,,,Micronesia (Federated States of)
113,,21.717717,,,,,,,,,,,Montenegro
128,,16.648813,,,,,,,,,,,Palau


In [1418]:
lcdci_df_imputed.isnull().sum()

country                                       0
forest_area                                   0
freshwater                                    0
gdp_per_capita                                0
paris_agreement                               0
unfccc                                        0
hdi                                           0
cooking_clean_fuel_and_technologies_access    0
renewable_energy_consumption                  0
co2_emissions                                 0
methane_emissions                             0
nitrous_oxide_emissions                       0
dtype: int64