# Data Cleaning

## 1. Loading the Data

In [121]:
import pandas as pd

In [122]:
forest_area = pd.read_csv("dataset/Forest Area.csv")
freshwater = pd.read_csv("dataset/Freshwater.csv", skiprows=3)
gdp_per_capita = pd.read_csv("dataset/gdp_per_capita.csv")
governance = pd.read_csv("dataset/Governance.csv")
human_development_index = pd.read_csv("dataset/Human Development Index.csv")
sustainable_energy = pd.read_csv("dataset/Sustainable energy for all.csv", skiprows=3)
emissions = pd.read_csv("dataset/trends_in_greenhouse_gas_emissions.csv", skiprows=3)

## 2. Choosing Relevant Variables

### Forest Area - 2020 Forest Area

In [123]:
# Forest area
forest_area_filtered = forest_area[['Country and Area', 'Forest Area, 2020 (1000 ha)']]
forest_area_filtered = forest_area_filtered.rename(columns={'Country and Area': 'Country'})
forest_area_filtered.head()

Unnamed: 0,Country,"Forest Area, 2020 (1000 ha)"
0,WORLD,4058930.81
1,Afghanistan,1208.44
2,Albania,788.9
3,Algeria,1949.0
4,American Samoa,17.13


### Freshwater

In [124]:
freshwater_filtered = freshwater[['Country', 'Internal renewable freshwater resources flows 2020']]
freshwater_filtered.head()

Unnamed: 0,Country,Internal renewable freshwater resources flows 2020
0,Afghanistan,47.2
1,Albania,26.9
2,Algeria,11.2
3,American Samoa,..
4,Andorra,0.3


### GDP per Capita

In [125]:
gdp_per_capita_filtered = gdp_per_capita[['Country Name', '2020']]
gdp_per_capita_filtered = gdp_per_capita_filtered.rename(columns={'Country Name': 'Country'})
gdp_per_capita_filtered.head()

Unnamed: 0,Country,2020
0,Aruba,
1,Africa Eastern and Southern,1359.618224
2,Afghanistan,516.747871
3,Africa Western and Central,1710.073363
4,Angola,1776.166868


### Governance

In [126]:
governance_filtered = governance[['Country and area', 'Paris Agreement', 'UN Framework Convention on Climate Change']]
governance_filtered = governance_filtered.rename(columns={'Country and area': 'Country'})
governance_filtered.head()

Unnamed: 0,Country,Paris Agreement,UN Framework Convention on Climate Change
0,Afghanistan,2017,2002
1,Albania,2016,1994
2,Algeria,2016,1993
3,Andorra,2017,2011
4,Angola,...,2000


### Human Development Index

In [127]:
human_development_index_filtered = human_development_index[['Country', 'Human Development Index (2021)']]
human_development_index_filtered.head()

Unnamed: 0,Country,Human Development Index (2021)
0,Afghanistan,0.478
1,Angola,0.586
2,Albania,0.796
3,Andorra,0.858
4,United Arab Emirates,0.911


### Sustainable Energy

In [128]:
sustainable_energy_filtered = sustainable_energy[['Country', 'Access to clean fuels and technologies for cooking 2021', 'Renewable energy consumption 2020']]
sustainable_energy_filtered.head()

Unnamed: 0,Country,Access to clean fuels and technologies for cooking 2021,Renewable energy consumption 2020
0,Afghanistan,35.4,17.6
1,Albania,83.7,44.6
2,Algeria,99.7,0.2
3,American Samoa,..,0.5
4,Andorra,100,21.9


### Emissions

In [129]:
emissions_filtered = emissions[['Country', 'Carbon dioxide emissions 2020', 'Methane emissions 2020', 'Nitrous oxide emissions 2020']]
emissions_filtered.head()

Unnamed: 0,Country,Carbon dioxide emissions 2020,Methane emissions 2020,Nitrous oxide emissions 2020
0,Afghanistan,8709.47,16222.03596,4863.386801
1,Albania,4383.2,2692.195886,1015.186729
2,Algeria,161563,86543.92362,12578.7476
3,American Samoa,..,..,..
4,Andorra,448.884399,53.600959,1.758811


## 3. Merge Datasets

Remove unnecessary rows after last country (Zimbabwe) for Freshwater, Sustainable Energy, and Emissions dataset

In [130]:
freshwater_filtered = freshwater_filtered.loc[:freshwater_filtered[freshwater_filtered['Country'] == 'Zimbabwe'].index[0]]
freshwater_filtered

Unnamed: 0,Country,Internal renewable freshwater resources flows 2020
0,Afghanistan,47.2
1,Albania,26.9
2,Algeria,11.2
3,American Samoa,..
4,Andorra,0.3
...,...,...
209,Virgin Islands (U.S.),..
210,West Bank and Gaza,0.8
211,"Yemen, Rep.",2.1
212,Zambia,80.2


In [131]:
sustainable_energy_filtered = sustainable_energy_filtered.loc[:sustainable_energy_filtered[sustainable_energy_filtered['Country'] == 'Zimbabwe'].index[0]]
sustainable_energy_filtered

Unnamed: 0,Country,Access to clean fuels and technologies for cooking 2021,Renewable energy consumption 2020
0,Afghanistan,35.4,17.6
1,Albania,83.7,44.6
2,Algeria,99.7,0.2
3,American Samoa,..,0.5
4,Andorra,100,21.9
...,...,...,...
209,Virgin Islands (U.S.),..,5.1
210,West Bank and Gaza,..,15
211,"Yemen, Rep.",61.3,3.5
212,Zambia,10.2,81.8


In [132]:
emissions_filtered = emissions_filtered.loc[:emissions_filtered[emissions_filtered['Country'] == 'Zimbabwe'].index[0]]
emissions_filtered

Unnamed: 0,Country,Carbon dioxide emissions 2020,Methane emissions 2020,Nitrous oxide emissions 2020
0,Afghanistan,8709.47,16222.03596,4863.386801
1,Albania,4383.2,2692.195886,1015.186729
2,Algeria,161563,86543.92362,12578.7476
3,American Samoa,..,..,..
4,Andorra,448.884399,53.600959,1.758811
...,...,...,...,...
209,Virgin Islands (U.S.),..,..,..
210,West Bank and Gaza,..,..,..
211,"Yemen, Rep.",9960.1,10542.15376,3364.858401
212,Zambia,7607.1,15448.75233,12363.49464


Get standardised country names to merge data into one CSV file as each datasets might have different conventions on naming the countries.

In [133]:
country_name_mapping = {
    "Bahamas, The": "Bahamas",
    "Bolivia": "Bolivia (Plurinational State of)",
    "Congo, Dem. Rep.": "Democratic Republic of the Congo",
    "Congo, Democratic Republic of the": "Democratic Republic of the Congo",
    "Congo, Rep.": "Congo",
    "Congo, Republic of": "Congo",
    "Cote d'Ivoire": "Côte d’Ivoire",
    "Gambia, The": "Gambia",
    "Korea, Rep.": "Republic of Korea",
    "Korea, Republic of": "Republic of Korea",
    "South Korea": "Republic of Korea",
    "Korea, Dem. People's Rep.": "Democratic People's Republic of Korea",
    "North Korea": "Democratic People's Republic of Korea",
    "Iran, Islamic Rep.": "Iran (Islamic Republic of)",
    "Iran": "Iran (Islamic Republic of)",
    "Lao PDR": "Lao People's Democratic Republic",
    "Lao": "Lao People's Democratic Republic",
    "Micronesia, Fed. Sts.": "Micronesia (Federated States of)",
    "Micronesia": "Micronesia (Federated States of)",
    "Republic of Moldova": "Moldova",
    "St. Kitts and Nevis": "Saint Kitts and Nevis",
    "St. Lucia": "Saint Lucia",
    "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines",
    "Turkiye": "Turkey",
    "Tanzania": "United Republic of Tanzania",
    "Venezuela, RB": "Venezuela",
    "Yemen, Rep.": "Yemen",
    "Syrian Arab Republic": "Syria",
    "Egypt, Arab Rep.": "Egypt",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Slovak Republic": "Slovakia",
    "Russian Federation": "Russia",
    "Gambia, The": "The Gambia",
    "United Kingdom": "United Kingdom of Great Britain and Northern Ireland",
    "United States": "United States of America",
    "Vietnam": "Viet Nam",
    "Venezuela": "Venezuela (Bolivarian Republic of)",
    "Venezuela, RB": "Venezuela (Bolivarian Republic of)"
}

In [134]:
print("Forest Area: ", len(forest_area_filtered))
print("Freshwater: ", len(freshwater_filtered))
print("GDP per Capita: ", len(gdp_per_capita_filtered))
print("Governance: ", len(governance_filtered))
print("HDI: ", len(human_development_index_filtered))
print("Sustainable Energy: ", len(sustainable_energy_filtered))
print("Emissions: ", len(emissions_filtered))

Forest Area:  237
Freshwater:  214
GDP per Capita:  266
Governance:  194
HDI:  195
Sustainable Energy:  214
Emissions:  214


As the Governance dataset has the least amount of countries, I'm going to use countries from the Governance dataset as the base country list for the merged data.

In [135]:
countries_list = governance_filtered['Country'].unique()
countries_list

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo', 'Costa Rica', 'Côte d’Ivoire',
       'Croatia', 'Cuba', 'Cyprus', 'Czechia',
       "Democratic People's Republic of Korea",
       'Democratic Republic of the Congo', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon',
       'Gambia', 'Georgia', 'G

Map the country names from other datasets to match the country name in the Governance dataset.

In [136]:
forest_area_filtered.loc[:, 'Country'] = forest_area_filtered['Country'].replace(country_name_mapping)
forest_area_filtered['Country'].unique()

array(['WORLD', 'Afghanistan', 'Albania', 'Algeria', 'American Samoa',
       'Andorra', 'Angola', 'Anguilla', 'Antigua and Barbuda',
       'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria',
       'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados',
       'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan',
       'Bolivia (Plurinational State of)',
       'Bonaire, Sint Eustatius and Saba', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'British Virgin Islands',
       'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi',
       'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Cayman Islands',
       'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia',
       'Comoros', 'Congo', 'Cook Islands', 'Costa Rica', 'Côte d’Ivoire',
       'Croatia', 'Cuba', 'Curaçao', 'Cyprus', 'Czechia',
       "Democratic People's Republic of Korea",
       'Democratic Republic of the Congo', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican Republic

In [137]:
gdp_per_capita_filtered.loc[:, 'Country'] = gdp_per_capita_filtered['Country'].replace(country_name_mapping)
gdp_per_capita_filtered['Country'].unique()

array(['Aruba', 'Africa Eastern and Southern', 'Afghanistan',
       'Africa Western and Central', 'Angola', 'Albania', 'Andorra',
       'Arab World', 'United Arab Emirates', 'Argentina', 'Armenia',
       'American Samoa', 'Antigua and Barbuda', 'Australia', 'Austria',
       'Azerbaijan', 'Burundi', 'Belgium', 'Benin', 'Burkina Faso',
       'Bangladesh', 'Bulgaria', 'Bahrain', 'Bahamas',
       'Bosnia and Herzegovina', 'Belarus', 'Belize', 'Bermuda',
       'Bolivia (Plurinational State of)', 'Brazil', 'Barbados',
       'Brunei Darussalam', 'Bhutan', 'Botswana',
       'Central African Republic', 'Canada',
       'Central Europe and the Baltics', 'Switzerland', 'Channel Islands',
       'Chile', 'China', 'Côte d’Ivoire', 'Cameroon',
       'Democratic Republic of the Congo', 'Congo', 'Colombia', 'Comoros',
       'Cabo Verde', 'Costa Rica', 'Caribbean small states', 'Cuba',
       'Curacao', 'Cayman Islands', 'Cyprus', 'Czech Republic', 'Germany',
       'Djibouti', 'Dominica', '

In [138]:
human_development_index_filtered.loc[:, 'Country'] = human_development_index_filtered['Country'].replace(country_name_mapping)
human_development_index_filtered['Country'].unique()

array(['Afghanistan', 'Angola', 'Albania', 'Andorra',
       'United Arab Emirates', 'Argentina', 'Armenia',
       'Antigua and Barbuda', 'Australia', 'Austria', 'Azerbaijan',
       'Burundi', 'Belgium', 'Benin', 'Burkina Faso', 'Bangladesh',
       'Bulgaria', 'Bahrain', 'Bahamas', 'Bosnia and Herzegovina',
       'Belarus', 'Belize', 'Bolivia (Plurinational State of)', 'Brazil',
       'Barbados', 'Brunei', 'Bhutan', 'Botswana',
       'Central African Republic', 'Canada', 'Switzerland', 'Chile',
       'China', 'Ivory Coast', 'Cameroon',
       'The Democratic Republic of the Congo', 'Congo', 'Colombia',
       'Comoros', 'Cabo Verde', 'Costa Rica', 'Cuba', 'Cyprus', 'Czechia',
       'Germany', 'Djibouti', 'Dominica', 'Denmark', 'Dominican Republic',
       'Algeria', 'Ecuador', 'Egypt', 'Eritrea', 'Spain', 'Estonia',
       'Ethiopia', 'Finland', 'Fiji', 'France',
       'Micronesia (Federated States of)', 'Gabon',
       'United Kingdom of Great Britain and Northern Ireland', '

In [139]:
sustainable_energy_filtered.loc[:, 'Country'] = sustainable_energy_filtered['Country'].replace(country_name_mapping)
sustainable_energy_filtered['Country'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra',
       'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bermuda', 'Bhutan', 'Bolivia (Plurinational State of)',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi',
       'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Cayman Islands',
       'Central African Republic', 'Chad', 'Channel Islands', 'Chile',
       'China', 'Hong Kong SAR, China', 'Macao SAR, China', 'Colombia',
       'Comoros', 'Democratic Republic of the Congo', 'Congo',
       'Costa Rica', 'Côte d’Ivoire', 'Croatia', 'Cuba', 'Curacao',
       'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Es

In [140]:
emissions_filtered.loc[:, 'Country'] = emissions_filtered['Country'].replace(country_name_mapping)
emissions_filtered['Country'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra',
       'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bermuda', 'Bhutan', 'Bolivia (Plurinational State of)',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi',
       'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Cayman Islands',
       'Central African Republic', 'Chad', 'Channel Islands', 'Chile',
       'China', 'Hong Kong SAR, China', 'Macao SAR, China', 'Colombia',
       'Comoros', 'Democratic Republic of the Congo', 'Congo',
       'Costa Rica', 'Côte d’Ivoire', 'Croatia', 'Cuba', 'Curacao',
       'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Et