# Step 1: Data Wrangling

## Part 1
This step focuses on collecting data, organizing it, and making sure it's well defined.

Some data cleaning can be done at this stage, but it's important not to be overzealous in the cleaning before I've explored the data to better understand it.

In [1]:
#Importing all necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

import seaborn as sns

from sqlalchemy import create_engine

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
#Assigning Working Directories Path
os.chdir("C:\\Users\\moham\\OneDrive\\Desktop\\Other\\Bootcamp\\Capstone_2\\ProjectIdeas_Initial\\Raw\\")

# Data Loading

In [4]:
#Data are sourced from the World Bank Open Data (https://www.worldbank.org/en/programs/africa-climate-business-plan)  
#Humanatarian Data Exchange (https://data.humdata.org) and Meteo Blue (https://meteoblue.com). 

afr_regional_energy_stat = pd.read_csv('african-regional-energy-statistics-2014_v2.csv')
climate_change_kenya = pd.read_csv('climate-change_ken.csv')
nrj_mining_kenya = pd.read_csv('energy-and-mining_kenya.csv')
global_temp_country = pd.read_csv('GlobalLandTemperaturesByCountry.csv')
continents = pd.read_csv('continents_country.csv')
nairobi = pd.read_csv('Nairobi_Raw_noheaders-v2.csv')
cape_town = pd.read_csv('CapeTown_Raw_noheaders-v2.csv')
algiers = pd.read_csv('Algiers_Raw_noheaders-v2.csv')

In [5]:
#Early Analysis

afr_regional_energy_stat.head()
climate_change_kenya.head()
nrj_mining_kenya.head()
global_temp_country.head()
continents.head()
nairobi.head()
cape_town.head()
algiers.head()

Unnamed: 0,Indicator,IndicatorName,Region,RegionName,Unit,Date,Value
0,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2006,11.0
1,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2007,6.0
2,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2008,15.0
3,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2009,25.0
4,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2010,50.0


Unnamed: 0,Country Name,Country ISO3,Year,Indicator Name,Indicator Code,Value
0,#country+name,#country+code,#date+year,#indicator+name,#indicator+code,#indicator+value+num
1,Kenya,KEN,2018,Agricultural land (sq. km),AG.LND.AGRI.K2,276300
2,Kenya,KEN,2017,Agricultural land (sq. km),AG.LND.AGRI.K2,276300
3,Kenya,KEN,2016,Agricultural land (sq. km),AG.LND.AGRI.K2,276300
4,Kenya,KEN,2015,Agricultural land (sq. km),AG.LND.AGRI.K2,276300


Unnamed: 0,Country Name,Country ISO3,Year,Indicator Name,Indicator Code,Value
0,#country+name,#country+code,#date+year,#indicator+name,#indicator+code,#indicator+value+num
1,Kenya,KEN,2015,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.845690551
2,Kenya,KEN,2014,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.825663741
3,Kenya,KEN,2013,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.439852612
4,Kenya,KEN,2012,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.434618378


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
1,1743-12-01,,,Åland
2,1744-01-01,,,Åland
3,1744-02-01,,,Åland
4,1744-03-01,,,Åland


Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


Unnamed: 0,timestamp,Nairobi Temperature [2 m elevation corrected],Nairobi Precipitation Total,Nairobi Snowfall Amount,Nairobi Relative Humidity [2 m],Nairobi Wind Speed [10 m],Nairobi Wind Direction Dominant [10 m],Nairobi Cloud Cover Total,Nairobi Sunshine Duration,Nairobi Shortwave Radiation,Nairobi Mean Sea Level Pressure [MSL],Nairobi Soil Temperature [0-10 cm down],Nairobi Soil Moisture [0-10 cm down]
0,19850901T0000,69.62904,0.0,0,84,10.398747,112.53505,93.291664,95.106384,3090.08,1017.5,74.138,0.172
1,19850902T0000,71.60905,0.0,0,86,11.937781,110.25738,86.625,188.29787,3262.7397,1017.8,75.27202,0.172
2,19850903T0000,71.64504,0.0,0,88,10.664815,108.77132,96.625,47.872337,3485.24,1018.3,76.26199,0.193
3,19850904T0000,70.79904,0.0,0,85,8.518,107.01516,94.166664,81.702126,2620.1597,1018.6,73.52602,0.202
4,19850905T0000,73.30104,0.0,0,86,10.187275,106.76452,68.375,426.11063,4981.33,1017.1,78.72798,0.202


Unnamed: 0,timestamp,Cape Town Temperature [2 m elevation corrected],Cape Town Precipitation Total,Cape Town Snowfall Amount,Cape Town Relative Humidity [2 m],Cape Town Wind Speed [10 m],Cape Town Wind Direction Dominant [10 m],Cape Town Cloud Cover Total,Cape Town Sunshine Duration,Cape Town Shortwave Radiation,Cape Town Mean Sea Level Pressure [MSL],Cape Town Soil Temperature [0-10 cm down],Cape Town Soil Moisture [0-10 cm down]
0,19850901T0000,83.85659,0.0,0,99,4.457064,292.47943,15.708333,675.83484,5135.3003,1017.0,75.56002,0.145
1,19850902T0000,67.00859,0.0,0,96,7.161691,288.88968,93.0,95.744675,4807.7803,1015.1,67.046005,0.141
2,19850903T0000,63.67859,0.0,0,92,8.108701,305.56384,65.666664,105.773056,2403.0,1020.8,62.636032,0.137
3,19850904T0000,69.58259,0.0,0,90,6.18706,245.22487,52.82083,379.96277,5029.3896,1021.2,67.36999,0.14
4,19850905T0000,71.58059,0.0,0,100,7.644957,262.43716,49.208332,423.07584,5007.1406,1023.0,69.94403,0.14


Unnamed: 0,timestamp,Algiers Temperature [2 m elevation corrected],Algiers Precipitation Total,Algiers Snowfall Amount,Algiers Relative Humidity [2 m],Algiers Wind Speed [10 m],Algiers Wind Direction Dominant [10 m],Algiers Cloud Cover Total,Algiers Sunshine Duration,Algiers Shortwave Radiation,Algiers Mean Sea Level Pressure [MSL],Algiers Soil Temperature [0-10 cm down],Algiers Soil Moisture [0-10 cm down]
0,19850901T0000,98.53823,0.0,0,34,11.739121,93.81408,2.666667,738.549,6974.929,1017.6,97.79003,0.112
1,19850902T0000,102.46223,0.0,0,46,10.774512,88.51213,3.208333,728.0011,6875.2505,1016.5,101.246,0.112
2,19850903T0000,99.09622,0.0,0,71,8.408619,272.03503,11.666667,652.29895,6763.1094,1019.5,99.661995,0.112
3,19850904T0000,87.99023,0.0,0,96,15.634569,62.35402,22.833334,511.31512,6613.591,1021.8,92.17398,0.112
4,19850905T0000,89.79022,0.0,0,95,16.150927,74.99781,13.333333,711.33905,6590.45,1021.6,92.96599,0.113


In [6]:
#Renaming them for Simplicity (will later be renamed proper name when saved)

afr =afr_regional_energy_stat
clim = climate_change_kenya
nrj = nrj_mining_kenya
glob = global_temp_country
cont = continents


In [7]:
#Exploring the general information about the data

afr.info()

clim.info()

nrj.info()

glob.info()

cont.info()

nairobi.info()

cape_town.info()

algiers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17018 entries, 0 to 17017
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Indicator      17018 non-null  int64  
 1   IndicatorName  17018 non-null  object 
 2   Region         17018 non-null  int64  
 3   RegionName     17018 non-null  object 
 4   Unit           17018 non-null  object 
 5   Date           17018 non-null  int64  
 6   Value          17018 non-null  float64
dtypes: float64(1), int64(3), object(3)
memory usage: 930.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1953 entries, 0 to 1952
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Country Name    1953 non-null   object
 1   Country ISO3    1953 non-null   object
 2   Year            1953 non-null   object
 3   Indicator Name  1953 non-null   object
 4   Indicator Code  1953 non-null   object
 5   Value        

#### Checking for Missing values and counting them

In [8]:
missing_afr = pd.concat([afr.isnull().sum(), 100 * afr.isnull().mean()], axis=1)
missing_afr.columns=['count','%']
missing_afr.sort_values(by=['count'])

Unnamed: 0,count,%
Indicator,0,0.0
IndicatorName,0,0.0
Region,0,0.0
RegionName,0,0.0
Unit,0,0.0
Date,0,0.0
Value,0,0.0


In [9]:
missing_clim = pd.concat([clim.isnull().sum(), 100 * clim.isnull().mean()], axis=1)
missing_clim.columns=['count','%']
missing_clim.sort_values(by=['count'])

Unnamed: 0,count,%
Country Name,0,0.0
Country ISO3,0,0.0
Year,0,0.0
Indicator Name,0,0.0
Indicator Code,0,0.0
Value,0,0.0


In [10]:
missing_nrj = pd.concat([nrj.isnull().sum(), 100 * nrj.isnull().mean()], axis=1)
missing_nrj.columns=['count','%']
missing_nrj.sort_values(by=['count'])

Unnamed: 0,count,%
Country Name,0,0.0
Country ISO3,0,0.0
Year,0,0.0
Indicator Name,0,0.0
Indicator Code,0,0.0
Value,0,0.0


In [11]:
missing_glob = pd.concat([glob.isnull().sum(), 100 * glob.isnull().mean()], axis=1)
missing_glob.columns=['count','%']
missing_glob.sort_values(by=['count'])

Unnamed: 0,count,%
dt,0,0.0
Country,0,0.0
AverageTemperatureUncertainty,31912,5.526251
AverageTemperature,32651,5.654225


### The "Glob" table is missing values for "AverageTemperatureUncertainty" and "Average Temperature"

In [12]:
#Dropping non-values in the "glob" table and renaming it "glob_edit"
glob_edit = glob.dropna()
missing_glob_edit = pd.concat([glob_edit.isnull().sum(), 100 * glob_edit.isnull().mean()], axis=1)
missing_glob_edit.columns=['count','%']
missing_glob_edit.sort_values(by=['count'])

Unnamed: 0,count,%
dt,0,0.0
AverageTemperature,0,0.0
AverageTemperatureUncertainty,0,0.0
Country,0,0.0


In [13]:
missing_cont = pd.concat([cont.isnull().sum(), 100 * cont.isnull().mean()], axis=1)
missing_cont.columns=['count','%']
missing_cont.sort_values(by=['count'])

Unnamed: 0,count,%
name,0,0.0
alpha-3,0,0.0
country-code,0,0.0
iso_3166-2,0,0.0
alpha-2,1,0.401606
region,1,0.401606
sub-region,1,0.401606
region-code,1,0.401606
sub-region-code,1,0.401606
intermediate-region,142,57.028112


#### For our study, we are only using the continents data to link the countries in the Global table to their respective continents. So, the region, alpha-3. country-code, iso_3166-2, alpha-2 codes are not necessary to keep.



In [14]:
#Deleting unecessary columns from the cont table

cont.drop(['alpha-3', 'country-code', 'iso_3166-2', 'alpha-2', 'region-code', 'sub-region-code', 'intermediate-region','intermediate-region-code'], axis=1, inplace=True);
cont.head()

Unnamed: 0,name,region,sub-region
0,Afghanistan,Asia,Southern Asia
1,Åland Islands,Europe,Northern Europe
2,Albania,Europe,Southern Europe
3,Algeria,Africa,Northern Africa
4,American Samoa,Oceania,Polynesia


In [15]:
# Rechecking missing values after columns drop

missing_cont = pd.concat([cont.isnull().sum(), 100 * cont.isnull().mean()], axis=1)
missing_cont.columns=['count','%']
missing_cont.sort_values(by=['count'])

Unnamed: 0,count,%
name,0,0.0
region,1,0.401606
sub-region,1,0.401606


In [16]:
cont['region'].unique();
cont['sub-region'].unique()

array(['Asia', 'Europe', 'Africa', 'Oceania', 'Americas', nan],
      dtype=object)

array(['Southern Asia', 'Northern Europe', 'Southern Europe',
       'Northern Africa', 'Polynesia', 'Sub-Saharan Africa',
       'Latin America and the Caribbean', nan, 'Western Asia',
       'Australia and New Zealand', 'Western Europe', 'Eastern Europe',
       'Northern America', 'South-eastern Asia', 'Eastern Asia',
       'Melanesia', 'Micronesia', 'Central Asia'], dtype=object)

In [17]:
#Dropping NaN values
cont_edit = cont.dropna();

In [18]:
# Rechecking missing values after dropping nan values

missing_cont_edit = pd.concat([cont_edit.isnull().sum(), 100 * cont_edit.isnull().mean()], axis=1);
missing_cont_edit.columns=['count','%']
missing_cont_edit.sort_values(by=['count'])

Unnamed: 0,count,%
name,0,0.0
region,0,0.0
sub-region,0,0.0


In [19]:
cont_edit['region'].unique()
cont_edit['sub-region'].unique()

array(['Asia', 'Europe', 'Africa', 'Oceania', 'Americas'], dtype=object)

array(['Southern Asia', 'Northern Europe', 'Southern Europe',
       'Northern Africa', 'Polynesia', 'Sub-Saharan Africa',
       'Latin America and the Caribbean', 'Western Asia',
       'Australia and New Zealand', 'Western Europe', 'Eastern Europe',
       'Northern America', 'South-eastern Asia', 'Eastern Asia',
       'Melanesia', 'Micronesia', 'Central Asia'], dtype=object)

In [20]:
cont_edit.head();cont_edit.shape

Unnamed: 0,name,region,sub-region
0,Afghanistan,Asia,Southern Asia
1,Åland Islands,Europe,Northern Europe
2,Albania,Europe,Southern Europe
3,Algeria,Africa,Northern Africa
4,American Samoa,Oceania,Polynesia


(248, 3)

In [21]:
cont_africa = cont[cont['region']=='Africa'];cont_africa.head();cont.shape;
cont_africa['name'].unique()

Unnamed: 0,name,region,sub-region
3,Algeria,Africa,Northern Africa
6,Angola,Africa,Sub-Saharan Africa
23,Benin,Africa,Sub-Saharan Africa
29,Botswana,Africa,Sub-Saharan Africa
32,British Indian Ocean Territory,Africa,Sub-Saharan Africa


(249, 3)

array(['Algeria', 'Angola', 'Benin', 'Botswana',
       'British Indian Ocean Territory', 'Burkina Faso', 'Burundi',
       'Cabo Verde', 'Cameroon', 'Central African Republic', 'Chad',
       'Comoros', 'Congo', 'Congo (Democratic Republic Of The)',
       "Côte D'Ivoire", 'Djibouti', 'Egypt', 'Equatorial Guinea',
       'Eritrea', 'Eswatini', 'Ethiopia', 'French Southern Territories',
       'Gabon', 'Gambia', 'Ghana', 'Guinea', 'Guinea Bissau', 'Kenya',
       'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi', 'Mali',
       'Mauritania', 'Mauritius', 'Mayotte', 'Morocco', 'Mozambique',
       'Namibia', 'Niger', 'Nigeria', 'Réunion', 'Rwanda',
       'Saint Helena, Ascension and Tristan da Cunha',
       'Sao Tome and Principe', 'Senegal', 'Seychelles', 'Sierra Leone',
       'Somalia', 'South Africa', 'South Sudan', 'Sudan', 'Tanzania',
       'Togo', 'Tunisia', 'Uganda', 'Western Sahara', 'Zambia',
       'Zimbabwe'], dtype=object)

In [22]:
#Congo is Duplicated.

cont_africa[cont_africa['name'] == 'Congo']; cont_africa[cont_africa['name'] == 'Congo (Democratic Republic Of The)']

Unnamed: 0,name,region,sub-region
50,Congo,Africa,Sub-Saharan Africa


Unnamed: 0,name,region,sub-region
51,Congo (Democratic Republic Of The),Africa,Sub-Saharan Africa


In [23]:
#Congo is Duplicated, needs to be removed.

Delete_dupl = ['Congo (Democratic Republic Of The)'];
cont_africa[cont_africa.name.isin(Delete_dupl)]

Unnamed: 0,name,region,sub-region
51,Congo (Democratic Republic Of The),Africa,Sub-Saharan Africa


In [24]:
cont_africa_edit = cont_africa[~cont_africa.name.isin(Delete_dupl)]

In [25]:
cont_africa_edit['name'].unique()

array(['Algeria', 'Angola', 'Benin', 'Botswana',
       'British Indian Ocean Territory', 'Burkina Faso', 'Burundi',
       'Cabo Verde', 'Cameroon', 'Central African Republic', 'Chad',
       'Comoros', 'Congo', "Côte D'Ivoire", 'Djibouti', 'Egypt',
       'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia',
       'French Southern Territories', 'Gabon', 'Gambia', 'Ghana',
       'Guinea', 'Guinea Bissau', 'Kenya', 'Lesotho', 'Liberia', 'Libya',
       'Madagascar', 'Malawi', 'Mali', 'Mauritania', 'Mauritius',
       'Mayotte', 'Morocco', 'Mozambique', 'Namibia', 'Niger', 'Nigeria',
       'Réunion', 'Rwanda',
       'Saint Helena, Ascension and Tristan da Cunha',
       'Sao Tome and Principe', 'Senegal', 'Seychelles', 'Sierra Leone',
       'Somalia', 'South Africa', 'South Sudan', 'Sudan', 'Tanzania',
       'Togo', 'Tunisia', 'Uganda', 'Western Sahara', 'Zambia',
       'Zimbabwe'], dtype=object)

In [28]:
# Checking the Nairobi Dataset Info
nairobi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13150 entries, 0 to 13149
Data columns (total 25 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   timestamp                                      13150 non-null  object 
 1   Nairobi Temperature [2 m elevation corrected]  13150 non-null  float64
 2   Nairobi Precipitation Total                    13150 non-null  float64
 3   Nairobi Snowfall Amount                        13150 non-null  int64  
 4   Nairobi Relative Humidity [2 m]                13150 non-null  int64  
 5   Nairobi Wind Speed [10 m]                      13150 non-null  float64
 6   Nairobi Wind Direction Dominant [10 m]         13150 non-null  float64
 7   Nairobi Cloud Cover Total                      13150 non-null  float64
 8   Nairobi Sunshine Duration                      13150 non-null  float64
 9   Nairobi Shortwave Radiation                    131

In [29]:
#Renaming the columns to a shorter name
nairobi.info()
nairobi['N_Temperature_m'] = nairobi['Nairobi Temperature [2 m elevation corrected]']
nairobi['N_Precipitation_Total'] = nairobi['Nairobi Precipitation Total']
nairobi['N_Snowfall_Amount'] = nairobi['Nairobi Snowfall Amount']
nairobi['N_Relative_Humidity_m'] = nairobi['Nairobi Relative Humidity [2 m]']
nairobi['N_Wind_Speed_10m'] = nairobi['Nairobi Wind Speed [10 m]']
nairobi['N_Cloud_Cover_Total'] = nairobi['Nairobi Cloud Cover Total']
nairobi['N_Sunshine_Duration'] = nairobi['Nairobi Sunshine Duration']
nairobi['N_Shortwave_Radiation'] = nairobi['Nairobi Shortwave Radiation']
nairobi['N_Mean_Sea_Level_Pressure'] = nairobi['Nairobi Mean Sea Level Pressure [MSL]']
nairobi['N_Soil_Temperature_10cm'] = nairobi['Nairobi Soil Temperature [0-10 cm down]']
nairobi['N_Soil_Moisture_10cm'] = nairobi['Nairobi Soil Moisture [0-10 cm down]']
nairobi['N_Wind_Direction_Dominant_10m'] = nairobi['Nairobi Wind Direction Dominant [10 m]']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13150 entries, 0 to 13149
Data columns (total 25 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   timestamp                                      13150 non-null  object 
 1   Nairobi Temperature [2 m elevation corrected]  13150 non-null  float64
 2   Nairobi Precipitation Total                    13150 non-null  float64
 3   Nairobi Snowfall Amount                        13150 non-null  int64  
 4   Nairobi Relative Humidity [2 m]                13150 non-null  int64  
 5   Nairobi Wind Speed [10 m]                      13150 non-null  float64
 6   Nairobi Wind Direction Dominant [10 m]         13150 non-null  float64
 7   Nairobi Cloud Cover Total                      13150 non-null  float64
 8   Nairobi Sunshine Duration                      13150 non-null  float64
 9   Nairobi Shortwave Radiation                    131

In [30]:
#Deleting the initial columns after renaming

del nairobi['Nairobi Temperature [2 m elevation corrected]']
del nairobi['Nairobi Precipitation Total']
del nairobi['Nairobi Snowfall Amount']
del nairobi['Nairobi Relative Humidity [2 m]']
del nairobi['Nairobi Wind Speed [10 m]']
del nairobi['Nairobi Cloud Cover Total']
del nairobi['Nairobi Sunshine Duration']
del nairobi['Nairobi Shortwave Radiation']
del nairobi['Nairobi Mean Sea Level Pressure [MSL]']
del nairobi['Nairobi Soil Temperature [0-10 cm down]']
del nairobi['Nairobi Soil Moisture [0-10 cm down]']
del nairobi['Nairobi Wind Direction Dominant [10 m]']
nairobi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13150 entries, 0 to 13149
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   timestamp                      13150 non-null  object 
 1   N_Temperature_m                13150 non-null  float64
 2   N_Precipitation_Total          13150 non-null  float64
 3   N_Snowfall_Amount              13150 non-null  int64  
 4   N_Relative_Humidity_m          13150 non-null  int64  
 5   N_Wind_Speed_10m               13150 non-null  float64
 6   N_Cloud_Cover_Total            13150 non-null  float64
 7   N_Sunshine_Duration            13150 non-null  float64
 8   N_Shortwave_Radiation          13150 non-null  float64
 9   N_Mean_Sea_Level_Pressure      13150 non-null  float64
 10  N_Soil_Temperature_10cm        13150 non-null  float64
 11  N_Soil_Moisture_10cm           13150 non-null  float64
 12  N_Wind_Direction_Dominant_10m  13150 non-null 

In [28]:
# Checking the Cape Town Dataset Info
cape_town.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13150 entries, 0 to 13149
Data columns (total 13 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   timestamp                                        13150 non-null  object 
 1   Cape Town Temperature [2 m elevation corrected]  13150 non-null  float64
 2   Cape Town Precipitation Total                    13150 non-null  float64
 3   Cape Town Snowfall Amount                        13150 non-null  int64  
 4   Cape Town Relative Humidity [2 m]                13150 non-null  int64  
 5   Cape Town Wind Speed [10 m]                      13150 non-null  float64
 6   Cape Town Wind Direction Dominant [10 m]         13150 non-null  float64
 7   Cape Town Cloud Cover Total                      13150 non-null  float64
 8   Cape Town Sunshine Duration                      13150 non-null  float64
 9   Cape Town Shortwave Radiatio

In [29]:
cape_town['CT_Temperature_m'] = cape_town['Cape Town Temperature [2 m elevation corrected]']
cape_town['CT_Precipitation_Total'] = cape_town['Cape Town Precipitation Total']
cape_town['CT_Snowfall_Amount'] = cape_town['Cape Town Snowfall Amount']
cape_town['CT_Relative_Humidity_m'] = cape_town['Cape Town Relative Humidity [2 m]']
cape_town['CT_Wind_Speed_10m'] = cape_town['Cape Town Wind Speed [10 m]']
cape_town['CT_Cloud_Cover_Total'] = cape_town['Cape Town Cloud Cover Total']
cape_town['CT_Sunshine_Duration'] = cape_town['Cape Town Sunshine Duration']
cape_town['CT_Shortwave_Radiation'] = cape_town['Cape Town Shortwave Radiation']
cape_town['CT_Mean_Sea_Level_Pressure'] = cape_town['Cape Town Mean Sea Level Pressure [MSL]']
cape_town['CT_Soil_Temperature_10cm'] = cape_town['Cape Town Soil Temperature [0-10 cm down]']
cape_town['CT_Soil_Moisture_10cm'] = cape_town['Cape Town Soil Moisture [0-10 cm down]']
cape_town['CT_Wind_Direction_Dominant_10m'] = cape_town['Cape Town Wind Direction Dominant [10 m]']
cape_town.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13150 entries, 0 to 13149
Data columns (total 25 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   timestamp                                        13150 non-null  object 
 1   Cape Town Temperature [2 m elevation corrected]  13150 non-null  float64
 2   Cape Town Precipitation Total                    13150 non-null  float64
 3   Cape Town Snowfall Amount                        13150 non-null  int64  
 4   Cape Town Relative Humidity [2 m]                13150 non-null  int64  
 5   Cape Town Wind Speed [10 m]                      13150 non-null  float64
 6   Cape Town Wind Direction Dominant [10 m]         13150 non-null  float64
 7   Cape Town Cloud Cover Total                      13150 non-null  float64
 8   Cape Town Sunshine Duration                      13150 non-null  float64
 9   Cape Town Shortwave Radiatio

In [34]:
cape_town.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13150 entries, 0 to 13149
Data columns (total 12 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   timestamp                                        13150 non-null  object 
 1   Cape Town Temperature [2 m elevation corrected]  13150 non-null  float64
 2   Cape Town Precipitation Total                    13150 non-null  float64
 3   Cape Town Snowfall Amount                        13150 non-null  int64  
 4   Cape Town Relative Humidity [2 m]                13150 non-null  int64  
 5   Cape Town Wind Speed [10 m]                      13150 non-null  float64
 6   Cape Town Cloud Cover Total                      13150 non-null  float64
 7   Cape Town Sunshine Duration                      13150 non-null  float64
 8   Cape Town Shortwave Radiation                    13150 non-null  float64
 9   Cape Town Mean Sea Level Pre

In [35]:
algiers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13150 entries, 0 to 13149
Data columns (total 13 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   timestamp                                      13150 non-null  object 
 1   Algiers Temperature [2 m elevation corrected]  13150 non-null  float64
 2   Algiers Precipitation Total                    13150 non-null  float64
 3   Algiers Snowfall Amount                        13150 non-null  int64  
 4   Algiers Relative Humidity [2 m]                13150 non-null  int64  
 5   Algiers Wind Speed [10 m]                      13150 non-null  float64
 6   Algiers Wind Direction Dominant [10 m]         13150 non-null  float64
 7   Algiers Cloud Cover Total                      13150 non-null  float64
 8   Algiers Sunshine Duration                      13150 non-null  float64
 9   Algiers Shortwave Radiation                    131

In [36]:
#Renaming the columns to a shorter name
algiers['A_Temperature_m'] = algiers['Algiers Temperature [2 m elevation corrected]']
algiers['A_Precipitation_Total'] = algiers['Algiers Precipitation Total']
algiers['A_Snowfall_Amount'] = algiers['Algiers Snowfall Amount']
algiers['A_Relative_Humidity_m'] = algiers['Algiers Relative Humidity [2 m]']
algiers['A_Wind_Speed_10m'] = algiers['Algiers Wind Speed [10 m]']
algiers['A_Cloud_Cover_Total'] = algiers['Algiers Cloud Cover Total']
algiers['A_Sunshine_Duration'] = algiers['Algiers Sunshine Duration']
algiers['A_Shortwave_Radiation'] = algiers['Algiers Shortwave Radiation']
algiers['A_Mean_Sea_Level_Pressure'] = algiers['Algiers Mean Sea Level Pressure [MSL]']
algiers['A_Soil_Temperature_10cm'] = algiers['Algiers Soil Temperature [0-10 cm down]']
algiers['A_Soil_Moisture_10cm'] = algiers['Algiers Soil Moisture [0-10 cm down]']
algiers['A_Wind_Direction_Dominant_10m'] = algiers['Algiers Wind Direction Dominant [10 m]']
algiers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13150 entries, 0 to 13149
Data columns (total 25 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   timestamp                                      13150 non-null  object 
 1   Algiers Temperature [2 m elevation corrected]  13150 non-null  float64
 2   Algiers Precipitation Total                    13150 non-null  float64
 3   Algiers Snowfall Amount                        13150 non-null  int64  
 4   Algiers Relative Humidity [2 m]                13150 non-null  int64  
 5   Algiers Wind Speed [10 m]                      13150 non-null  float64
 6   Algiers Wind Direction Dominant [10 m]         13150 non-null  float64
 7   Algiers Cloud Cover Total                      13150 non-null  float64
 8   Algiers Sunshine Duration                      13150 non-null  float64
 9   Algiers Shortwave Radiation                    131

In [37]:
#Deleting the initial columns after renaming
del algiers['Algiers Temperature [2 m elevation corrected]']
del algiers['Algiers Precipitation Total']
del algiers['Algiers Snowfall Amount']
del algiers['Algiers Relative Humidity [2 m]']
del algiers['Algiers Wind Speed [10 m]']
del algiers['Algiers Cloud Cover Total']
del algiers['Algiers Sunshine Duration']
del algiers['Algiers Shortwave Radiation']
del algiers['Algiers Mean Sea Level Pressure [MSL]']
del algiers['Algiers Soil Temperature [0-10 cm down]']
del algiers['Algiers Soil Moisture [0-10 cm down]']
del algiers['Algiers Wind Direction Dominant [10 m]']

In [38]:
algiers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13150 entries, 0 to 13149
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   timestamp                      13150 non-null  object 
 1   A_Temperature_m                13150 non-null  float64
 2   A_Precipitation_Total          13150 non-null  float64
 3   A_Snowfall_Amount              13150 non-null  int64  
 4   A_Relative_Humidity_m          13150 non-null  int64  
 5   A_Wind_Speed_10m               13150 non-null  float64
 6   A_Cloud_Cover_Total            13150 non-null  float64
 7   A_Sunshine_Duration            13150 non-null  float64
 8   A_Shortwave_Radiation          13150 non-null  float64
 9   A_Mean_Sea_Level_Pressure      13150 non-null  float64
 10  A_Soil_Temperature_10cm        13150 non-null  float64
 11  A_Soil_Moisture_10cm           13150 non-null  float64
 12  A_Wind_Direction_Dominant_10m  13150 non-null 

In [39]:
#Rchecking all datasets so far
afr.head(); clim.head(); nrj.head(); glob.head(); cont_africa_edit.head();nairobi.head();cape_town.head();algiers.head()

Unnamed: 0,Indicator,IndicatorName,Region,RegionName,Unit,Date,Value
0,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2006,11.0
1,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2007,6.0
2,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2008,15.0
3,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2009,25.0
4,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2010,50.0


Unnamed: 0,Country Name,Country ISO3,Year,Indicator Name,Indicator Code,Value
0,#country+name,#country+code,#date+year,#indicator+name,#indicator+code,#indicator+value+num
1,Kenya,KEN,2018,Agricultural land (sq. km),AG.LND.AGRI.K2,276300
2,Kenya,KEN,2017,Agricultural land (sq. km),AG.LND.AGRI.K2,276300
3,Kenya,KEN,2016,Agricultural land (sq. km),AG.LND.AGRI.K2,276300
4,Kenya,KEN,2015,Agricultural land (sq. km),AG.LND.AGRI.K2,276300


Unnamed: 0,Country Name,Country ISO3,Year,Indicator Name,Indicator Code,Value
0,#country+name,#country+code,#date+year,#indicator+name,#indicator+code,#indicator+value+num
1,Kenya,KEN,2015,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.845690551
2,Kenya,KEN,2014,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.825663741
3,Kenya,KEN,2013,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.439852612
4,Kenya,KEN,2012,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.434618378


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
1,1743-12-01,,,Åland
2,1744-01-01,,,Åland
3,1744-02-01,,,Åland
4,1744-03-01,,,Åland


Unnamed: 0,name,region,sub-region
3,Algeria,Africa,Northern Africa
6,Angola,Africa,Sub-Saharan Africa
23,Benin,Africa,Sub-Saharan Africa
29,Botswana,Africa,Sub-Saharan Africa
32,British Indian Ocean Territory,Africa,Sub-Saharan Africa


Unnamed: 0,timestamp,N_Temperature_m,N_Precipitation_Total,N_Snowfall_Amount,N_Relative_Humidity_m,N_Wind_Speed_10m,N_Cloud_Cover_Total,N_Sunshine_Duration,N_Shortwave_Radiation,N_Mean_Sea_Level_Pressure,N_Soil_Temperature_10cm,N_Soil_Moisture_10cm,N_Wind_Direction_Dominant_10m
0,19850901T0000,69.62904,0.0,0,84,10.398747,93.291664,95.106384,3090.08,1017.5,74.138,0.172,112.53505
1,19850902T0000,71.60905,0.0,0,86,11.937781,86.625,188.29787,3262.7397,1017.8,75.27202,0.172,110.25738
2,19850903T0000,71.64504,0.0,0,88,10.664815,96.625,47.872337,3485.24,1018.3,76.26199,0.193,108.77132
3,19850904T0000,70.79904,0.0,0,85,8.518,94.166664,81.702126,2620.1597,1018.6,73.52602,0.202,107.01516
4,19850905T0000,73.30104,0.0,0,86,10.187275,68.375,426.11063,4981.33,1017.1,78.72798,0.202,106.76452


Unnamed: 0,timestamp,Cape Town Temperature [2 m elevation corrected],Cape Town Precipitation Total,Cape Town Snowfall Amount,Cape Town Relative Humidity [2 m],Cape Town Wind Speed [10 m],Cape Town Cloud Cover Total,Cape Town Sunshine Duration,Cape Town Shortwave Radiation,Cape Town Mean Sea Level Pressure [MSL],Cape Town Soil Temperature [0-10 cm down],Cape Town Soil Moisture [0-10 cm down]
0,19850901T0000,83.85659,0.0,0,99,4.457064,15.708333,675.83484,5135.3003,1017.0,75.56002,0.145
1,19850902T0000,67.00859,0.0,0,96,7.161691,93.0,95.744675,4807.7803,1015.1,67.046005,0.141
2,19850903T0000,63.67859,0.0,0,92,8.108701,65.666664,105.773056,2403.0,1020.8,62.636032,0.137
3,19850904T0000,69.58259,0.0,0,90,6.18706,52.82083,379.96277,5029.3896,1021.2,67.36999,0.14
4,19850905T0000,71.58059,0.0,0,100,7.644957,49.208332,423.07584,5007.1406,1023.0,69.94403,0.14


Unnamed: 0,timestamp,A_Temperature_m,A_Precipitation_Total,A_Snowfall_Amount,A_Relative_Humidity_m,A_Wind_Speed_10m,A_Cloud_Cover_Total,A_Sunshine_Duration,A_Shortwave_Radiation,A_Mean_Sea_Level_Pressure,A_Soil_Temperature_10cm,A_Soil_Moisture_10cm,A_Wind_Direction_Dominant_10m
0,19850901T0000,98.53823,0.0,0,34,11.739121,2.666667,738.549,6974.929,1017.6,97.79003,0.112,93.81408
1,19850902T0000,102.46223,0.0,0,46,10.774512,3.208333,728.0011,6875.2505,1016.5,101.246,0.112,88.51213
2,19850903T0000,99.09622,0.0,0,71,8.408619,11.666667,652.29895,6763.1094,1019.5,99.661995,0.112,272.03503
3,19850904T0000,87.99023,0.0,0,96,15.634569,22.833334,511.31512,6613.591,1021.8,92.17398,0.112,62.35402
4,19850905T0000,89.79022,0.0,0,95,16.150927,13.333333,711.33905,6590.45,1021.6,92.96599,0.113,74.99781


In [40]:
#Removing first lines of clim and nrj tables

clim_edit = clim.drop(0); clim_edit.head(); 
nrj_edit = nrj.drop(0); nrj_edit.head()

Unnamed: 0,Country Name,Country ISO3,Year,Indicator Name,Indicator Code,Value
1,Kenya,KEN,2018,Agricultural land (sq. km),AG.LND.AGRI.K2,276300
2,Kenya,KEN,2017,Agricultural land (sq. km),AG.LND.AGRI.K2,276300
3,Kenya,KEN,2016,Agricultural land (sq. km),AG.LND.AGRI.K2,276300
4,Kenya,KEN,2015,Agricultural land (sq. km),AG.LND.AGRI.K2,276300
5,Kenya,KEN,2014,Agricultural land (sq. km),AG.LND.AGRI.K2,276300


Unnamed: 0,Country Name,Country ISO3,Year,Indicator Name,Indicator Code,Value
1,Kenya,KEN,2015,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.845690551
2,Kenya,KEN,2014,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.825663741
3,Kenya,KEN,2013,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.439852612
4,Kenya,KEN,2012,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.434618378
5,Kenya,KEN,2011,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.67933551


### Checking that the types are correct in each table and correct when necessary

In [41]:
afr.head();afr.dtypes

Unnamed: 0,Indicator,IndicatorName,Region,RegionName,Unit,Date,Value
0,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2006,11.0
1,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2007,6.0
2,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2008,15.0
3,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2009,25.0
4,102,"Final Consumption of coking coal, 1000 tonnes",10203,Ethiopia,thousand tonnes,2010,50.0


Indicator          int64
IndicatorName     object
Region             int64
RegionName        object
Unit              object
Date               int64
Value            float64
dtype: object

In [42]:
clim_edit['Year'] = pd.to_datetime(clim_edit['Year']);
clim_edit['Value'] = clim_edit['Value'].astype(float);
clim_edit.head();clim_edit.dtypes

Unnamed: 0,Country Name,Country ISO3,Year,Indicator Name,Indicator Code,Value
1,Kenya,KEN,2018-01-01,Agricultural land (sq. km),AG.LND.AGRI.K2,276300.0
2,Kenya,KEN,2017-01-01,Agricultural land (sq. km),AG.LND.AGRI.K2,276300.0
3,Kenya,KEN,2016-01-01,Agricultural land (sq. km),AG.LND.AGRI.K2,276300.0
4,Kenya,KEN,2015-01-01,Agricultural land (sq. km),AG.LND.AGRI.K2,276300.0
5,Kenya,KEN,2014-01-01,Agricultural land (sq. km),AG.LND.AGRI.K2,276300.0


Country Name              object
Country ISO3              object
Year              datetime64[ns]
Indicator Name            object
Indicator Code            object
Value                    float64
dtype: object

In [43]:
nrj_edit['Year'] = pd.to_datetime(nrj_edit['Year']);
nrj_edit['Value'] = nrj_edit['Value'].astype(float);
nrj_edit.head();nrj_edit.dtypes

Unnamed: 0,Country Name,Country ISO3,Year,Indicator Name,Indicator Code,Value
1,Kenya,KEN,2015-01-01,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.845691
2,Kenya,KEN,2014-01-01,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.825664
3,Kenya,KEN,2013-01-01,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.439853
4,Kenya,KEN,2012-01-01,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.434618
5,Kenya,KEN,2011-01-01,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.679336


Country Name              object
Country ISO3              object
Year              datetime64[ns]
Indicator Name            object
Indicator Code            object
Value                    float64
dtype: object

In [44]:
glob_edit['dt'] = pd.to_datetime(glob_edit['dt']);
glob_edit.head();glob_edit.dtypes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  glob_edit['dt'] = pd.to_datetime(glob_edit['dt']);


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
5,1744-04-01,1.53,4.68,Åland
6,1744-05-01,6.702,1.789,Åland
7,1744-06-01,11.609,1.577,Åland
8,1744-07-01,15.342,1.41,Åland


dt                               datetime64[ns]
AverageTemperature                      float64
AverageTemperatureUncertainty           float64
Country                                  object
dtype: object

### Checking the Data Time ranges for all datasets (except Continent)

In [45]:
#Creating a common column for the year, names "Year_extract"

afr['Year_extract'] = afr['Date'];
clim_edit['Year_extract'] = clim_edit['Year'].apply(lambda t: t.year);clim_edit.head();
nrj_edit['Year_extract'] = nrj_edit['Year'].apply(lambda t: t.year);nrj_edit.head();
glob_edit['Year_extract'] = glob_edit['dt'].apply(lambda t: t.year);glob_edit.head()


Unnamed: 0,Country Name,Country ISO3,Year,Indicator Name,Indicator Code,Value,Year_extract
1,Kenya,KEN,2018-01-01,Agricultural land (sq. km),AG.LND.AGRI.K2,276300.0,2018
2,Kenya,KEN,2017-01-01,Agricultural land (sq. km),AG.LND.AGRI.K2,276300.0,2017
3,Kenya,KEN,2016-01-01,Agricultural land (sq. km),AG.LND.AGRI.K2,276300.0,2016
4,Kenya,KEN,2015-01-01,Agricultural land (sq. km),AG.LND.AGRI.K2,276300.0,2015
5,Kenya,KEN,2014-01-01,Agricultural land (sq. km),AG.LND.AGRI.K2,276300.0,2014


Unnamed: 0,Country Name,Country ISO3,Year,Indicator Name,Indicator Code,Value,Year_extract
1,Kenya,KEN,2015-01-01,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.845691,2015
2,Kenya,KEN,2014-01-01,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.825664,2014
3,Kenya,KEN,2013-01-01,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.439853,2013
4,Kenya,KEN,2012-01-01,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.434618,2012
5,Kenya,KEN,2011-01-01,Energy intensity level of primary energy (MJ/$...,EG.EGY.PRIM.PP.KD,7.679336,2011


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  glob_edit['Year_extract'] = glob_edit['dt'].apply(lambda t: t.year);glob_edit.head()


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country,Year_extract
0,1743-11-01,4.384,2.294,Åland,1743
5,1744-04-01,1.53,4.68,Åland,1744
6,1744-05-01,6.702,1.789,Åland,1744
7,1744-06-01,11.609,1.577,Åland,1744
8,1744-07-01,15.342,1.41,Åland,1744


In [46]:
print("for afr table, Date min and max are: ", afr['Year_extract'].min(), afr['Year_extract'].max());
print("for clim table, Date min and max are: ", clim_edit['Year_extract'].min(), clim_edit['Year_extract'].max());
print("for nrj table, Date min and max are: ", nrj_edit['Year_extract'].min(), nrj_edit['Year_extract'].max())
print("for glob table, Date min and max are: ", glob_edit['Year_extract'].min(), glob_edit['Year_extract'].max())


for afr table, Date min and max are:  2000 2014
for clim table, Date min and max are:  1960 2020
for nrj table, Date min and max are:  1960 2020
for glob table, Date min and max are:  1743 2013


### We will mainly use the clim and nrj tables for data ranging from 1985 to 2020 to match the data from Nairobi, Cape Town and Algiers (1985 to 2021)

In [47]:
#Rechecking Data Integrity

afr.count();afr.shape;clim_edit.count();clim_edit.shape;nrj_edit.count();nrj_edit.shape;glob_edit.count();glob_edit.shape;
cont_africa_edit.count();cont_africa_edit.shape

Indicator        17018
IndicatorName    17018
Region           17018
RegionName       17018
Unit             17018
Date             17018
Value            17018
Year_extract     17018
dtype: int64

(17018, 8)

Country Name      1952
Country ISO3      1952
Year              1952
Indicator Name    1952
Indicator Code    1952
Value             1952
Year_extract      1952
dtype: int64

(1952, 7)

Country Name      1081
Country ISO3      1081
Year              1081
Indicator Name    1081
Indicator Code    1081
Value             1081
Year_extract      1081
dtype: int64

(1081, 7)

dt                               544811
AverageTemperature               544811
AverageTemperatureUncertainty    544811
Country                          544811
Year_extract                     544811
dtype: int64

(544811, 5)

name          59
region        59
sub-region    59
dtype: int64

(59, 3)

In [48]:
nairobi['timestamp'] = pd.to_datetime(nairobi['timestamp']);
nairobi.dtypes;nairobi.head()

cape_town['timestamp'] = pd.to_datetime(cape_town['timestamp']);
cape_town.dtypes;cape_town.head()

algiers['timestamp'] = pd.to_datetime(algiers['timestamp']);
algiers.dtypes;algiers.head()

timestamp                        datetime64[ns]
N_Temperature_m                         float64
N_Precipitation_Total                   float64
N_Snowfall_Amount                         int64
N_Relative_Humidity_m                     int64
N_Wind_Speed_10m                        float64
N_Cloud_Cover_Total                     float64
N_Sunshine_Duration                     float64
N_Shortwave_Radiation                   float64
N_Mean_Sea_Level_Pressure               float64
N_Soil_Temperature_10cm                 float64
N_Soil_Moisture_10cm                    float64
N_Wind_Direction_Dominant_10m           float64
dtype: object

Unnamed: 0,timestamp,N_Temperature_m,N_Precipitation_Total,N_Snowfall_Amount,N_Relative_Humidity_m,N_Wind_Speed_10m,N_Cloud_Cover_Total,N_Sunshine_Duration,N_Shortwave_Radiation,N_Mean_Sea_Level_Pressure,N_Soil_Temperature_10cm,N_Soil_Moisture_10cm,N_Wind_Direction_Dominant_10m
0,1985-09-01,69.62904,0.0,0,84,10.398747,93.291664,95.106384,3090.08,1017.5,74.138,0.172,112.53505
1,1985-09-02,71.60905,0.0,0,86,11.937781,86.625,188.29787,3262.7397,1017.8,75.27202,0.172,110.25738
2,1985-09-03,71.64504,0.0,0,88,10.664815,96.625,47.872337,3485.24,1018.3,76.26199,0.193,108.77132
3,1985-09-04,70.79904,0.0,0,85,8.518,94.166664,81.702126,2620.1597,1018.6,73.52602,0.202,107.01516
4,1985-09-05,73.30104,0.0,0,86,10.187275,68.375,426.11063,4981.33,1017.1,78.72798,0.202,106.76452


timestamp                                          datetime64[ns]
Cape Town Temperature [2 m elevation corrected]           float64
Cape Town Precipitation Total                             float64
Cape Town Snowfall Amount                                   int64
Cape Town Relative Humidity [2 m]                           int64
Cape Town Wind Speed [10 m]                               float64
Cape Town Cloud Cover Total                               float64
Cape Town Sunshine Duration                               float64
Cape Town Shortwave Radiation                             float64
Cape Town Mean Sea Level Pressure [MSL]                   float64
Cape Town Soil Temperature [0-10 cm down]                 float64
Cape Town Soil Moisture [0-10 cm down]                    float64
dtype: object

Unnamed: 0,timestamp,Cape Town Temperature [2 m elevation corrected],Cape Town Precipitation Total,Cape Town Snowfall Amount,Cape Town Relative Humidity [2 m],Cape Town Wind Speed [10 m],Cape Town Cloud Cover Total,Cape Town Sunshine Duration,Cape Town Shortwave Radiation,Cape Town Mean Sea Level Pressure [MSL],Cape Town Soil Temperature [0-10 cm down],Cape Town Soil Moisture [0-10 cm down]
0,1985-09-01,83.85659,0.0,0,99,4.457064,15.708333,675.83484,5135.3003,1017.0,75.56002,0.145
1,1985-09-02,67.00859,0.0,0,96,7.161691,93.0,95.744675,4807.7803,1015.1,67.046005,0.141
2,1985-09-03,63.67859,0.0,0,92,8.108701,65.666664,105.773056,2403.0,1020.8,62.636032,0.137
3,1985-09-04,69.58259,0.0,0,90,6.18706,52.82083,379.96277,5029.3896,1021.2,67.36999,0.14
4,1985-09-05,71.58059,0.0,0,100,7.644957,49.208332,423.07584,5007.1406,1023.0,69.94403,0.14


timestamp                        datetime64[ns]
A_Temperature_m                         float64
A_Precipitation_Total                   float64
A_Snowfall_Amount                         int64
A_Relative_Humidity_m                     int64
A_Wind_Speed_10m                        float64
A_Cloud_Cover_Total                     float64
A_Sunshine_Duration                     float64
A_Shortwave_Radiation                   float64
A_Mean_Sea_Level_Pressure               float64
A_Soil_Temperature_10cm                 float64
A_Soil_Moisture_10cm                    float64
A_Wind_Direction_Dominant_10m           float64
dtype: object

Unnamed: 0,timestamp,A_Temperature_m,A_Precipitation_Total,A_Snowfall_Amount,A_Relative_Humidity_m,A_Wind_Speed_10m,A_Cloud_Cover_Total,A_Sunshine_Duration,A_Shortwave_Radiation,A_Mean_Sea_Level_Pressure,A_Soil_Temperature_10cm,A_Soil_Moisture_10cm,A_Wind_Direction_Dominant_10m
0,1985-09-01,98.53823,0.0,0,34,11.739121,2.666667,738.549,6974.929,1017.6,97.79003,0.112,93.81408
1,1985-09-02,102.46223,0.0,0,46,10.774512,3.208333,728.0011,6875.2505,1016.5,101.246,0.112,88.51213
2,1985-09-03,99.09622,0.0,0,71,8.408619,11.666667,652.29895,6763.1094,1019.5,99.661995,0.112,272.03503
3,1985-09-04,87.99023,0.0,0,96,15.634569,22.833334,511.31512,6613.591,1021.8,92.17398,0.112,62.35402
4,1985-09-05,89.79022,0.0,0,95,16.150927,13.333333,711.33905,6590.45,1021.6,92.96599,0.113,74.99781


Saving the cleaned data for the next steps

In [49]:
# Rechecking missing values after dropping nan values

nairobi = pd.concat([nairobi.isnull().sum(), 100 * nairobi.isnull().mean()], axis=1);
nairobi.columns=['count','%']
nairobi.sort_values(by=['count'])
nairobi.isna()

Unnamed: 0,count,%
timestamp,0,0.0
N_Temperature_m,0,0.0
N_Precipitation_Total,0,0.0
N_Snowfall_Amount,0,0.0
N_Relative_Humidity_m,0,0.0
N_Wind_Speed_10m,0,0.0
N_Cloud_Cover_Total,0,0.0
N_Sunshine_Duration,0,0.0
N_Shortwave_Radiation,0,0.0
N_Mean_Sea_Level_Pressure,0,0.0


Unnamed: 0,count,%
timestamp,False,False
N_Temperature_m,False,False
N_Precipitation_Total,False,False
N_Snowfall_Amount,False,False
N_Relative_Humidity_m,False,False
N_Wind_Speed_10m,False,False
N_Cloud_Cover_Total,False,False
N_Sunshine_Duration,False,False
N_Shortwave_Radiation,False,False
N_Mean_Sea_Level_Pressure,False,False


In [50]:
algiers.head(); cape_town.head(); nairobi.head(); algiers.shape; cape_town.shape; nairobi.shape

Unnamed: 0,timestamp,A_Temperature_m,A_Precipitation_Total,A_Snowfall_Amount,A_Relative_Humidity_m,A_Wind_Speed_10m,A_Cloud_Cover_Total,A_Sunshine_Duration,A_Shortwave_Radiation,A_Mean_Sea_Level_Pressure,A_Soil_Temperature_10cm,A_Soil_Moisture_10cm,A_Wind_Direction_Dominant_10m
0,1985-09-01,98.53823,0.0,0,34,11.739121,2.666667,738.549,6974.929,1017.6,97.79003,0.112,93.81408
1,1985-09-02,102.46223,0.0,0,46,10.774512,3.208333,728.0011,6875.2505,1016.5,101.246,0.112,88.51213
2,1985-09-03,99.09622,0.0,0,71,8.408619,11.666667,652.29895,6763.1094,1019.5,99.661995,0.112,272.03503
3,1985-09-04,87.99023,0.0,0,96,15.634569,22.833334,511.31512,6613.591,1021.8,92.17398,0.112,62.35402
4,1985-09-05,89.79022,0.0,0,95,16.150927,13.333333,711.33905,6590.45,1021.6,92.96599,0.113,74.99781


Unnamed: 0,timestamp,Cape Town Temperature [2 m elevation corrected],Cape Town Precipitation Total,Cape Town Snowfall Amount,Cape Town Relative Humidity [2 m],Cape Town Wind Speed [10 m],Cape Town Cloud Cover Total,Cape Town Sunshine Duration,Cape Town Shortwave Radiation,Cape Town Mean Sea Level Pressure [MSL],Cape Town Soil Temperature [0-10 cm down],Cape Town Soil Moisture [0-10 cm down]
0,1985-09-01,83.85659,0.0,0,99,4.457064,15.708333,675.83484,5135.3003,1017.0,75.56002,0.145
1,1985-09-02,67.00859,0.0,0,96,7.161691,93.0,95.744675,4807.7803,1015.1,67.046005,0.141
2,1985-09-03,63.67859,0.0,0,92,8.108701,65.666664,105.773056,2403.0,1020.8,62.636032,0.137
3,1985-09-04,69.58259,0.0,0,90,6.18706,52.82083,379.96277,5029.3896,1021.2,67.36999,0.14
4,1985-09-05,71.58059,0.0,0,100,7.644957,49.208332,423.07584,5007.1406,1023.0,69.94403,0.14


Unnamed: 0,count,%
timestamp,0,0.0
N_Temperature_m,0,0.0
N_Precipitation_Total,0,0.0
N_Snowfall_Amount,0,0.0
N_Relative_Humidity_m,0,0.0


(13150, 13)

(13150, 12)

(13, 2)

In [51]:
#Dropping NaN values
narirobi_edit = nairobi.dropna();
algiers_edit = nairobi.dropna();
cape_town_edit = nairobi.dropna();

In [52]:
afr.to_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\africa_clean.csv')
clim_edit.to_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\climate_clean.csv')
nrj_edit.to_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\energy_clean.csv')
glob_edit.to_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\global_clean.csv')
cont_africa_edit.to_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\continent_clean.csv')
nairobi.to_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\nairobi_clean.csv')
cape_town.to_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\cape_town_clean.csv')
algiers.to_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\algiers_clean.csv')

## Part 2: Load those datasets, select the necessary data and create one final starting table

In [53]:
#Load the data

os.chdir("C:\\Users\\moham\\OneDrive\\Desktop\\Other\\Bootcamp\\Capstone_2\\ProjectIdeas_Initial\\Processed_Cleaned")

afr=pd.read_csv('africa_clean.csv')
clim=pd.read_csv('climate_clean.csv')
nrj=pd.read_csv('energy_clean.csv')
glob=pd.read_csv('global_clean.csv')
cont_africa_edit=pd.read_csv('continent_clean.csv')
nairobi=pd.read_csv('nair.csv')
cape_town=pd.read_csv('cape.csv')
algiers=pd.read_csv('alg.csv')

In [54]:
# Extracting data starting in 1985 for the "NRJ" dataset
nrj_test = nrj[nrj['Year_extract']>1984].sort_values(by="Year")
nrj_test['Year_extract'].describe()
nrj_test['Indicator Name'].unique()
del nrj_test['Country Name']
del nrj_test['Indicator Code']
nrj_test
nrj_test_T = nrj_test.T
nrj_test_T

count     803.000000
mean     2002.415940
std         9.637124
min      1985.000000
25%      1994.500000
50%      2003.000000
75%      2010.000000
max      2020.000000
Name: Year_extract, dtype: float64

array(['Ores and metals exports (% of merchandise exports)',
       'Adjusted savings: mineral depletion (% of GNI)',
       'Adjusted savings: energy depletion (current US$)',
       'CO2 emissions from gaseous fuel consumption (% of total)',
       'Adjusted savings: energy depletion (% of GNI)',
       'Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent)',
       'Adjusted savings: natural resources depletion (% of GNI)',
       'Mineral rents (% of GDP)',
       'Energy related methane emissions (% of total)',
       'Nitrous oxide emissions in energy sector (% of total)',
       'Natural gas rents (% of GDP)', 'Oil rents (% of GDP)',
       'Methane emissions in energy sector (thousand metric tons of CO2 equivalent)',
       'Total natural resources rents (% of GDP)',
       'Fuel imports (% of merchandise imports)',
       'CO2 emissions from liquid fuel consumption (kt)',
       'Ores and metals imports (% of merchandise imports)',
       'Fuel expo

Unnamed: 0.1,Unnamed: 0,Country ISO3,Year,Indicator Name,Value,Year_extract
1071,1072,KEN,1985-01-01,Ores and metals exports (% of merchandise expo...,2.445062e+00,1985
564,565,KEN,1985-01-01,Adjusted savings: mineral depletion (% of GNI),1.940670e-04,1985
614,615,KEN,1985-01-01,Adjusted savings: energy depletion (current US$),0.000000e+00,1985
161,162,KEN,1985-01-01,CO2 emissions from gaseous fuel consumption (%...,0.000000e+00,1985
663,664,KEN,1985-01-01,Adjusted savings: energy depletion (% of GNI),0.000000e+00,1985
...,...,...,...,...,...,...
875,876,KEN,2019-01-01,Total natural resources rents (% of GDP),1.052402e+00,2019
451,452,KEN,2019-01-01,Time required to get electricity (days),9.700000e+01,2019
77,78,KEN,2019-01-01,Access to electricity (% of population),6.970000e+01,2019
925,926,KEN,2019-01-01,Fuel imports (% of merchandise imports),1.922293e+01,2019


Unnamed: 0,1071,564,614,161,663,366,712,761,317,406,...,777,1003,480,469,826,875,451,77,925,468
Unnamed: 0,1072,565,615,162,664,367,713,762,318,407,...,778,1004,481,470,827,876,452,78,926,469
Country ISO3,KEN,KEN,KEN,KEN,KEN,KEN,KEN,KEN,KEN,KEN,...,KEN,KEN,KEN,KEN,KEN,KEN,KEN,KEN,KEN,KEN
Year,1985-01-01,1985-01-01,1985-01-01,1985-01-01,1985-01-01,1985-01-01,1985-01-01,1985-01-01,1985-01-01,1985-01-01,...,2019-01-01,2019-01-01,2019-01-01,2019-01-01,2019-01-01,2019-01-01,2019-01-01,2019-01-01,2019-01-01,2020-01-01
Indicator Name,Ores and metals exports (% of merchandise expo...,Adjusted savings: mineral depletion (% of GNI),Adjusted savings: energy depletion (current US$),CO2 emissions from gaseous fuel consumption (%...,Adjusted savings: energy depletion (% of GNI),Nitrous oxide emissions in energy sector (thou...,Adjusted savings: natural resources depletion ...,Mineral rents (% of GDP),Energy related methane emissions (% of total),Nitrous oxide emissions in energy sector (% of...,...,Natural gas rents (% of GDP),Fuel exports (% of merchandise exports),Adjusted savings: mineral depletion (current US$),Investment in energy with private participatio...,Oil rents (% of GDP),Total natural resources rents (% of GDP),Time required to get electricity (days),Access to electricity (% of population),Fuel imports (% of merchandise imports),Investment in energy with private participatio...
Value,2.44506,0.000194067,0,0,0,361.91,0.000194067,0.000664198,24.2295,4.54833,...,0,7.73708,0,2.202e+08,0.0220733,1.0524,97,69.7,19.2229,1.4716e+08
Year_extract,1985,1985,1985,1985,1985,1985,1985,1985,1985,1985,...,2019,2019,2019,2019,2019,2019,2019,2019,2019,2020


In [55]:
# Extracting data starting in 1985 for the "clim" dataset
clim_test = clim[clim['Year_extract']>1984].sort_values(by="Year")
clim_test['Year_extract'].describe()
clim_test['Indicator Name'].unique()
del clim_test['Country Name']
del clim_test['Indicator Code']
clim_test

count    1317.000000
mean     2002.100987
std         9.417518
min      1985.000000
25%      1994.000000
50%      2002.000000
75%      2010.000000
max      2020.000000
Name: Year_extract, dtype: float64

array(['CO2 emissions from liquid fuel consumption (% of total)',
       'Mortality rate, under-5 (per 1,000 live births)',
       'School enrollment, primary and secondary (gross), gender parity index (GPI)',
       'CO2 emissions (kg per 2010 US$ of GDP)',
       'Agricultural land (sq. km)',
       'Methane emissions (kt of CO2 equivalent)',
       'CO2 intensity (kg per kg of oil equivalent energy use)',
       'Agriculture, forestry, and fishing, value added (% of GDP)',
       'Population growth (annual %)',
       'CO2 emissions from solid fuel consumption (kt)',
       'CO2 emissions from solid fuel consumption (% of total)',
       'Arable land (% of land area)',
       'Population in urban agglomerations of more than 1 million (% of total population)',
       'Cereal yield (kg per hectare)',
       'Total greenhouse gas emissions (kt of CO2 equivalent)',
       'CO2 emissions (metric tons per capita)', 'Population, total',
       'Urban population', 'Urban population (% of to

Unnamed: 0.1,Unnamed: 0,Country ISO3,Year,Indicator Name,Value,Year_extract
793,794,KEN,1985-01-01,CO2 emissions from liquid fuel consumption (% ...,8.268482e+01,1985
1594,1595,KEN,1985-01-01,"Mortality rate, under-5 (per 1,000 live births)",9.720000e+01,1985
1532,1533,KEN,1985-01-01,"School enrollment, primary and secondary (gros...",8.999400e-01,1985
620,621,KEN,1985-01-01,CO2 emissions (kg per 2010 US$ of GDP),2.278105e-01,1985
33,34,KEN,1985-01-01,Agricultural land (sq. km),2.635400e+05,1985
...,...,...,...,...,...,...
1815,1816,KEN,2020-01-01,Urban population,1.505328e+07,2020
1457,1458,KEN,2020-01-01,"Agriculture, forestry, and fishing, value adde...",3.515074e+01,2020
1359,1360,KEN,2020-01-01,Population in urban agglomerations of more tha...,1.121575e+01,2020
1634,1635,KEN,2020-01-01,Population growth (annual %),2.251879e+00,2020


In [58]:
clim_test_new = pd.read_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\clim_test.csv')

In [59]:
clim_export = pd.pivot_table(clim_test_new, index = ['Year'], columns = ['Indicator Name'], values = 'Value')

In [60]:
nrj_test_new = pd.read_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\energy_clean_test.csv')
nrj_test_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Year           362 non-null    object 
 1   IndicatorName  362 non-null    object 
 2   Value          362 non-null    float64
 3   Year_extract   362 non-null    int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 11.4+ KB


In [61]:
nrj_export = pd.pivot_table(nrj_test_new, index = ['Year'], columns = ['IndicatorName'], values = 'Value')

In [62]:
nrj_test_new.head()

Unnamed: 0,Year,IndicatorName,Value,Year_extract
0,1/1/1993,Access to electricity (% of population),10.9,1993
1,1/1/1994,Access to electricity (% of population),2.298415,1994
2,1/1/1995,Access to electricity (% of population),4.471946,1995
3,1/1/1996,Access to electricity (% of population),6.647871,1996
4,1/1/1997,Access to electricity (% of population),8.826791,1997


In [63]:
nrj_export.head()

IndicatorName,Access to electricity (% of population),"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",CO2 emissions from liquid fuel consumption (kt),Energy related methane emissions (% of total),Fuel exports (% of merchandise exports),Fuel imports (% of merchandise imports),Methane emissions in energy sector (thousand metric tons of CO2 equivalent),Mineral rents (% of GDP),Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent),Ores and metals exports (% of merchandise exports),Total natural resources rents (% of GDP)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1/1/1985,,,,3116.95,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726
1/1/1986,,,,3281.965,24.428353,11.419751,18.181069,4393.634721,0.003442,375.435773,2.050675,4.1205
1/1/1987,,,,4272.055,24.900715,13.561259,20.055016,4507.82615,0.017928,386.12076,2.597217,3.905213
1/1/1988,,,,3868.685,24.167097,,,4625.050677,0.002784,396.60011,,3.915231
1/1/1989,,,,4235.385,23.858318,,,4743.797113,0.001958,411.688596,,4.189091


In [64]:
nrj_latest = pd.read_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\energy_clean_export.csv')

clim_latest = pd.read_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\climate_clean_export.csv')

In [65]:
#Merge the two main datasets on the extracted "Year" column
merge_test = pd.merge(nrj_latest, clim_latest, how="left", on="Year")

In [66]:
merge_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 0 to 34
Data columns (total 72 columns):
 #   Column                                                                                      Non-Null Count  Dtype  
---  ------                                                                                      --------------  -----  
 0   Year                                                                                        35 non-null     object 
 1   Access to electricity (% of population)_x                                                   27 non-null     float64
 2   Access to electricity, rural (% of rural population)                                        24 non-null     float64
 3   Access to electricity, urban (% of urban population)                                        27 non-null     float64
 4   CO2 emissions from liquid fuel consumption (kt)_x                                           32 non-null     float64
 5   Energy related methane emissions (% of total)

In [67]:
merge_test['Year'] = pd.to_datetime(merge_test['Year'])

merge_test['Year_extract'] = merge_test['Year'].apply(lambda t: t.year)

merge_test.head()

Unnamed: 0,Year,Access to electricity (% of population)_x,"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",CO2 emissions from liquid fuel consumption (kt)_x,Energy related methane emissions (% of total),Fuel exports (% of merchandise exports),Fuel imports (% of merchandise imports),Methane emissions in energy sector (thousand metric tons of CO2 equivalent),Mineral rents (% of GDP),...,Terrestrial protected areas (% of total land area),Total greenhouse gas emissions (% change from 1990),Total greenhouse gas emissions (kt of CO2 equivalent),Urban land area where elevation is below 5 meters (% of total land area),Urban land area where elevation is below 5 meters (sq. km),Urban population,Urban population (% of total population),Urban population growth (annual %),Urban population living in areas where elevation is below 5 meters (% of total population),Year_extract
0,1985-01-01,,,,3116.95,24.229499,16.268263,31.681359,4277.887717,0.000664,...,,,33349.257,,,3196035.0,16.079,4.369085,,1985
1,1986-01-01,,,,3281.965,24.428353,11.419751,18.181069,4393.634721,0.003442,...,,,34675.608,,,3336730.0,16.18,4.308031,,1986
2,1987-01-01,,,,4272.055,24.900715,13.561259,20.055016,4507.82615,0.017928,...,,,34921.399,,,3481221.0,16.281,4.239181,,1987
3,1988-01-01,,,,3868.685,24.167097,,,4625.050677,0.002784,...,,,36807.31,,,3629438.0,16.383,4.169472,,1988
4,1989-01-01,,,,4235.385,23.858318,,,4743.797113,0.001958,...,,,38381.069,,,3780849.0,16.485,4.087077,,1989


In [68]:
merge_test_edit = pd.read_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\merge_test_edit.csv')

In [69]:
merge_test_edit['Year'] = pd.to_datetime(merge_test_edit['Year'])

In [71]:
# Resampling the dataset from Yearly to Daily
new_merge = merge_test_edit.set_index('Year').resample('D').pad();
new_merge

Unnamed: 0_level_0,CO2 emissions from liquid fuel consumption (kt),Energy related methane emissions (% of total),Fuel exports (% of merchandise exports),Fuel imports (% of merchandise imports),Methane emissions in energy sector (thousand metric tons of CO2 equivalent),Mineral rents (% of GDP),Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent),Ores and metals exports (% of merchandise exports),Total natural resources rents (% of GDP),Agricultural land (% of land area),...,Methane emissions (kt of CO2 equivalent),"Mortality rate, under-5 (per 1,000 live births)",Nitrous oxide emissions (thousand metric tons of CO2 equivalent),Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),"Population, total",Total greenhouse gas emissions (kt of CO2 equivalent),Urban population,Year_extract,Year_Count
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1985-01-01,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-02,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-03,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-04,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-05,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-28,13266.539,32.513605,7.737080,19.222932,11091.000000,1.004683,1431.00000,4.985782,1.052402,49.546930,...,40251.0,43.2,18331.000,2.272746,11.051462,52573967,78831.000,14461521,2019,365
2019-12-29,13266.539,32.513605,7.737080,19.222932,11091.000000,1.004683,1431.00000,4.985782,1.052402,49.546930,...,40251.0,43.2,18331.000,2.272746,11.051462,52573967,78831.000,14461521,2019,365
2019-12-30,13266.539,32.513605,7.737080,19.222932,11091.000000,1.004683,1431.00000,4.985782,1.052402,49.546930,...,40251.0,43.2,18331.000,2.272746,11.051462,52573967,78831.000,14461521,2019,365
2019-12-31,13266.539,32.513605,7.737080,19.222932,11091.000000,1.004683,1431.00000,4.985782,1.052402,49.546930,...,40251.0,43.2,18331.000,2.272746,11.051462,52573967,78831.000,14461521,2019,365


In [72]:
new = merge_test_edit.set_index('Year').resample('D').pad();
new.head()
#new=new[new.index.year != 2020]
new.info()
new.tail(40)

new['Year_extract'].unique()

df_new = new['CO2 emissions from liquid fuel consumption (kt)'].groupby(new.index.year).count()

new

Unnamed: 0_level_0,CO2 emissions from liquid fuel consumption (kt),Energy related methane emissions (% of total),Fuel exports (% of merchandise exports),Fuel imports (% of merchandise imports),Methane emissions in energy sector (thousand metric tons of CO2 equivalent),Mineral rents (% of GDP),Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent),Ores and metals exports (% of merchandise exports),Total natural resources rents (% of GDP),Agricultural land (% of land area),...,Methane emissions (kt of CO2 equivalent),"Mortality rate, under-5 (per 1,000 live births)",Nitrous oxide emissions (thousand metric tons of CO2 equivalent),Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),"Population, total",Total greenhouse gas emissions (kt of CO2 equivalent),Urban population,Year_extract,Year_Count
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1985-01-01,3116.95,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-02,3116.95,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-03,3116.95,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-04,3116.95,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-05,3116.95,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12784 entries, 1985-01-01 to 2020-01-01
Freq: D
Data columns (total 26 columns):
 #   Column                                                                             Non-Null Count  Dtype  
---  ------                                                                             --------------  -----  
 0   CO2 emissions from liquid fuel consumption (kt)                                    12784 non-null  float64
 1   Energy related methane emissions (% of total)                                      12784 non-null  float64
 2   Fuel exports (% of merchandise exports)                                            12784 non-null  float64
 3   Fuel imports (% of merchandise imports)                                            12784 non-null  float64
 4   Methane emissions in energy sector (thousand metric tons of CO2 equivalent)        12784 non-null  float64
 5   Mineral rents (% of GDP)                                                     

Unnamed: 0_level_0,CO2 emissions from liquid fuel consumption (kt),Energy related methane emissions (% of total),Fuel exports (% of merchandise exports),Fuel imports (% of merchandise imports),Methane emissions in energy sector (thousand metric tons of CO2 equivalent),Mineral rents (% of GDP),Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent),Ores and metals exports (% of merchandise exports),Total natural resources rents (% of GDP),Agricultural land (% of land area),...,Methane emissions (kt of CO2 equivalent),"Mortality rate, under-5 (per 1,000 live births)",Nitrous oxide emissions (thousand metric tons of CO2 equivalent),Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),"Population, total",Total greenhouse gas emissions (kt of CO2 equivalent),Urban population,Year_extract,Year_Count
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-11-23,13266.539,32.513605,7.73708,19.222932,11091.0,1.004683,1431.0,4.985782,1.052402,49.54693,...,40251.0,43.2,18331.0,2.272746,11.051462,52573967,78831.0,14461521,2019,365
2019-11-24,13266.539,32.513605,7.73708,19.222932,11091.0,1.004683,1431.0,4.985782,1.052402,49.54693,...,40251.0,43.2,18331.0,2.272746,11.051462,52573967,78831.0,14461521,2019,365
2019-11-25,13266.539,32.513605,7.73708,19.222932,11091.0,1.004683,1431.0,4.985782,1.052402,49.54693,...,40251.0,43.2,18331.0,2.272746,11.051462,52573967,78831.0,14461521,2019,365
2019-11-26,13266.539,32.513605,7.73708,19.222932,11091.0,1.004683,1431.0,4.985782,1.052402,49.54693,...,40251.0,43.2,18331.0,2.272746,11.051462,52573967,78831.0,14461521,2019,365
2019-11-27,13266.539,32.513605,7.73708,19.222932,11091.0,1.004683,1431.0,4.985782,1.052402,49.54693,...,40251.0,43.2,18331.0,2.272746,11.051462,52573967,78831.0,14461521,2019,365
2019-11-28,13266.539,32.513605,7.73708,19.222932,11091.0,1.004683,1431.0,4.985782,1.052402,49.54693,...,40251.0,43.2,18331.0,2.272746,11.051462,52573967,78831.0,14461521,2019,365
2019-11-29,13266.539,32.513605,7.73708,19.222932,11091.0,1.004683,1431.0,4.985782,1.052402,49.54693,...,40251.0,43.2,18331.0,2.272746,11.051462,52573967,78831.0,14461521,2019,365
2019-11-30,13266.539,32.513605,7.73708,19.222932,11091.0,1.004683,1431.0,4.985782,1.052402,49.54693,...,40251.0,43.2,18331.0,2.272746,11.051462,52573967,78831.0,14461521,2019,365
2019-12-01,13266.539,32.513605,7.73708,19.222932,11091.0,1.004683,1431.0,4.985782,1.052402,49.54693,...,40251.0,43.2,18331.0,2.272746,11.051462,52573967,78831.0,14461521,2019,365
2019-12-02,13266.539,32.513605,7.73708,19.222932,11091.0,1.004683,1431.0,4.985782,1.052402,49.54693,...,40251.0,43.2,18331.0,2.272746,11.051462,52573967,78831.0,14461521,2019,365


array([1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995,
       1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
       2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,
       2018, 2019, 2020], dtype=int64)

Unnamed: 0_level_0,CO2 emissions from liquid fuel consumption (kt),Energy related methane emissions (% of total),Fuel exports (% of merchandise exports),Fuel imports (% of merchandise imports),Methane emissions in energy sector (thousand metric tons of CO2 equivalent),Mineral rents (% of GDP),Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent),Ores and metals exports (% of merchandise exports),Total natural resources rents (% of GDP),Agricultural land (% of land area),...,Methane emissions (kt of CO2 equivalent),"Mortality rate, under-5 (per 1,000 live births)",Nitrous oxide emissions (thousand metric tons of CO2 equivalent),Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),"Population, total",Total greenhouse gas emissions (kt of CO2 equivalent),Urban population,Year_extract,Year_Count
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1985-01-01,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-02,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-03,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-04,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-05,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-28,13266.539,32.513605,7.737080,19.222932,11091.000000,1.004683,1431.00000,4.985782,1.052402,49.546930,...,40251.0,43.2,18331.000,2.272746,11.051462,52573967,78831.000,14461521,2019,365
2019-12-29,13266.539,32.513605,7.737080,19.222932,11091.000000,1.004683,1431.00000,4.985782,1.052402,49.546930,...,40251.0,43.2,18331.000,2.272746,11.051462,52573967,78831.000,14461521,2019,365
2019-12-30,13266.539,32.513605,7.737080,19.222932,11091.000000,1.004683,1431.00000,4.985782,1.052402,49.546930,...,40251.0,43.2,18331.000,2.272746,11.051462,52573967,78831.000,14461521,2019,365
2019-12-31,13266.539,32.513605,7.737080,19.222932,11091.000000,1.004683,1431.00000,4.985782,1.052402,49.546930,...,40251.0,43.2,18331.000,2.272746,11.051462,52573967,78831.000,14461521,2019,365


In [73]:
new_merge_edit = new_merge[new_merge['Year_extract'] != 2020]
new_merge_edit

Unnamed: 0_level_0,CO2 emissions from liquid fuel consumption (kt),Energy related methane emissions (% of total),Fuel exports (% of merchandise exports),Fuel imports (% of merchandise imports),Methane emissions in energy sector (thousand metric tons of CO2 equivalent),Mineral rents (% of GDP),Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent),Ores and metals exports (% of merchandise exports),Total natural resources rents (% of GDP),Agricultural land (% of land area),...,Methane emissions (kt of CO2 equivalent),"Mortality rate, under-5 (per 1,000 live births)",Nitrous oxide emissions (thousand metric tons of CO2 equivalent),Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),"Population, total",Total greenhouse gas emissions (kt of CO2 equivalent),Urban population,Year_extract,Year_Count
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1985-01-01,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-02,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-03,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-04,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1985-01-05,3116.950,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-27,13266.539,32.513605,7.737080,19.222932,11091.000000,1.004683,1431.00000,4.985782,1.052402,49.546930,...,40251.0,43.2,18331.000,2.272746,11.051462,52573967,78831.000,14461521,2019,365
2019-12-28,13266.539,32.513605,7.737080,19.222932,11091.000000,1.004683,1431.00000,4.985782,1.052402,49.546930,...,40251.0,43.2,18331.000,2.272746,11.051462,52573967,78831.000,14461521,2019,365
2019-12-29,13266.539,32.513605,7.737080,19.222932,11091.000000,1.004683,1431.00000,4.985782,1.052402,49.546930,...,40251.0,43.2,18331.000,2.272746,11.051462,52573967,78831.000,14461521,2019,365
2019-12-30,13266.539,32.513605,7.737080,19.222932,11091.000000,1.004683,1431.00000,4.985782,1.052402,49.546930,...,40251.0,43.2,18331.000,2.272746,11.051462,52573967,78831.000,14461521,2019,365


In [74]:
merge_test_edit.info()
merge_test_edit = merge_test_edit[merge_test_edit['Year_extract'] != 2020]
merge_test_edit
test = []

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 27 columns):
 #   Column                                                                             Non-Null Count  Dtype         
---  ------                                                                             --------------  -----         
 0   Year                                                                               36 non-null     datetime64[ns]
 1   CO2 emissions from liquid fuel consumption (kt)                                    36 non-null     float64       
 2   Energy related methane emissions (% of total)                                      36 non-null     float64       
 3   Fuel exports (% of merchandise exports)                                            36 non-null     float64       
 4   Fuel imports (% of merchandise imports)                                            36 non-null     float64       
 5   Methane emissions in energy sector (thousand metric tons of

Unnamed: 0,Year,CO2 emissions from liquid fuel consumption (kt),Energy related methane emissions (% of total),Fuel exports (% of merchandise exports),Fuel imports (% of merchandise imports),Methane emissions in energy sector (thousand metric tons of CO2 equivalent),Mineral rents (% of GDP),Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent),Ores and metals exports (% of merchandise exports),Total natural resources rents (% of GDP),...,Methane emissions (kt of CO2 equivalent),"Mortality rate, under-5 (per 1,000 live births)",Nitrous oxide emissions (thousand metric tons of CO2 equivalent),Population growth (annual %),Population in urban agglomerations of more than 1 million (% of total population),"Population, total",Total greenhouse gas emissions (kt of CO2 equivalent),Urban population,Year_extract,Year_Count
0,1985-01-01,3116.95,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,...,17655.7,97.2,7956.987,3.745234,7.532158,19877078,33349.257,3196035,1985,365
1,1986-01-01,3281.965,24.428353,11.419751,18.181069,4393.634721,0.003442,375.435773,2.050675,4.1205,...,17985.8,96.3,8221.448,3.681841,7.574957,20622560,34675.608,3336730,1986,365
2,1987-01-01,4272.055,24.900715,13.561259,20.055016,4507.82615,0.017928,386.12076,2.597217,3.905213,...,18103.2,96.3,8209.079,3.616902,7.623368,21382111,34921.399,3481221,1987,365
3,1988-01-01,3868.685,24.167097,13.407958,19.997322,4625.050677,0.002784,396.60011,2.699771,3.915231,...,19137.8,97.2,8815.16,3.544921,7.678479,22153685,36807.31,3629438,1988,366
4,1989-01-01,4235.385,23.858318,13.254658,19.939629,4743.797113,0.001958,411.688596,2.802326,4.189091,...,19883.2,99.0,9127.609,3.466412,7.739582,22935088,38381.069,3780849,1989,365
5,1990-01-01,4668.091,22.114497,13.101358,19.881935,4790.0,0.003664,590.0,2.90488,4.963899,...,21660.0,101.5,10670.0,3.384342,7.824554,23724574,38490.0,3973392,1990,365
6,1991-01-01,3777.01,23.190476,16.787404,18.622451,4870.0,0.001388,600.0,3.306119,5.198748,...,21000.0,104.3,10330.0,3.304762,7.919813,24521714,37250.0,4179236,1991,365
7,1992-01-01,4367.397,23.568702,11.315511,25.153406,4940.0,0.000731,610.0,2.370971,5.377811,...,20960.0,107.2,10290.0,3.227568,8.023129,25326080,37250.0,4392049,1992,366
8,1993-01-01,5291.481,23.936933,9.37528,14.922103,5010.0,0.009899,630.0,2.880214,6.771411,...,20930.0,109.4,10200.0,3.148728,8.133396,26136217,37050.0,4611735,1993,365
9,1994-01-01,5540.837,24.133017,4.122781,12.529369,5080.0,0.010335,640.0,2.604813,6.337798,...,21050.0,110.8,10340.0,3.068017,8.252561,26950508,37310.0,4838155,1994,365


In [75]:
list_var = ['CO2 emissions from liquid fuel consumption (kt)',
       'Energy related methane emissions (% of total)',
       'Fuel exports (% of merchandise exports)',
       'Fuel imports (% of merchandise imports)',
       'Methane emissions in energy sector (thousand metric tons of CO2 equivalent)',
       'Mineral rents (% of GDP)',
       'Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent)',
       'Ores and metals exports (% of merchandise exports)',
       'Total natural resources rents (% of GDP)',
       'Agricultural land (% of land area)', 'Agricultural land (sq. km)',
       'Agriculture, forestry, and fishing, value added (% of GDP)',
       'Arable land (% of land area)', 'CO2 emissions (kt)',
       'CO2 emissions (metric tons per capita)',
       'Foreign direct investment, net inflows (% of GDP)',
       'Methane emissions (kt of CO2 equivalent)',
       'Mortality rate, under-5 (per 1,000 live births)',
       'Nitrous oxide emissions (thousand metric tons of CO2 equivalent)',
       'Population growth (annual %)',
       'Population in urban agglomerations of more than 1 million (% of total population)',
       'Population, total',
       'Total greenhouse gas emissions (kt of CO2 equivalent)',
       'Urban population']

In [76]:
for columns in list_var:
    col_name = columns+'_new'
    subset_df = merge_test_edit[[columns, 'Year_Count']]
    test=[]
    for i, row in subset_df.iterrows():
        #print(row[columns], row['Year_Count'])
        group = np.random.normal(loc=row[columns], scale=2, size=int(row['Year_Count']))
        test.extend(group)
    new_merge_edit[col_name]=test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_merge_edit[col_name]=test


In [77]:
new_merge_edit.to_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\new_merge_edit.csv')

In [78]:
new_merge_edit.head(); new_merge_edit.columns

Unnamed: 0_level_0,CO2 emissions from liquid fuel consumption (kt),Energy related methane emissions (% of total),Fuel exports (% of merchandise exports),Fuel imports (% of merchandise imports),Methane emissions in energy sector (thousand metric tons of CO2 equivalent),Mineral rents (% of GDP),Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent),Ores and metals exports (% of merchandise exports),Total natural resources rents (% of GDP),Agricultural land (% of land area),...,CO2 emissions (metric tons per capita)_new,"Foreign direct investment, net inflows (% of GDP)_new",Methane emissions (kt of CO2 equivalent)_new,"Mortality rate, under-5 (per 1,000 live births)_new",Nitrous oxide emissions (thousand metric tons of CO2 equivalent)_new,Population growth (annual %)_new,Population in urban agglomerations of more than 1 million (% of total population)_new,"Population, total_new",Total greenhouse gas emissions (kt of CO2 equivalent)_new,Urban population_new
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1985-01-01,3116.95,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,0.437018,0.923588,17657.133637,96.782995,7955.981916,1.689694,1.915901,19877070.0,33352.033633,3196038.0
1985-01-02,3116.95,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,-1.868369,1.471601,17654.146536,95.662664,7955.300605,0.133625,10.440757,19877080.0,33351.341251,3196036.0
1985-01-03,3116.95,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,-0.068019,-0.191795,17657.491893,97.957271,7953.785986,6.497908,11.031042,19877080.0,33348.449683,3196036.0
1985-01-04,3116.95,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,-0.616678,-0.349419,17654.334317,101.462477,7952.455543,5.496535,7.806705,19877080.0,33349.542086,3196033.0
1985-01-05,3116.95,24.229499,16.268263,31.681359,4277.887717,0.000664,361.91036,2.445062,3.248726,46.304951,...,-1.877565,-1.747107,17654.37758,101.075995,7960.853469,2.209685,4.26236,19877080.0,33350.253949,3196037.0


Index(['CO2 emissions from liquid fuel consumption (kt)',
       'Energy related methane emissions (% of total)',
       'Fuel exports (% of merchandise exports)',
       'Fuel imports (% of merchandise imports)',
       'Methane emissions in energy sector (thousand metric tons of CO2 equivalent)',
       'Mineral rents (% of GDP)',
       'Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent)',
       'Ores and metals exports (% of merchandise exports)',
       'Total natural resources rents (% of GDP)',
       'Agricultural land (% of land area)', 'Agricultural land (sq. km)',
       'Agriculture, forestry, and fishing, value added (% of GDP)',
       'Arable land (% of land area)', 'CO2 emissions (kt)',
       'CO2 emissions (metric tons per capita)',
       'Foreign direct investment, net inflows (% of GDP)',
       'Methane emissions (kt of CO2 equivalent)',
       'Mortality rate, under-5 (per 1,000 live births)',
       'Nitrous oxide emissions (thous

In [79]:
new_merge_edit.drop(['CO2 emissions from liquid fuel consumption (kt)',
       'Energy related methane emissions (% of total)',
       'Fuel exports (% of merchandise exports)',
       'Fuel imports (% of merchandise imports)',
       'Methane emissions in energy sector (thousand metric tons of CO2 equivalent)',
       'Mineral rents (% of GDP)',
       'Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent)',
       'Ores and metals exports (% of merchandise exports)',
       'Total natural resources rents (% of GDP)',
       'Agricultural land (% of land area)', 'Agricultural land (sq. km)',
       'Agriculture, forestry, and fishing, value added (% of GDP)',
       'Arable land (% of land area)', 'CO2 emissions (kt)',
       'CO2 emissions (metric tons per capita)',
       'Foreign direct investment, net inflows (% of GDP)',
       'Methane emissions (kt of CO2 equivalent)',
       'Mortality rate, under-5 (per 1,000 live births)',
       'Nitrous oxide emissions (thousand metric tons of CO2 equivalent)',
       'Population growth (annual %)',
       'Population in urban agglomerations of more than 1 million (% of total population)',
       'Population, total',
       'Total greenhouse gas emissions (kt of CO2 equivalent)',
       'Urban population'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [80]:
new_merge_edit.reset_index(level=0, inplace=True)

In [81]:
new_merge_edit.drop(['Year_Count'], axis=1, inplace=True); new_merge_edit.head(); new_merge_edit.columns; new_merge_edit.info()

Unnamed: 0,Year,Year_extract,CO2 emissions from liquid fuel consumption (kt)_new,Energy related methane emissions (% of total)_new,Fuel exports (% of merchandise exports)_new,Fuel imports (% of merchandise imports)_new,Methane emissions in energy sector (thousand metric tons of CO2 equivalent)_new,Mineral rents (% of GDP)_new,Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent)_new,Ores and metals exports (% of merchandise exports)_new,...,CO2 emissions (metric tons per capita)_new,"Foreign direct investment, net inflows (% of GDP)_new",Methane emissions (kt of CO2 equivalent)_new,"Mortality rate, under-5 (per 1,000 live births)_new",Nitrous oxide emissions (thousand metric tons of CO2 equivalent)_new,Population growth (annual %)_new,Population in urban agglomerations of more than 1 million (% of total population)_new,"Population, total_new",Total greenhouse gas emissions (kt of CO2 equivalent)_new,Urban population_new
0,1985-01-01,1985,3115.500072,22.296694,17.344988,27.385702,4274.59787,0.189132,362.269054,5.832286,...,0.437018,0.923588,17657.133637,96.782995,7955.981916,1.689694,1.915901,19877070.0,33352.033633,3196038.0
1,1985-01-02,1985,3113.773235,24.652801,17.87235,34.392681,4275.776764,1.363553,362.467593,3.231171,...,-1.868369,1.471601,17654.146536,95.662664,7955.300605,0.133625,10.440757,19877080.0,33351.341251,3196036.0
2,1985-01-03,1985,3117.45308,23.997819,15.646974,31.639511,4278.109493,0.265509,363.018825,0.921773,...,-0.068019,-0.191795,17657.491893,97.957271,7953.785986,6.497908,11.031042,19877080.0,33348.449683,3196036.0
3,1985-01-04,1985,3116.046568,26.25373,15.759304,30.3382,4275.847062,1.492468,365.042283,6.825567,...,-0.616678,-0.349419,17654.334317,101.462477,7952.455543,5.496535,7.806705,19877080.0,33349.542086,3196033.0
4,1985-01-05,1985,3118.102532,24.893927,11.565153,31.588374,4280.60782,0.245933,360.890276,-0.076119,...,-1.877565,-1.747107,17654.37758,101.075995,7960.853469,2.209685,4.26236,19877080.0,33350.253949,3196037.0


Index(['Year', 'Year_extract',
       'CO2 emissions from liquid fuel consumption (kt)_new',
       'Energy related methane emissions (% of total)_new',
       'Fuel exports (% of merchandise exports)_new',
       'Fuel imports (% of merchandise imports)_new',
       'Methane emissions in energy sector (thousand metric tons of CO2 equivalent)_new',
       'Mineral rents (% of GDP)_new',
       'Nitrous oxide emissions in energy sector (thousand metric tons of CO2 equivalent)_new',
       'Ores and metals exports (% of merchandise exports)_new',
       'Total natural resources rents (% of GDP)_new',
       'Agricultural land (% of land area)_new',
       'Agricultural land (sq. km)_new',
       'Agriculture, forestry, and fishing, value added (% of GDP)_new',
       'Arable land (% of land area)_new', 'CO2 emissions (kt)_new',
       'CO2 emissions (metric tons per capita)_new',
       'Foreign direct investment, net inflows (% of GDP)_new',
       'Methane emissions (kt of CO2 equivale

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12783 entries, 0 to 12782
Data columns (total 26 columns):
 #   Column                                                                                 Non-Null Count  Dtype         
---  ------                                                                                 --------------  -----         
 0   Year                                                                                   12783 non-null  datetime64[ns]
 1   Year_extract                                                                           12783 non-null  int64         
 2   CO2 emissions from liquid fuel consumption (kt)_new                                    12783 non-null  float64       
 3   Energy related methane emissions (% of total)_new                                      12783 non-null  float64       
 4   Fuel exports (% of merchandise exports)_new                                            12783 non-null  float64       
 5   Fuel imports (% of mercha

In [84]:
new_merge_edit_final = new_merge_edit.loc[(new_merge_edit['Year']>'31/08/1985')]

In [85]:
new_merge_edit_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12540 entries, 243 to 12782
Data columns (total 26 columns):
 #   Column                                                                                 Non-Null Count  Dtype         
---  ------                                                                                 --------------  -----         
 0   Year                                                                                   12540 non-null  datetime64[ns]
 1   Year_extract                                                                           12540 non-null  int64         
 2   CO2 emissions from liquid fuel consumption (kt)_new                                    12540 non-null  float64       
 3   Energy related methane emissions (% of total)_new                                      12540 non-null  float64       
 4   Fuel exports (% of merchandise exports)_new                                            12540 non-null  float64       
 5   Fuel imports (% of merc

In [87]:
new_merge_edit_final.to_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\new_merge_final.csv')

In [88]:
new_merge_edit_final['Year'].describe()

  new_merge_edit_final['Year'].describe()


count                   12540
unique                  12540
top       1989-12-27 00:00:00
freq                        1
first     1985-09-01 00:00:00
last      2019-12-31 00:00:00
Name: Year, dtype: object

In [89]:
nairobi['timestamp'] = pd.to_datetime(nairobi['timestamp'])
nairobi.info(); nairobi.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13150 entries, 0 to 13149
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   timestamp                      13150 non-null  datetime64[ns]
 1   N_Temperature_m                13150 non-null  float64       
 2   N_Precipitation_Total          13150 non-null  float64       
 3   N_Snowfall_Amount              13150 non-null  int64         
 4   N_Relative_Humidity_m          13150 non-null  int64         
 5   N_Wind_Speed_10m               13150 non-null  float64       
 6   N_Cloud_Cover_Total            13150 non-null  float64       
 7   N_Sunshine_Duration            13150 non-null  float64       
 8   N_Shortwave_Radiation          13150 non-null  float64       
 9   N_Mean_Sea_Level_Pressure      13150 non-null  float64       
 10  N_Soil_Temperature_10cm        13150 non-null  float64       
 11  N_Soil_Moisture

Unnamed: 0,timestamp,N_Temperature_m,N_Precipitation_Total,N_Snowfall_Amount,N_Relative_Humidity_m,N_Wind_Speed_10m,N_Cloud_Cover_Total,N_Sunshine_Duration,N_Shortwave_Radiation,N_Mean_Sea_Level_Pressure,N_Soil_Temperature_10cm,N_Soil_Moisture_10cm,N_Wind_Direction_Dominant_10m,Year_extract
0,1985-09-01,69.62904,0.0,0,84,10.398747,93.291664,95.106384,3090.08,1017.5,74.138,0.172,112.53505,1985
1,1985-09-02,71.60905,0.0,0,86,11.937781,86.625,188.29787,3262.7397,1017.8,75.27202,0.172,110.25738,1985
2,1985-09-03,71.64504,0.0,0,88,10.664815,96.625,47.872337,3485.24,1018.3,76.26199,0.193,108.77132,1985
3,1985-09-04,70.79904,0.0,0,85,8.518,94.166664,81.702126,2620.1597,1018.6,73.52602,0.202,107.01516,1985
4,1985-09-05,73.30104,0.0,0,86,10.187275,68.375,426.11063,4981.33,1017.1,78.72798,0.202,106.76452,1985


In [90]:
nairobi_edit = nairobi.loc[(nairobi['timestamp']>'31/08/1985') & (nairobi['timestamp']<'01/01/2020')]
nairobi_edit['timestamp'].describe()

  nairobi_edit['timestamp'].describe()


count                   12540
unique                  12540
top       1989-12-27 00:00:00
freq                        1
first     1985-09-01 00:00:00
last      2019-12-31 00:00:00
Name: timestamp, dtype: object

In [91]:
algiers['timestamp'] = pd.to_datetime(algiers['timestamp']);
algiers_edit = algiers.loc[(algiers['timestamp']>'31/08/1985') & (algiers['timestamp']<'01/01/2020')]
algiers_edit['timestamp'].describe()

  algiers_edit['timestamp'].describe()


count                   12540
unique                  12540
top       1989-12-27 00:00:00
freq                        1
first     1985-09-01 00:00:00
last      2019-12-31 00:00:00
Name: timestamp, dtype: object

In [92]:
cape_town['timestamp'] = pd.to_datetime(cape_town['timestamp']);
cape_town_edit = cape_town.loc[(cape_town['timestamp']>'31/08/1985') & (cape_town['timestamp']<'01/01/2020')]
cape_town_edit['timestamp'].describe()

  cape_town_edit['timestamp'].describe()


count                   12540
unique                  12540
top       1989-12-27 00:00:00
freq                        1
first     1985-09-01 00:00:00
last      2019-12-31 00:00:00
Name: timestamp, dtype: object

In [93]:
afr_merge_1 = pd.merge(nairobi_edit, algiers_edit, how="left", on="timestamp")
afr_merge_2 = pd.merge(afr_merge_1, cape_town, how="left", on="timestamp")
afr_merge_2.head(); afr_merge_2.info()

Unnamed: 0,timestamp,N_Temperature_m,N_Precipitation_Total,N_Snowfall_Amount,N_Relative_Humidity_m,N_Wind_Speed_10m,N_Cloud_Cover_Total,N_Sunshine_Duration,N_Shortwave_Radiation,N_Mean_Sea_Level_Pressure,...,CT_Relative_Humidity_m,CT_Wind_Speed_10m,CT_Cloud_Cover_Total,CT_Sunshine_Duration,CT_Shortwave_Radiation,CT_Mean_Sea_Level_Pressure,CT_Soil_Temperature_10cm,CT_Soil_Moisture_10cm,CT_Wind_Direction_Dominant_10m,Year_extract_y
0,1985-09-01,69.62904,0.0,0,84,10.398747,93.291664,95.106384,3090.08,1017.5,...,99,4.457064,15.708333,675.83484,5135.3003,1017.0,75.56002,0.145,292.47943,1985
1,1985-09-02,71.60905,0.0,0,86,11.937781,86.625,188.29787,3262.7397,1017.8,...,96,7.161691,93.0,95.744675,4807.7803,1015.1,67.046005,0.141,288.88968,1985
2,1985-09-03,71.64504,0.0,0,88,10.664815,96.625,47.872337,3485.24,1018.3,...,92,8.108701,65.666664,105.773056,2403.0,1020.8,62.636032,0.137,305.56384,1985
3,1985-09-04,70.79904,0.0,0,85,8.518,94.166664,81.702126,2620.1597,1018.6,...,90,6.18706,52.82083,379.96277,5029.3896,1021.2,67.36999,0.14,245.22487,1985
4,1985-09-05,73.30104,0.0,0,86,10.187275,68.375,426.11063,4981.33,1017.1,...,100,7.644957,49.208332,423.07584,5007.1406,1023.0,69.94403,0.14,262.43716,1985


<class 'pandas.core.frame.DataFrame'>
Int64Index: 12540 entries, 0 to 12539
Data columns (total 39 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   timestamp                       12540 non-null  datetime64[ns]
 1   N_Temperature_m                 12540 non-null  float64       
 2   N_Precipitation_Total           12540 non-null  float64       
 3   N_Snowfall_Amount               12540 non-null  int64         
 4   N_Relative_Humidity_m           12540 non-null  int64         
 5   N_Wind_Speed_10m                12540 non-null  float64       
 6   N_Cloud_Cover_Total             12540 non-null  float64       
 7   N_Sunshine_Duration             12540 non-null  float64       
 8   N_Shortwave_Radiation           12540 non-null  float64       
 9   N_Mean_Sea_Level_Pressure       12540 non-null  float64       
 10  N_Soil_Temperature_10cm         12540 non-null  float64       
 11  N_

In [94]:
new_merge_edit_final = new_merge_edit_final.rename(columns={"Year":"timestamp"})
new_merge_edit_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12540 entries, 243 to 12782
Data columns (total 26 columns):
 #   Column                                                                                 Non-Null Count  Dtype         
---  ------                                                                                 --------------  -----         
 0   timestamp                                                                              12540 non-null  datetime64[ns]
 1   Year_extract                                                                           12540 non-null  int64         
 2   CO2 emissions from liquid fuel consumption (kt)_new                                    12540 non-null  float64       
 3   Energy related methane emissions (% of total)_new                                      12540 non-null  float64       
 4   Fuel exports (% of merchandise exports)_new                                            12540 non-null  float64       
 5   Fuel imports (% of merc

In [95]:
table_final = pd.merge(new_merge_edit_final, afr_merge_2, how="left", on='timestamp')

In [96]:
table_final.to_csv(r'C:\Users\moham\OneDrive\Desktop\Other\Bootcamp\Capstone_2\ProjectIdeas_Initial\Processed_Cleaned\table_final.csv')