# CO2 and GDP datasets Merging, Processing and EDA
This notebook merges and processes the per-country GDP and CO2 emissions datasets stored in `../data/processed`.

## Step 0: Import and Read Data

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook-wide presentation settings: ggplot look for figures, and allow up to
# 100 columns to be shown when a DataFrame is displayed.
plt.style.use('ggplot')
pd.options.display.max_columns = 100

In [2]:
# Load the two preprocessed per-country tables. Both paths are relative, so they
# resolve against the notebook's working directory.
df_gdp = pd.read_csv('../data/processed/gdp_countries.csv')
df_co2_emissions = pd.read_csv('../data/processed/co2_emissions_countries.csv')

 ---

## Step 1: Data Preparation
Quick inspection

### 1.1 GDP Dataframe

In [3]:
# Optional: uncomment to inspect dtypes and non-null counts of the GDP table.
# df_gdp.info()

In [4]:
# Preview the first five rows of the GDP table.
df_gdp.head(n=5)

Unnamed: 0,Country Name,Country Code,Year,GDP,GDP per capita,GDP growth (annual %),Continent
0,Afghanistan,AFG,2000,6206548000.0,1617.826475,,Asia
1,Afghanistan,AFG,2001,5621148000.0,1454.110782,-9.431974,Asia
2,Afghanistan,AFG,2002,7228796000.0,1774.308743,28.600001,Asia
3,Afghanistan,AFG,2003,7867263000.0,1815.9282,8.832278,Asia
4,Afghanistan,AFG,2004,7978516000.0,1776.918207,1.414118,Asia


### 1.2 CO2 emissions Dataframe

In [5]:
# Optional: uncomment to inspect dtypes and non-null counts of the CO2 table.
# df_co2_emissions.info()

In [6]:
# Preview the first five rows of the CO2 emissions table.
df_co2_emissions.head(n=5)

Unnamed: 0,Entity,Code,Year,CO2 emissions per capita,Total CO2 emissions
0,Afghanistan,AFG,1949,0.001992,14656.0
1,Afghanistan,AFG,1950,0.010837,84272.0
2,Afghanistan,AFG,1951,0.011625,91600.0
3,Afghanistan,AFG,1952,0.011468,91600.0
4,Afghanistan,AFG,1953,0.013123,106256.0


 ---
### 1.3 Dataset merging
Merge the two datasets into `df_gdp_co2_merged`.

In [7]:
# Inner-join the GDP and CO2 tables on (country code, year), keeping only the
# pairs present in both. The code column is named differently in each table,
# hence the explicit left/right key lists.
df_gdp_co2_merged = df_gdp.merge(
    df_co2_emissions,
    how='inner',
    left_on=['Country Code', 'Year'],
    right_on=['Code', 'Year'],
    suffixes=('_gdp', '_co2'),
)
df_gdp_co2_merged.info()

<class 'pandas.DataFrame'>
RangeIndex: 10949 entries, 0 to 10948
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Country Name              10949 non-null  str    
 1   Country Code              10949 non-null  str    
 2   Year                      10949 non-null  int64  
 3   GDP                       10868 non-null  float64
 4   GDP per capita            6658 non-null   float64
 5   GDP growth (annual %)     10759 non-null  float64
 6   Continent                 10631 non-null  str    
 7   Entity                    10949 non-null  str    
 8   Code                      10949 non-null  str    
 9   CO2 emissions per capita  10949 non-null  float64
 10  Total CO2 emissions       10949 non-null  float64
dtypes: float64(5), int64(1), str(5)
memory usage: 941.1 KB


In [8]:
# Preview the first ten merged rows.
df_gdp_co2_merged.head(n=10)

Unnamed: 0,Country Name,Country Code,Year,GDP,GDP per capita,GDP growth (annual %),Continent,Entity,Code,CO2 emissions per capita,Total CO2 emissions
0,Afghanistan,AFG,2000,6206548000.0,1617.826475,,Asia,Afghanistan,AFG,0.052017,1047127.94
1,Afghanistan,AFG,2001,5621148000.0,1454.110782,-9.431974,Asia,Afghanistan,AFG,0.052706,1069098.0
2,Afghanistan,AFG,2002,7228796000.0,1774.308743,28.600001,Asia,Afghanistan,AFG,0.062731,1341065.0
3,Afghanistan,AFG,2003,7867263000.0,1815.9282,8.832278,Asia,Afghanistan,AFG,0.068608,1559679.0
4,Afghanistan,AFG,2004,7978516000.0,1776.918207,1.414118,Asia,Afghanistan,AFG,0.052513,1237247.0
5,Afghanistan,AFG,2005,8874480000.0,1908.114782,11.229715,Asia,Afghanistan,AFG,0.077424,1889507.0
6,Afghanistan,AFG,2006,9349922000.0,1929.723897,5.357403,Asia,Afghanistan,AFG,0.084932,2159318.0
7,Afghanistan,AFG,2007,10642670000.0,2155.353068,13.82632,Asia,Afghanistan,AFG,0.108063,2799909.0
8,Afghanistan,AFG,2008,11060400000.0,2191.504356,3.924984,Asia,Afghanistan,AFG,0.160652,4254490.0
9,Afghanistan,AFG,2009,13426270000.0,2565.022086,21.390528,Asia,Afghanistan,AFG,0.232586,6388232.0


Check country names

In [9]:
# Compare the two name columns produced by the merge: 'Country Name' comes from
# the GDP table and 'Entity' from the CO2 table.
unique_country_names = df_gdp_co2_merged['Country Name'].unique()
unique_entity_names = df_gdp_co2_merged['Entity'].unique()

# A set difference has no stable iteration order, so sort each result to keep
# the printed lists reproducible across kernel restarts.
difference = sorted(set(unique_entity_names) - set(unique_country_names))
print("Entity Names that are not Country Names:", difference)

difference = sorted(set(unique_country_names) - set(unique_entity_names))
print("Country Names that are not Entity Names:", difference)


Entity Names that are not Country Names: ['Saint Vincent and the Grenadines', 'Saint Lucia', 'Micronesia (country)', 'Slovakia', 'South Korea', 'Turkey', 'Macao', 'Yemen', 'Kyrgyzstan', 'Egypt', 'Saint Kitts and Nevis', 'Cape Verde', 'Somalia', 'Bahamas', 'Hong Kong', 'Gambia', 'Congo', 'Russia', 'Brunei', 'Laos', 'Palestine', 'Democratic Republic of Congo', 'East Timor', 'Venezuela', 'Syria', 'Iran', 'Vietnam']
Country Names that are not Entity Names: ['Lao PDR', 'Macao SAR, China', 'St. Kitts and Nevis', 'Gambia, The', 'Yemen, Rep.', 'St. Lucia', 'Congo, Rep.', 'Congo, Dem. Rep.', 'West Bank and Gaza', 'Turkiye', 'St. Vincent and the Grenadines', 'Hong Kong SAR, China', 'Kyrgyz Republic', 'Cabo Verde', 'Syrian Arab Republic', 'Timor-Leste', 'Micronesia, Fed. Sts.', 'Russian Federation', 'Egypt, Arab Rep.', 'Brunei Darussalam', 'Korea, Rep.', 'Iran, Islamic Rep.', 'Slovak Republic', 'Viet Nam', 'Somalia, Fed. Rep.', 'Venezuela, RB', 'Bahamas, The']


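Keep the `Entity` names, since they are cleaner (e.g. `South Korea` rather than `Korea, Rep.`), and drop the now-redundant `Country Name` and `Code` columns.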

In [10]:
# Use the CO2 table's 'Entity' as the single name column: drop the GDP table's
# 'Country Name' and the duplicate 'Code' key, then rename 'Entity'.
# The guard keeps the cell safe to re-run: once 'Entity' has been renamed there
# is nothing left to do, whereas an unconditional drop would raise KeyError on
# the already-removed 'Code' column.
if 'Entity' in df_gdp_co2_merged.columns:
    df_gdp_co2_merged = (
        df_gdp_co2_merged
        .drop(columns=['Country Name', 'Code'])
        .rename(columns={'Entity': 'Country Name'})
    )
df_gdp_co2_merged.head(10)

Unnamed: 0,Country Code,Year,GDP,GDP per capita,GDP growth (annual %),Continent,Country Name,CO2 emissions per capita,Total CO2 emissions
0,AFG,2000,6206548000.0,1617.826475,,Asia,Afghanistan,0.052017,1047127.94
1,AFG,2001,5621148000.0,1454.110782,-9.431974,Asia,Afghanistan,0.052706,1069098.0
2,AFG,2002,7228796000.0,1774.308743,28.600001,Asia,Afghanistan,0.062731,1341065.0
3,AFG,2003,7867263000.0,1815.9282,8.832278,Asia,Afghanistan,0.068608,1559679.0
4,AFG,2004,7978516000.0,1776.918207,1.414118,Asia,Afghanistan,0.052513,1237247.0
5,AFG,2005,8874480000.0,1908.114782,11.229715,Asia,Afghanistan,0.077424,1889507.0
6,AFG,2006,9349922000.0,1929.723897,5.357403,Asia,Afghanistan,0.084932,2159318.0
7,AFG,2007,10642670000.0,2155.353068,13.82632,Asia,Afghanistan,0.108063,2799909.0
8,AFG,2008,11060400000.0,2191.504356,3.924984,Asia,Afghanistan,0.160652,4254490.0
9,AFG,2009,13426270000.0,2565.022086,21.390528,Asia,Afghanistan,0.232586,6388232.0


Reorder columns

In [11]:
# Show the current column layout (order, dtypes, non-null counts) before reordering.
df_gdp_co2_merged.info()

<class 'pandas.DataFrame'>
RangeIndex: 10949 entries, 0 to 10948
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Country Code              10949 non-null  str    
 1   Year                      10949 non-null  int64  
 2   GDP                       10868 non-null  float64
 3   GDP per capita            6658 non-null   float64
 4   GDP growth (annual %)     10759 non-null  float64
 5   Continent                 10631 non-null  str    
 6   Country Name              10949 non-null  str    
 7   CO2 emissions per capita  10949 non-null  float64
 8   Total CO2 emissions       10949 non-null  float64
dtypes: float64(5), int64(1), str(3)
memory usage: 770.0 KB


In [12]:
# Rearrange the columns: identifying fields first, then the CO2 measures, then
# the GDP measures.
df_gdp_co2_merged = df_gdp_co2_merged[[
    'Country Name', 'Country Code', 'Continent', 'Year',
    'Total CO2 emissions', 'CO2 emissions per capita',
    'GDP', 'GDP per capita', 'GDP growth (annual %)',
]]
df_gdp_co2_merged.head(10)

Unnamed: 0,Country Name,Country Code,Continent,Year,Total CO2 emissions,CO2 emissions per capita,GDP,GDP per capita,GDP growth (annual %)
0,Afghanistan,AFG,Asia,2000,1047127.94,0.052017,6206548000.0,1617.826475,
1,Afghanistan,AFG,Asia,2001,1069098.0,0.052706,5621148000.0,1454.110782,-9.431974
2,Afghanistan,AFG,Asia,2002,1341065.0,0.062731,7228796000.0,1774.308743,28.600001
3,Afghanistan,AFG,Asia,2003,1559679.0,0.068608,7867263000.0,1815.9282,8.832278
4,Afghanistan,AFG,Asia,2004,1237247.0,0.052513,7978516000.0,1776.918207,1.414118
5,Afghanistan,AFG,Asia,2005,1889507.0,0.077424,8874480000.0,1908.114782,11.229715
6,Afghanistan,AFG,Asia,2006,2159318.0,0.084932,9349922000.0,1929.723897,5.357403
7,Afghanistan,AFG,Asia,2007,2799909.0,0.108063,10642670000.0,2155.353068,13.82632
8,Afghanistan,AFG,Asia,2008,4254490.0,0.160652,11060400000.0,2191.504356,3.924984
9,Afghanistan,AFG,Asia,2009,6388232.0,0.232586,13426270000.0,2565.022086,21.390528


 ---
Generate a dataset containing only the 40 largest economies (by GDP) in 2024.


In [13]:
# Take the 2024 rows, keep the 40 with the largest GDP (nlargest returns them
# in descending order of GDP), and renumber the index from 0.
df_gdp_co2_top40_2024 = (
    df_gdp_co2_merged
    .loc[df_gdp_co2_merged['Year'].eq(2024)]
    .nlargest(n=40, columns='GDP')
    .reset_index(drop=True)
)
df_gdp_co2_top40_2024['Country Name'].values

<StringArray>
[       'United States',                'China',                'Japan',
              'Germany',                'India',       'United Kingdom',
               'France',                'Italy',               'Brazil',
          'South Korea',               'Canada',            'Australia',
               'Russia',                'Spain',               'Mexico',
               'Turkey',            'Indonesia',          'Netherlands',
         'Saudi Arabia',          'Switzerland',               'Poland',
            'Argentina',               'Sweden',              'Nigeria',
                 'Iran',              'Belgium',              'Ireland',
                'Egypt',             'Thailand', 'United Arab Emirates',
          'Philippines',               'Norway',             'Malaysia',
              'Austria',               'Israel',             'Pakistan',
            'Singapore',              'Vietnam',         'South Africa',
              'Denmark']
Length: 40, 

 ---
## Step 2: Data Saving
Save the DataFrames into `../data/final/`:
- `df_gdp_co2_merged` --> `../data/final/gdp_co2_emissions.csv`
- `df_gdp_co2_top40_2024` --> `../data/final/gdp_co2_emissions_top40_2024.csv`

In [14]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (a no-op when it is already there).
FINAL_DIR = Path('../data/final')
FINAL_DIR.mkdir(parents=True, exist_ok=True)

# index=False: both frames carry a plain 0..n-1 index, so it is not written out.
df_gdp_co2_merged.to_csv(FINAL_DIR / 'gdp_co2_emissions.csv', index=False)
df_gdp_co2_top40_2024.to_csv(FINAL_DIR / 'gdp_co2_emissions_top40_2024.csv', index=False)