# CO2 Emission Data Preprocessing
In this notebook, we will preprocess the CO2 emissions dataset to prepare it for machine learning modeling. This involves handling missing data, scaling the features, and splitting the data for training and testing.

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Load the Cleaned Dataset

In [2]:
# Load the cleaned dataset
data_path = '../data/cleaned/co2_emissions_cleaned.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,country_name,country_code,indicator_name,indicator_code,1990,1991,1992,1993,1994,1995,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,ABW,CO2 emissions (kt),EN.ATM.CO2E.KT,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,CO2 emissions (kt),EN.ATM.CO2E.KT,304614.7202,300135.9431,296834.6917,305267.7791,314596.2871,329784.5578,...,525337.8893,546775.865,568547.519,591680.972,576265.992,580219.242,590905.482,598720.9575,610723.5,544952.503
2,Afghanistan,AFG,CO2 emissions (kt),EN.ATM.CO2E.KT,2046.87,1941.37,1525.47,1527.89,1493.59,1457.69,...,11961.89,10208.13,9402.05,9281.34,10057.59,9294.93,10022.78,10972.38,11238.83,8709.47
3,Africa Western and Central,AFW,CO2 emissions (kt),EN.ATM.CO2E.KT,97190.345,110559.9338,121628.003,114995.869,106198.491,116198.614,...,165498.4949,170350.67,186486.66,196343.63,194363.64,201399.22,200550.9,210618.89,222990.35,215915.61
4,Angola,AGO,CO2 emissions (kt),EN.ATM.CO2E.KT,6564.2,6674.5,6877.3,9269.7,11298.23,12719.48,...,23865.8,23868.0,26958.7,29610.5,31648.9,29520.7,25064.8,23637.4,24382.9,19814.5


## Handle Missing Data

In [3]:
# Fill or drop missing values
df.fillna(method='ffill', inplace=True)  # Forward fill to handle missing values
# You can also choose to drop rows or columns with too many missing values if necessary
# df.dropna(inplace=True)
df.isnull().sum()  # Check for remaining missing values

  df.fillna(method='ffill', inplace=True)  # Forward fill to handle missing values


country_name      0
country_code      0
indicator_name    0
indicator_code    0
1990              1
1991              1
1992              1
1993              1
1994              1
1995              1
1996              1
1997              1
1998              1
1999              1
2000              1
2001              1
2002              1
2003              1
2004              1
2005              1
2006              1
2007              1
2008              1
2009              1
2010              1
2011              1
2012              1
2013              1
2014              1
2015              1
2016              1
2017              1
2018              1
2019              1
2020              1
dtype: int64

## Feature Scaling

In [8]:
# Scale the features using StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.iloc[:, 4:])  # Scale the year columns (numeric)
scaled_df = pd.DataFrame(scaled_features, columns=df.columns[4:])
scaled_df.insert(0, 'country_name', df['country_name'])  # Add back the non-scaled columns
scaled_df.insert(1, 'country_code', df['country_code'])
scaled_df.head()

Unnamed: 0,country_name,country_code,1990,1991,1992,1993,1994,1995,1996,1997,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,ABW,,,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,-0.201384,-0.190725,-0.192157,-0.189414,-0.185801,-0.184099,-0.181768,-0.177723,...,-0.178772,-0.17537,-0.172361,-0.167182,-0.170132,-0.169674,-0.169347,-0.170708,-0.16724,-0.175346
2,Afghanistan,AFG,-0.323819,-0.312282,-0.312815,-0.312951,-0.312642,-0.313597,-0.313593,-0.313429,...,-0.311304,-0.311735,-0.311179,-0.311137,-0.311254,-0.311695,-0.311244,-0.309897,-0.308613,-0.306257
3,Africa Western and Central,AFW,-0.285319,-0.268004,-0.263743,-0.266801,-0.270225,-0.268342,-0.262231,-0.262872,...,-0.271667,-0.271035,-0.267215,-0.2649,-0.265318,-0.263908,-0.264702,-0.262617,-0.258677,-0.255672
4,Angola,AGO,-0.321991,-0.310352,-0.310628,-0.309802,-0.30867,-0.309156,-0.308166,-0.307798,...,-0.308231,-0.308263,-0.30682,-0.306113,-0.305873,-0.306663,-0.30757,-0.306898,-0.305514,-0.303546


## Splitting the Data for Training and Testing

In [9]:
# Define X (features) and y (target) for ML
X = scaled_df.iloc[:, 2:]  # All numeric columns (scaled)
y = df.iloc[:, 3]  # Example: You may want to predict emissions for the latest year available

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((212, 31), (54, 31), (212,), (54,))

## Next Steps
- After preprocessing, the data is ready for modeling.
- You can now use this preprocessed data in your machine learning models (e.g., regression, time-series forecasting).