In [None]:
import pandas as pd

# Location of the input CSV (a Colab-style absolute path); kept in one named
# variable so it is easy to spot and change when running elsewhere.
data_path = "/content/world_bank_extended_data_imputed.csv"

# Load the CSV into the DataFrame that every later cell works from.
extended_data_imputed = pd.read_csv(data_path)

In [None]:
# Preview the first five rows to sanity-check the load and the column layout.
extended_data_imputed.head(n=5)

Unnamed: 0,country,date,CO2_emissions,GDP_current_US,Population_total,Energy_use_kg_of_oil_equivalent_per_capita,Urban_population_percentage_of_total,Educational_attainment_at_least_Bachelors_or_equivalent_population_25_older_than_total_percentage,Passenger_cars_per_1000_people,Renewable_electricity_output_percentage_of_total
0,Afghanistan,2000,1078.12,3521418000.0,19542982.0,450.578201,22.078,4.036244,18.333333,74.989094
1,Afghanistan,2001,1088.638,2813572000.0,19688632.0,446.229227,22.169,4.036244,18.333333,72.81146
2,Afghanistan,2002,1403.03,3825701000.0,21000256.0,466.081702,22.261,4.036244,13.333333,79.063971
3,Afghanistan,2003,1653.207,4520947000.0,22645130.0,448.841374,22.353,4.036244,18.9,70.249729
4,Afghanistan,2004,1292.307,5224897000.0,23553551.0,448.841374,22.5,4.036244,18.9,70.890841


In [10]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score

# ---------------------------------------------------------------------------
# Tune an XGBoost regressor for CO2 emissions with a cross-validated grid
# search, evaluate it on a held-out test set, then score a what-if scenario in
# which GDP is scaled up. The scenario predictions are written back onto
# `extended_data_imputed` as a new column.
# ---------------------------------------------------------------------------

# Columns that must never be used as model inputs: the target, the two
# identifier columns, and the prediction column this cell appends at the end.
# Listing the prediction column keeps the cell idempotent -- without it, a
# second run would feed the model's own earlier predictions back in as a
# feature (target leakage).
TARGET_COLUMN = 'CO2_emissions'
PREDICTION_COLUMN = 'predicted_CO2_emissions_with_GDP_increase'
NON_FEATURE_COLUMNS = [TARGET_COLUMN, 'date', 'country', PREDICTION_COLUMN]
GDP_COLUMN = 'GDP_current_US'
GDP_INCREASE_FACTOR = 1.10  # simulated scenario: GDP raised by 10%

# Prepare the data. The feature list is computed once and reused for the
# simulated frame so training and scenario inputs always share the same
# columns in the same order.
feature_columns = [
    column for column in extended_data_imputed.columns
    if column not in NON_FEATURE_COLUMNS
]
X = extended_data_imputed[feature_columns]
y = extended_data_imputed[TARGET_COLUMN]

# Split the data into training and test sets.
# NOTE(review): rows are country-year observations and this split is random
# per row, so the same country can appear in both train and test. The test
# metrics may therefore be optimistic for unseen countries -- consider a
# grouped split (e.g. by country) if that is the use case. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for GridSearchCV.
# 3 * 3 * 3 * 2 * 2 * 3 * 3 * 3 = 2916 candidates, i.e. 14580 fits with 5 folds.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.3],
    'alpha': [0.1, 0.5, 1.0],   # L1 regularization
    'lambda': [1.0, 1.5, 2.0]   # L2 regularization
}

# Initialize the XGBoost model
xgb_model = xgb.XGBRegressor(random_state=42)

# Implement K-Fold cross-validation (shuffled, seeded for reproducibility)
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Set up GridSearchCV. Scoring is negated MSE because scikit-learn maximizes
# the score; the sign is flipped back when computing `cv_rmse` below.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                           cv=kfold, scoring='neg_mean_squared_error',
                           n_jobs=-1, verbose=1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and model. GridSearchCV is used with its default
# refit=True, so `best_estimator_` has already been refit on all of
# X_train / y_train -- no additional fit call is needed here.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the held-out test set
y_pred = best_model.predict(X_test)

# Evaluate the model. RMSE is computed as sqrt(MSE) explicitly rather than via
# `squared=False`, which is deprecated and removed in newer scikit-learn.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

# Cross-validation result for the best candidate: square root of the mean
# fold MSE (best_score_ holds the negated MSE).
cv_rmse = (-grid_search.best_score_) ** 0.5

print("Best Parameters:", best_params)
print("RMSE:", rmse)
print("R-squared:", r2)
print("Cross-Validated RMSE (Best Model):", cv_rmse)

# Simulate the GDP increase on a copy so the original GDP values are untouched
simulated_data_with_gdp_increase = extended_data_imputed.copy()
simulated_data_with_gdp_increase[GDP_COLUMN] = (
    simulated_data_with_gdp_increase[GDP_COLUMN] * GDP_INCREASE_FACTOR
)

# Predict CO2 emissions for the simulated scenario using the best model
X_simulated = simulated_data_with_gdp_increase[feature_columns]
simulated_predictions = best_model.predict(X_simulated)

# Add the predictions to the original dataframe (re-running the cell simply
# overwrites this column; it is excluded from the features above).
extended_data_imputed[PREDICTION_COLUMN] = simulated_predictions



Fitting 5 folds for each of 2916 candidates, totalling 14580 fits
Best Parameters: {'alpha': 0.5, 'colsample_bytree': 0.8, 'gamma': 0, 'lambda': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.8}
RMSE: 180144.51333440977
R-squared: 0.9963889273568101
Cross-Validated RMSE (Best Model): 155138.9070374191


In [11]:
# List the column labels of the working DataFrame. Once the modelling cell has
# run, this includes the appended 'predicted_CO2_emissions_with_GDP_increase'.
extended_data_imputed.columns

Index(['country', 'date', 'CO2_emissions', 'GDP_current_US',
       'Population_total', 'Energy_use_kg_of_oil_equivalent_per_capita',
       'Urban_population_percentage_of_total',
       'Educational_attainment_at_least_Bachelors_or_equivalent_population_25_older_than_total_percentage',
       'Passenger_cars_per_1000_people',
       'Renewable_electricity_output_percentage_of_total',
       'predicted_CO2_emissions_with_GDP_increase'],
      dtype='object')

In [12]:
# Compare observed CO2 emissions against the GDP-increase scenario predictions
# for a single country, selecting rows and columns in one .loc call.
is_mexico = extended_data_imputed['country'] == 'Mexico'
comparison_columns = [
    'country',
    'date',
    'CO2_emissions',
    'predicted_CO2_emissions_with_GDP_increase',
]
extended_data_imputed.loc[is_mexico, comparison_columns]

Unnamed: 0,country,date,CO2_emissions,predicted_CO2_emissions_with_GDP_increase
3323,Mexico,2000,379176.0,383305.75
3324,Mexico,2001,378830.6,448421.125
3325,Mexico,2002,386004.6,380206.5625
3326,Mexico,2003,404692.2,416539.40625
3327,Mexico,2004,414102.0,440847.53125
3328,Mexico,2005,432187.4,453753.53125
3329,Mexico,2006,448302.8,390003.59375
3330,Mexico,2007,457121.0,454939.6875
3331,Mexico,2008,459548.1,623411.5
3332,Mexico,2009,448372.5,412866.28125
