In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score


# Load the already-imputed dataset from a CSV file located in the current
# working directory (relative path).
extended_data_imputed = pd.read_csv(
    filepath_or_buffer="world_bank_extended_data_imputed.csv",
)

In [None]:
# Preview the first five rows of the loaded frame (rich display as cell output).
extended_data_imputed.head(n=5)

In [None]:
# Show XGBoost's global configuration (e.g. verbosity settings).
# `get_config` lives at the package top level; the `rabit` submodule is the
# collective-communication layer and does not provide it.
print(xgb.get_config())


In [None]:


# ---------------------------------------------------------------------------
# Tune and train an XGBoost regressor that predicts `CO2_emissions`, evaluate
# it on a held-out split, then score a what-if scenario in which
# `GDP_current_US` is scaled up by 10% for every row.
# ---------------------------------------------------------------------------

# Column this cell appends to `extended_data_imputed` at the very end.
PREDICTION_COLUMN = 'predicted_CO2_emissions_with_GDP_increase'

# Multiplier applied to GDP in the simulated scenario (1.10 == +10%).
GDP_INCREASE_FACTOR = 1.10


def _select_features(frame):
    """Return the model input columns of ``frame``.

    Drops the target plus the ``date`` and ``country`` columns (a KeyError is
    raised if any of them is missing, as before). Additionally drops this
    cell's own output column when it is present, so that re-running the cell
    does not feed the model its previous predictions as a feature.
    """
    features = frame.drop(['CO2_emissions', 'date', 'country'], axis=1)
    return features.drop(columns=[PREDICTION_COLUMN], errors='ignore')


# Prepare the data
X = _select_features(extended_data_imputed)
y = extended_data_imputed['CO2_emissions']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.3],
    'alpha': [0.1, 0.5, 1.0],   # L1 regularization
    'lambda': [1.0, 1.5, 2.0]   # L2 regularization
}

# Initialize the XGBoost model with GPU support
# NOTE(review): newer XGBoost releases deprecate "gpu_hist" in favour of
# tree_method="hist" with device="cuda" -- confirm against the installed version.
xgb_model = xgb.XGBRegressor(random_state=42, tree_method="gpu_hist")

# Implement K-Fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Set up GridSearchCV
# NOTE(review): n_jobs=-1 starts one worker per CPU core, and all of them
# would share the same GPU; consider lowering it if GPU memory runs out.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                           cv=kfold, scoring='neg_mean_squared_error',
                           n_jobs=-1, verbose=1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and model.
# GridSearchCV uses refit=True by default, so `best_estimator_` has already
# been retrained on all of X_train / y_train; no extra fit call is needed.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model.
# RMSE is computed as sqrt(MSE) rather than via the `squared=False` keyword,
# which is no longer accepted by recent scikit-learn releases.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

# Evaluate cross-validation results: best_score_ is the mean negative MSE
# over the folds, so this is the square root of the mean fold MSE.
cv_rmse = (-grid_search.best_score_) ** 0.5

print("Best Parameters:", best_params)
print("RMSE:", rmse)
print("R-squared:", r2)
print("Cross-Validated RMSE (Best Model):", cv_rmse)

# Simulate GDP increase on a copy so the original GDP values stay untouched
simulated_data_with_gdp_increase = extended_data_imputed.copy()
simulated_data_with_gdp_increase['GDP_current_US'] = (
    simulated_data_with_gdp_increase['GDP_current_US'] * GDP_INCREASE_FACTOR
)

# Predict CO2 emissions for the simulated scenario using the best model
X_simulated = _select_features(simulated_data_with_gdp_increase)
simulated_predictions = best_model.predict(X_simulated)

# Add the predictions to the original dataframe
extended_data_imputed[PREDICTION_COLUMN] = simulated_predictions


In [None]:
# List the frame's column labels; after the modeling cell has run this
# includes the appended 'predicted_CO2_emissions_with_GDP_increase' column.
extended_data_imputed.columns

In [None]:
# Compare recorded CO2 emissions with the simulated-scenario predictions
# for the rows whose country is Mexico.
is_mexico = extended_data_imputed['country'] == 'Mexico'
report_columns = ['country', 'date', 'CO2_emissions', 'predicted_CO2_emissions_with_GDP_increase']
extended_data_imputed.loc[is_mexico, report_columns]