In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# Read the World Bank indicators table from disk
data = pd.read_csv('Data/World Bank Dataset.csv')

# The GDP column is the regression target; the feature matrix is every
# remaining column except the country and year columns.
y = data['GDP (current US$)']
X = data.drop(columns=['Country Name', 'Year', 'GDP (current US$)'])

# Report how many input columns the model will be trained on
num_features = len(X.columns)
print(f'Number of features in the input of the model: {num_features}')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Base estimator and the hyper-parameter grid to search.
# Every entry lists a single value, so the grid holds exactly one candidate.
model = RandomForestRegressor(random_state=42)
param_grid = dict(
    n_estimators=[100],
    max_features=['sqrt'],  # 'auto' option removed from this list
    max_depth=[20],
    min_samples_split=[2],
    min_samples_leaf=[1],
)

# 3-fold cross-validated search, scored by negative mean squared error
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep the estimator selected by the search and persist it to disk
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'optimized_random_forest_gdp_model.pkl')

# Score the selected model on the held-out rows
y_pred = best_model.predict(X_test)

# Error and goodness-of-fit metrics, comparing true vs. predicted GDP
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Emit one metric per line
print(
    f'Mean Absolute Error (MAE): {mae}',
    f'Mean Squared Error (MSE): {mse}',
    f'R-squared (R2) Score: {r2}',
    sep='\n',
)

Number of features in the input of the model: 21
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Mean Absolute Error (MAE): 52704434994.39964
Mean Squared Error (MSE): 2.6754654403554206e+22
R-squared (R2) Score: 0.9906273404294118


In [2]:
# One observation to score: a single row (shape (1, 21)) with one value per
# model input column. These are the values the prediction below is made on.
# NOTE(review): the per-value labels are assumed to follow the column order
# used when the model was trained — confirm against the training columns.
feature_array = np.array([[
    99.6,         # Access to clean fuels and technologies for cooking (% of population)
    100.0,        # Access to electricity (% of population)
    10.90107,     # Total alcohol consumption per capita (liters)
    6.12342695,   # CO2 emissions (metric tons per capita)
    390.249725,   # Current health expenditure per capita (current US$)
    98.0,         # Immunization, DPT (%)
    97.0,         # Immunization, HepB3 (%)
    98.0,         # Immunization, measles (%)
    74.2268293,   # Life expectancy at birth (years)
    10.83,        # Hospital beds (per 1,000 people)
    23.8,         # Mortality from CVD, cancer, diabetes, or CRD (%)
    79.839,       # Mortality rate, adult, female (per 1,000 female adults)
    231.323,      # Mortality rate, adult, male (per 1,000 male adults)
    7.6,          # Mortality caused by road traffic injury (per 100,000 population)
    3.1,          # Mortality rate, under-5 (per 1,000 live births)
    4.434,        # Physicians (per 1,000 people)
    16.4235129,   # PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)
    9419758.0,    # Population, total
    2.5,          # Prevalence of undernourishment (% of population)
    21.2,         # Suicide mortality rate (per 100,000 population)
    4.16,         # Unemployment, total (% of total labor force)
]])

# The model was fitted on a DataFrame, so hand it a DataFrame with the same
# column names. This avoids scikit-learn's "X does not have valid feature
# names" warning, and DataFrame construction raises if the number of values
# does not match the number of training columns.
sample_features = pd.DataFrame(feature_array, columns=best_model.feature_names_in_)

# Predict GDP using the trained model
predicted_gdp = best_model.predict(sample_features)

# Print the predicted GDP
print(f'Predicted GDP: ${predicted_gdp[0]:,.2f}')

Predicted GDP: $167,507,042,245.29


