<a href="https://colab.research.google.com/github/osmarbolivar/Nowcast_MachineLearning_RemoteSensing/blob/main/Final_Algorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **GDP Nowcasting: A machine learning and remote sensing data-based approach for Bolivia - ALGORITHMS**

**Author:** Osmar Bolivar

## **1. Data**

In [91]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import plotly.graph_objects as go
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

from google.colab import drive
# Mount Google Drive at /content/drive so the data files read below are reachable
# (Colab-only: relies on the google.colab import above).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Load raw data

In [92]:
# Single location for all input files. The path is anchored at the absolute
# Drive mount point used by drive.mount(), so loading no longer depends on the
# kernel's current working directory.
DATA_DIR = '/content/drive/MyDrive/Research/GDP Nowcast/LAJCB/Final CODES'

# Raw series (used later for the 'igae' column); first spreadsheet column is the index
raw = pd.read_excel(f'{DATA_DIR}/raw0.xlsx', index_col=0)

# Pre-built train / validation / test splits; first CSV column is the index
train = pd.read_csv(f'{DATA_DIR}/train_set.csv', index_col=0)
validation = pd.read_csv(f'{DATA_DIR}/validation_set.csv', index_col=0)
test = pd.read_csv(f'{DATA_DIR}/test_set.csv', index_col=0)

Standardize the data (scaler fitted on the training set only) and define the features and target variable for the training, validation and test sets

In [93]:
# Learn the standardisation statistics from the training set only, then apply
# the same transformation to all three splits.
scaler = StandardScaler().fit(train)

train_scaled = pd.DataFrame(
    scaler.transform(train), index=train.index, columns=train.columns
)
validation_scaled = pd.DataFrame(
    scaler.transform(validation), index=validation.index, columns=validation.columns
)
test_scaled = pd.DataFrame(
    scaler.transform(test), index=test.index, columns=test.columns
)

# Target is the 'igae' column; every other column is a feature.
y_train = train_scaled['igae']
X_train = train_scaled.drop(columns='igae')

y_validation = validation_scaled['igae']
X_validation = validation_scaled.drop(columns='igae')

# Only the features are extracted for the test split.
X_test = test_scaled.drop(columns='igae')

## **2. Algorithms**

In [94]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

### **2.1. Ridge Regression**

In [95]:
# Container for the nowcasts: starts as a one-column frame holding the 'igae'
# series from 2013-04-01 onwards, copied so later column additions never touch `raw`.
igae_nowcast = raw.loc['2013-04-01':, ['igae']].copy()

In [96]:
# Ridge regression with a fixed regularisation strength
ridge = Ridge(alpha=14.508, random_state=0)

# Fit on the training set
ridge.fit(X_train, y_train)

# Validation-set predictions and error metrics (in standardised units)
y_val_pred_ridge = ridge.predict(X_validation)
mse_val_ridge = mean_squared_error(y_validation, y_val_pred_ridge)
# RMSE is derived from the MSE because the `squared=` keyword of
# mean_squared_error is deprecated in recent scikit-learn releases.
rmse_val_ridge = np.sqrt(mse_val_ridge)
mae_val_ridge = mean_absolute_error(y_validation, y_val_pred_ridge)
print("Validation MSE: ", mse_val_ridge)
print("Validation RMSE: ", rmse_val_ridge)
print("Validation MAE: ", mae_val_ridge)

# Test-set predictions (standardised units)
y_test_pred_ridge = ridge.predict(X_test)

# Convert scaled predictions back to original units using the statistics the
# fitted scaler actually applied. StandardScaler uses the population standard
# deviation (ddof=0) while DataFrame.std() defaults to ddof=1, so train.std()
# would not be the exact inverse. The target column is located by name rather
# than assumed to be the first column.
target_pos = train.columns.get_loc('igae')
y_test_final_ridge = y_test_pred_ridge * scaler.scale_[target_pos] + scaler.mean_[target_pos]

# Rebuild levels from the predictions, which are applied as growth rates.
# The column is assembled in a NumPy array and assigned once, avoiding chained
# assignment (igae_nowcast[col][pos] = ...), which is unreliable in pandas.
num_months = len(y_test_final_ridge[:-3])
actual_levels = igae_nowcast['igae'].to_numpy(dtype=float)
nowcast_levels = actual_levels.copy()

# One step ahead: level[t+1] = observed level[t] * (1 + predicted growth[t])
nowcast_levels[1:num_months + 1] = actual_levels[:num_months] * (1 + y_test_final_ridge[:num_months])

# The last three positions chain on the previous nowcast instead of the observed
# level (presumably because no observed value is available there - TODO confirm).
for pos in (-3, -2, -1):
    nowcast_levels[pos] = nowcast_levels[pos - 1] * (1 + y_test_final_ridge[pos])

igae_nowcast['ridge'] = nowcast_levels

Validation MSE:  0.29776090351032525
Validation RMSE:  0.545674723173362
Validation MAE:  0.3572025426069512


In [97]:
# Ridge coefficients, one per feature, ranked from largest to smallest
coef = ridge.coef_
feature_importance_ridge = (
    pd.DataFrame({'feat': X_train.columns, 'imp_ridge': coef})
    .sort_values('imp_ridge', ascending=False)
    .reset_index(drop=True)
)

### **2.2. Lasso Regression**

In [98]:
# Lasso regression with a fixed regularisation strength
lasso = Lasso(alpha=0.010722672220103232, random_state=0)

# Fit on the training set
lasso.fit(X_train, y_train)

# Validation-set predictions and error metrics (in standardised units)
y_val_pred_lasso = lasso.predict(X_validation)
mse_val_lasso = mean_squared_error(y_validation, y_val_pred_lasso)
# RMSE is derived from the MSE because the `squared=` keyword of
# mean_squared_error is deprecated in recent scikit-learn releases.
rmse_val_lasso = np.sqrt(mse_val_lasso)
mae_val_lasso = mean_absolute_error(y_validation, y_val_pred_lasso)
print("Validation MSE: ", mse_val_lasso)
print("Validation RMSE: ", rmse_val_lasso)
print("Validation MAE: ", mae_val_lasso)

# Test-set predictions (standardised units)
y_test_pred_lasso = lasso.predict(X_test)

# Convert scaled predictions back to original units using the statistics the
# fitted scaler actually applied. StandardScaler uses the population standard
# deviation (ddof=0) while DataFrame.std() defaults to ddof=1, so train.std()
# would not be the exact inverse. The target column is located by name rather
# than assumed to be the first column.
target_pos = train.columns.get_loc('igae')
y_test_final_lasso = y_test_pred_lasso * scaler.scale_[target_pos] + scaler.mean_[target_pos]

# Rebuild levels from the predictions, which are applied as growth rates.
# The column is assembled in a NumPy array and assigned once, avoiding chained
# assignment (igae_nowcast[col][pos] = ...), which is unreliable in pandas.
# num_months is recomputed here so the cell does not depend on an earlier one.
num_months = len(y_test_final_lasso[:-3])
actual_levels = igae_nowcast['igae'].to_numpy(dtype=float)
nowcast_levels = actual_levels.copy()

# One step ahead: level[t+1] = observed level[t] * (1 + predicted growth[t])
nowcast_levels[1:num_months + 1] = actual_levels[:num_months] * (1 + y_test_final_lasso[:num_months])

# The last three positions chain on the previous nowcast instead of the observed
# level (presumably because no observed value is available there - TODO confirm).
for pos in (-3, -2, -1):
    nowcast_levels[pos] = nowcast_levels[pos - 1] * (1 + y_test_final_lasso[pos])

igae_nowcast['lasso'] = nowcast_levels

Validation MSE:  0.42192330718199045
Validation RMSE:  0.649556238659895
Validation MAE:  0.378012472089024


In [99]:
# Lasso coefficients, one per feature, ranked from largest to smallest
coef = lasso.coef_
feature_importance_lasso = (
    pd.DataFrame({'feat': X_train.columns, 'imp_lasso': coef})
    .sort_values('imp_lasso', ascending=False)
    .reset_index(drop=True)
)

### **2.3. ElasticNet**

In [100]:
# ElasticNet with fixed regularisation strength and L1/L2 mix
elastic_net = ElasticNet(alpha=0.087, l1_ratio=0.11, random_state=0)

# Fit on the training set
elastic_net.fit(X_train, y_train)

# Validation-set predictions and error metrics (in standardised units)
y_val_pred_enet = elastic_net.predict(X_validation)
mse_val_enet = mean_squared_error(y_validation, y_val_pred_enet)
# RMSE is derived from the MSE because the `squared=` keyword of
# mean_squared_error is deprecated in recent scikit-learn releases.
rmse_val_enet = np.sqrt(mse_val_enet)
mae_val_enet = mean_absolute_error(y_validation, y_val_pred_enet)
print("Validation MSE: ", mse_val_enet)
print("Validation RMSE: ", rmse_val_enet)
print("Validation MAE: ", mae_val_enet)

# Test-set predictions (standardised units)
y_test_pred_enet = elastic_net.predict(X_test)

# Convert scaled predictions back to original units using the statistics the
# fitted scaler actually applied. StandardScaler uses the population standard
# deviation (ddof=0) while DataFrame.std() defaults to ddof=1, so train.std()
# would not be the exact inverse. The target column is located by name rather
# than assumed to be the first column.
target_pos = train.columns.get_loc('igae')
y_test_final_enet = y_test_pred_enet * scaler.scale_[target_pos] + scaler.mean_[target_pos]

# Rebuild levels from the predictions, which are applied as growth rates.
# The column is assembled in a NumPy array and assigned once, avoiding chained
# assignment (igae_nowcast[col][pos] = ...), which is unreliable in pandas.
# num_months is recomputed here so the cell does not depend on an earlier one.
num_months = len(y_test_final_enet[:-3])
actual_levels = igae_nowcast['igae'].to_numpy(dtype=float)
nowcast_levels = actual_levels.copy()

# One step ahead: level[t+1] = observed level[t] * (1 + predicted growth[t])
nowcast_levels[1:num_months + 1] = actual_levels[:num_months] * (1 + y_test_final_enet[:num_months])

# The last three positions chain on the previous nowcast instead of the observed
# level (presumably because no observed value is available there - TODO confirm).
for pos in (-3, -2, -1):
    nowcast_levels[pos] = nowcast_levels[pos - 1] * (1 + y_test_final_enet[pos])

igae_nowcast['enet'] = nowcast_levels

Validation MSE:  0.35010819403700205
Validation RMSE:  0.5916994118950957
Validation MAE:  0.3763873181967723


In [101]:
# ElasticNet coefficients, one per feature, ranked from largest to smallest
coef = elastic_net.coef_
feature_importance_en = (
    pd.DataFrame({'feat': X_train.columns, 'imp_en': coef})
    .sort_values('imp_en', ascending=False)
    .reset_index(drop=True)
)

### **2.4 Decision Tree Regressor**

In [102]:
# Decision tree limited to depth 4, with a fixed seed
dt = DecisionTreeRegressor(random_state=42, max_depth=4)

# Fit on the training set
dt.fit(X_train, y_train)

# Validation-set predictions and error metrics (in standardised units)
y_val_pred_dt = dt.predict(X_validation)
mse_val_dt = mean_squared_error(y_validation, y_val_pred_dt)
# RMSE is derived from the MSE because the `squared=` keyword of
# mean_squared_error is deprecated in recent scikit-learn releases.
rmse_val_dt = np.sqrt(mse_val_dt)
mae_val_dt = mean_absolute_error(y_validation, y_val_pred_dt)
print("Validation MSE: ", mse_val_dt)
print("Validation RMSE: ", rmse_val_dt)
print("Validation MAE: ", mae_val_dt)

# Test-set predictions (standardised units)
y_test_pred_dt = dt.predict(X_test)

# Convert scaled predictions back to original units using the statistics the
# fitted scaler actually applied. StandardScaler uses the population standard
# deviation (ddof=0) while DataFrame.std() defaults to ddof=1, so train.std()
# would not be the exact inverse. The target column is located by name rather
# than assumed to be the first column.
target_pos = train.columns.get_loc('igae')
y_test_final_dt = y_test_pred_dt * scaler.scale_[target_pos] + scaler.mean_[target_pos]

# Rebuild levels from the predictions, which are applied as growth rates.
# The column is assembled in a NumPy array and assigned once, avoiding chained
# assignment (igae_nowcast[col][pos] = ...), which is unreliable in pandas.
# num_months is recomputed here so the cell does not depend on an earlier one.
num_months = len(y_test_final_dt[:-3])
actual_levels = igae_nowcast['igae'].to_numpy(dtype=float)
nowcast_levels = actual_levels.copy()

# One step ahead: level[t+1] = observed level[t] * (1 + predicted growth[t])
nowcast_levels[1:num_months + 1] = actual_levels[:num_months] * (1 + y_test_final_dt[:num_months])

# The last three positions chain on the previous nowcast instead of the observed
# level (presumably because no observed value is available there - TODO confirm).
for pos in (-3, -2, -1):
    nowcast_levels[pos] = nowcast_levels[pos - 1] * (1 + y_test_final_dt[pos])

igae_nowcast['dt'] = nowcast_levels

Validation MSE:  0.7132696055690861
Validation RMSE:  0.8445529027651768
Validation MAE:  0.42039508905074213


In [103]:
# Decision-tree feature importances, ranked from largest to smallest
importances = dt.feature_importances_
feature_importance_dt = (
    pd.DataFrame({'feat': X_train.columns, 'imp_dt': importances})
    .sort_values('imp_dt', ascending=False)
    .reset_index(drop=True)
)

### **2.6. AdaBoost Regressor**

In [104]:
# AdaBoost over depth-5 regression trees with fixed hyper-parameters
ada = AdaBoostRegressor(
    estimator=DecisionTreeRegressor(max_depth=5),
    learning_rate=2.1,
    loss='exponential',
    n_estimators=60,
    random_state=0,
)

# Fit on the training set
ada.fit(X_train, y_train)

# Validation-set predictions and error metrics (in standardised units)
y_val_pred_ada = ada.predict(X_validation)
mse_val_ada = mean_squared_error(y_validation, y_val_pred_ada)
# RMSE is derived from the MSE because the `squared=` keyword of
# mean_squared_error is deprecated in recent scikit-learn releases.
rmse_val_ada = np.sqrt(mse_val_ada)
mae_val_ada = mean_absolute_error(y_validation, y_val_pred_ada)
print("Validation MSE: ", mse_val_ada)
print("Validation RMSE: ", rmse_val_ada)
print("Validation MAE: ", mae_val_ada)

# Test-set predictions (standardised units)
y_test_pred_ada = ada.predict(X_test)

# Convert scaled predictions back to original units using the statistics the
# fitted scaler actually applied. StandardScaler uses the population standard
# deviation (ddof=0) while DataFrame.std() defaults to ddof=1, so train.std()
# would not be the exact inverse. The target column is located by name rather
# than assumed to be the first column.
target_pos = train.columns.get_loc('igae')
y_test_final_ada = y_test_pred_ada * scaler.scale_[target_pos] + scaler.mean_[target_pos]

# Rebuild levels from the predictions, which are applied as growth rates.
# The column is assembled in a NumPy array and assigned once, avoiding chained
# assignment (igae_nowcast[col][pos] = ...), which is unreliable in pandas.
# num_months is recomputed here so the cell does not depend on an earlier one.
num_months = len(y_test_final_ada[:-3])
actual_levels = igae_nowcast['igae'].to_numpy(dtype=float)
nowcast_levels = actual_levels.copy()

# One step ahead: level[t+1] = observed level[t] * (1 + predicted growth[t])
nowcast_levels[1:num_months + 1] = actual_levels[:num_months] * (1 + y_test_final_ada[:num_months])

# The last three positions chain on the previous nowcast instead of the observed
# level (presumably because no observed value is available there - TODO confirm).
for pos in (-3, -2, -1):
    nowcast_levels[pos] = nowcast_levels[pos - 1] * (1 + y_test_final_ada[pos])

igae_nowcast['ada'] = nowcast_levels

Validation MSE:  0.5239494754816324
Validation RMSE:  0.7238435435103586
Validation MAE:  0.33257781611253423


In [105]:
# AdaBoost feature importances, ranked from largest to smallest
feature_importance_ada = (
    pd.DataFrame({'feat': X_train.columns, 'imp_ada': ada.feature_importances_})
    .sort_values(by='imp_ada', ascending=False)
    .reset_index(drop=True)
)

### **2.7. Gradient Boost Regressor**

In [106]:
# Gradient boosting with fixed hyper-parameters (stochastic: subsample=0.4)
gbr = GradientBoostingRegressor(
    max_depth=7,
    min_samples_split=10,
    n_estimators=135,
    random_state=123,
    subsample=0.4,
)

# Fit on the training set
gbr.fit(X_train, y_train)

# Validation-set predictions and error metrics (in standardised units)
y_val_pred_gbr = gbr.predict(X_validation)
mse_val_gbr = mean_squared_error(y_validation, y_val_pred_gbr)
# RMSE is derived from the MSE because the `squared=` keyword of
# mean_squared_error is deprecated in recent scikit-learn releases.
rmse_val_gbr = np.sqrt(mse_val_gbr)
mae_val_gbr = mean_absolute_error(y_validation, y_val_pred_gbr)
print("Validation MSE: ", mse_val_gbr)
print("Validation RMSE: ", rmse_val_gbr)
print("Validation MAE: ", mae_val_gbr)

# Test-set predictions (standardised units)
y_test_pred_gbr = gbr.predict(X_test)

# Convert scaled predictions back to original units using the statistics the
# fitted scaler actually applied. StandardScaler uses the population standard
# deviation (ddof=0) while DataFrame.std() defaults to ddof=1, so train.std()
# would not be the exact inverse. The target column is located by name rather
# than assumed to be the first column.
target_pos = train.columns.get_loc('igae')
y_test_final_gbr = y_test_pred_gbr * scaler.scale_[target_pos] + scaler.mean_[target_pos]

# Rebuild levels from the predictions, which are applied as growth rates.
# The column is assembled in a NumPy array and assigned once, avoiding chained
# assignment (igae_nowcast[col][pos] = ...), which is unreliable in pandas.
# num_months is recomputed here so the cell does not depend on an earlier one.
num_months = len(y_test_final_gbr[:-3])
actual_levels = igae_nowcast['igae'].to_numpy(dtype=float)
nowcast_levels = actual_levels.copy()

# One step ahead: level[t+1] = observed level[t] * (1 + predicted growth[t])
nowcast_levels[1:num_months + 1] = actual_levels[:num_months] * (1 + y_test_final_gbr[:num_months])

# The last three positions chain on the previous nowcast instead of the observed
# level (presumably because no observed value is available there - TODO confirm).
for pos in (-3, -2, -1):
    nowcast_levels[pos] = nowcast_levels[pos - 1] * (1 + y_test_final_gbr[pos])

igae_nowcast['gbr'] = nowcast_levels

Validation MSE:  0.390874264522017
Validation RMSE:  0.6251993798157649
Validation MAE:  0.29243881369974134


In [107]:
# Gradient-boosting feature importances, ranked from largest to smallest
feature_importance_gbr = (
    pd.DataFrame({'feat': X_train.columns, 'imp_gbr': gbr.feature_importances_})
    .sort_values(by='imp_gbr', ascending=False)
    .reset_index(drop=True)
)

### **2.8. Random Forest**

In [108]:
# Random forest with fixed hyper-parameters
rf_reg = RandomForestRegressor(min_samples_split=3, n_estimators=215, random_state=123)

# Fit on the training set
rf_reg.fit(X_train, y_train)

# Validation-set predictions and error metrics (in standardised units)
y_val_pred_rf = rf_reg.predict(X_validation)
mse_val_rf = mean_squared_error(y_validation, y_val_pred_rf)
# RMSE is derived from the MSE because the `squared=` keyword of
# mean_squared_error is deprecated in recent scikit-learn releases.
rmse_val_rf = np.sqrt(mse_val_rf)
mae_val_rf = mean_absolute_error(y_validation, y_val_pred_rf)
print("Validation MSE: ", mse_val_rf)
print("Validation RMSE: ", rmse_val_rf)
print("Validation MAE: ", mae_val_rf)

# Test-set predictions (standardised units)
y_test_pred_rf = rf_reg.predict(X_test)

# Convert scaled predictions back to original units using the statistics the
# fitted scaler actually applied. StandardScaler uses the population standard
# deviation (ddof=0) while DataFrame.std() defaults to ddof=1, so train.std()
# would not be the exact inverse. The target column is located by name rather
# than assumed to be the first column.
target_pos = train.columns.get_loc('igae')
y_test_final_rf = y_test_pred_rf * scaler.scale_[target_pos] + scaler.mean_[target_pos]

# Rebuild levels from the predictions, which are applied as growth rates.
# The column is assembled in a NumPy array and assigned once, avoiding chained
# assignment (igae_nowcast[col][pos] = ...), which is unreliable in pandas.
# num_months is recomputed here so the cell does not depend on an earlier one.
num_months = len(y_test_final_rf[:-3])
actual_levels = igae_nowcast['igae'].to_numpy(dtype=float)
nowcast_levels = actual_levels.copy()

# One step ahead: level[t+1] = observed level[t] * (1 + predicted growth[t])
nowcast_levels[1:num_months + 1] = actual_levels[:num_months] * (1 + y_test_final_rf[:num_months])

# The last three positions chain on the previous nowcast instead of the observed
# level (presumably because no observed value is available there - TODO confirm).
for pos in (-3, -2, -1):
    nowcast_levels[pos] = nowcast_levels[pos - 1] * (1 + y_test_final_rf[pos])

igae_nowcast['rf'] = nowcast_levels

Validation MSE:  0.5316551753278117
Validation RMSE:  0.7291468818611321
Validation MAE:  0.3591653747288345


In [109]:
# Random-forest feature importances, ranked from largest to smallest
feature_importance_rf = (
    pd.DataFrame({'feat': X_train.columns, 'imp_rf': rf_reg.feature_importances_})
    .sort_values(by='imp_rf', ascending=False)
    .reset_index(drop=True)
)

### **2.9. Extra Trees Regressor**

In [110]:
# Extra-trees ensemble with bootstrapped samples and fixed hyper-parameters
et_reg = ExtraTreesRegressor(
    bootstrap=True,
    max_depth=13,
    max_samples=0.98,
    oob_score=True,
    random_state=42,
)

# Fit on the training set
et_reg.fit(X_train, y_train)

# Validation-set predictions and error metrics (in standardised units)
y_val_pred_et = et_reg.predict(X_validation)
mse_val_et = mean_squared_error(y_validation, y_val_pred_et)
# RMSE is derived from the MSE because the `squared=` keyword of
# mean_squared_error is deprecated in recent scikit-learn releases.
rmse_val_et = np.sqrt(mse_val_et)
mae_val_et = mean_absolute_error(y_validation, y_val_pred_et)
print("Validation MSE: ", mse_val_et)
print("Validation RMSE: ", rmse_val_et)
print("Validation MAE: ", mae_val_et)

# Test-set predictions (standardised units)
y_test_pred_et = et_reg.predict(X_test)

# Convert scaled predictions back to original units using the statistics the
# fitted scaler actually applied. StandardScaler uses the population standard
# deviation (ddof=0) while DataFrame.std() defaults to ddof=1, so train.std()
# would not be the exact inverse. The target column is located by name rather
# than assumed to be the first column.
target_pos = train.columns.get_loc('igae')
y_test_final_et = y_test_pred_et * scaler.scale_[target_pos] + scaler.mean_[target_pos]

# Rebuild levels from the predictions, which are applied as growth rates.
# The column is assembled in a NumPy array and assigned once, avoiding chained
# assignment (igae_nowcast[col][pos] = ...), which is unreliable in pandas.
# num_months is recomputed here so the cell does not depend on an earlier one.
num_months = len(y_test_final_et[:-3])
actual_levels = igae_nowcast['igae'].to_numpy(dtype=float)
nowcast_levels = actual_levels.copy()

# One step ahead: level[t+1] = observed level[t] * (1 + predicted growth[t])
nowcast_levels[1:num_months + 1] = actual_levels[:num_months] * (1 + y_test_final_et[:num_months])

# The last three positions chain on the previous nowcast instead of the observed
# level (presumably because no observed value is available there - TODO confirm).
for pos in (-3, -2, -1):
    nowcast_levels[pos] = nowcast_levels[pos - 1] * (1 + y_test_final_et[pos])

igae_nowcast['et'] = nowcast_levels

Validation MSE:  0.4961795824131694
Validation RMSE:  0.7044001578741798
Validation MAE:  0.3557257083203328


In [111]:
# Extra-trees feature importances, ranked from largest to smallest
feature_importance_et = (
    pd.DataFrame({'feat': X_train.columns, 'imp_et': et_reg.feature_importances_})
    .sort_values(by='imp_et', ascending=False)
    .reset_index(drop=True)
)

## **3. Feature importance**

In [112]:
# Combine the per-model importance tables into one wide table keyed on the
# feature name. Outer joins keep every feature that appears in any table.
feature_importance_all = feature_importance_ridge
for _importance_table in (
    feature_importance_lasso,
    feature_importance_en,
    feature_importance_dt,
    feature_importance_ada,
    feature_importance_gbr,
    feature_importance_rf,
    feature_importance_et,
):
    feature_importance_all = pd.merge(
        feature_importance_all, _importance_table, on='feat', how='outer'
    )

# Order the combined table by the Ridge coefficient, largest first
feature_importance_all = feature_importance_all.sort_values(by='imp_ridge', ascending=False)