In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn import metrics
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ensemble
from sklearn.ensemble import RandomForestRegressor

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, GRU
from tensorflow.keras.regularizers import l1, l2, l1_l2
from tensorflow.keras.callbacks import EarlyStopping

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
SEED = 40
np.random.seed(SEED)

### Read in Data

In [None]:
# Load the cleaned dataset from disk and preview its first rows.
csv_path = '../datasets/cleaned_data.csv'
df = pd.read_csv(csv_path)
df.head()

-----
# Neural Nets Model

### Creating Variables, Train/Test Split & Feature Scaling

In [None]:
# Features: every column except the target ('HDI') and the
# 'Country Name' / 'Country Code' columns. Target: 'HDI' as a NumPy array.
dropped_cols = ['HDI', 'Country Name', 'Country Code']
X = df.drop(columns=dropped_cols)
y = df['HDI'].values

# Fixed random_state so the train/test partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits so no test-set information leaks into training.
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

### Creating the Initial NN Regression Model

In [None]:
# Seed the generators Keras may draw from for weight initialisation and batch
# shuffling. The NumPy seed set at the top of the notebook does not seed
# TensorFlow, so without this the fitted model (and its R-squared) changes on
# every run. Seeding inside this cell also makes re-running the cell repeatable.
# Python's `random` is seeded as well because some Keras versions may derive
# their default seeds from it.
random.seed(40)
tf.random.set_seed(40)

# Two hidden ReLU layers followed by a single linear output unit (regression).
model_nn_1 = Sequential()

model_nn_1.add(Dense(16, activation = 'relu', input_shape = (X.shape[1],)))
model_nn_1.add(Dense(32, activation = 'relu'))
model_nn_1.add(Dense(1, activation = None))

model_nn_1.compile(loss = 'mse',
              optimizer = 'adam')

# The held-out test split is passed as validation data, so 'val_loss' below is
# the loss on the test split after each epoch.
history = model_nn_1.fit(X_train_sc, y_train,
          epochs = 100,
          batch_size = 512,
          validation_data = (X_test_sc, y_test),
          verbose = 1)

train_loss = history.history['loss']
test_loss = history.history['val_loss']
epoch_labels = history.epoch

plt.figure(figsize=(11, 8))

plt.plot(train_loss, label='Training Loss', color='blue')
plt.plot(test_loss, label='Testing Loss', color='orange')

# Set title and axis labels
plt.title('Training and Testing Loss by Epoch', fontsize=25)
plt.xlabel('Epoch', fontsize=18)
plt.ylabel('MSE', fontsize=18)

# Label every 10th epoch only: one tick label per epoch (100 of them) overlaps
# and makes the x-axis unreadable.
plt.xticks(epoch_labels[::10])

plt.legend(fontsize=18);

In [None]:
# Predict on the scaled test features and report the R-squared of those
# predictions against the true test targets.
pred = model_nn_1.predict(X_test_sc)
nn_test_r2 = r2_score(y_test, pred)

print(nn_test_r2)

### Conclusion

It is clear that we do not have enough data to run an accurate Neural Net Regression Model.

------

# Random Forest Regressor

In [None]:
# Rebuild the feature matrix and target for the forest from the unscaled
# columns; this cell applies no feature scaling.
non_feature_cols = ['Country Name', 'Country Code', 'HDI']
X = df.drop(columns=non_feature_cols)
y = df['HDI']

# Same random_state as the earlier split, so the partition is repeatable.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Instantiate the base Random Forest regressor used by the grid search below.
# random_state is pinned because the search runs with n_jobs=-1: fits executed
# in worker processes do not inherit this notebook's NumPy seed, so an unseeded
# forest would give different scores (and possibly a different best
# configuration) on every run.
rf_reg = RandomForestRegressor(random_state=42)

In [None]:
# Hyperparameter grid: 4 x 7 x 5 = 140 candidate settings, each evaluated with
# 5-fold cross-validation on the training split.
rf_reg_params = {
    'n_estimators': [100,150,200,250],
    'max_depth': [None, 1,2,3,4,5,10],
    'min_samples_leaf': [2,4,6,8,10]
}

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
gs=GridSearchCV(rf_reg,
               param_grid = rf_reg_params,
               cv=5,
               n_jobs = -1)

gs.fit(X_train, y_train)

# Mean cross-validated score of the best candidate.
print(gs.best_score_)

# Keep the winning settings under their own name. Assigning them to `rf_reg`
# would replace the estimator with a dict, so re-running this cell would hand
# GridSearchCV a dict instead of an estimator and fail.
rf_best_params = gs.best_params_

In [None]:
# Score the fitted grid search on each split. The second line reports the
# held-out test split, so it is labelled as testing rather than training.
print(f'Training R-Squared: {gs.score(X_train, y_train)}')
print(f'Testing R-Squared: {gs.score(X_test, y_test)}')

In [None]:
# The Random Forest model is overfit compared to the OLS model (with select Lasso features).