In [64]:
import optuna
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

from sklearn.metrics import accuracy_score, precision_score, recall_score

# Readmissions (Classification)


In [46]:
readmissions = pd.read_csv('readmissions_clean.csv')

# Split dataset into X and Y
X = readmissions.drop(['readmitted'], axis=1)
y = readmissions.readmitted

# splitting X and Y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=3, test_size=0.2)

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 2, 500)
    max_depth = trial.suggest_float('max_depth', 1, 50, log=True)
    max_features = trial.suggest_int('max_features',3,10)
    
    rf = RandomForestClassifier(n_estimators, max_depth= max_depth,\
                                                 max_features= max_features)
    rf.fit(X_train, y_train)
    
    return rf.score(X_test, y_test)
start = time.time()                                
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
end = time.time()

[32m[I 2023-04-17 17:43:50,649][0m A new study created in memory with name: no-name-c6fd30a5-beb6-4050-b3c0-67c4bdbf5c7b[0m
[32m[I 2023-04-17 17:43:51,533][0m Trial 0 finished with value: 0.6164 and parameters: {'n_estimators': 238, 'max_depth': 1.3613810011021896, 'max_features': 9}. Best is trial 0 with value: 0.6164.[0m
[32m[I 2023-04-17 17:43:52,353][0m Trial 1 finished with value: 0.6222 and parameters: {'n_estimators': 43, 'max_depth': 12.408195747512988, 'max_features': 9}. Best is trial 1 with value: 0.6222.[0m
[32m[I 2023-04-17 17:43:54,009][0m Trial 2 finished with value: 0.6202 and parameters: {'n_estimators': 431, 'max_depth': 2.260494232860116, 'max_features': 5}. Best is trial 1 with value: 0.6222.[0m
[32m[I 2023-04-17 17:43:54,540][0m Trial 3 finished with value: 0.6164 and parameters: {'n_estimators': 140, 'max_depth': 1.3214257784091687, 'max_features': 10}. Best is trial 1 with value: 0.6222.[0m
[32m[I 2023-04-17 17:43:54,668][0m Trial 4 finished with

[32m[I 2023-04-17 17:45:46,046][0m Trial 40 finished with value: 0.6122 and parameters: {'n_estimators': 382, 'max_depth': 16.84156350857101, 'max_features': 5}. Best is trial 32 with value: 0.6294.[0m
[32m[I 2023-04-17 17:45:48,204][0m Trial 41 finished with value: 0.6296 and parameters: {'n_estimators': 331, 'max_depth': 8.234081638948394, 'max_features': 3}. Best is trial 41 with value: 0.6296.[0m
[32m[I 2023-04-17 17:45:49,787][0m Trial 42 finished with value: 0.627 and parameters: {'n_estimators': 291, 'max_depth': 6.135042821988226, 'max_features': 3}. Best is trial 41 with value: 0.6296.[0m
[32m[I 2023-04-17 17:45:51,980][0m Trial 43 finished with value: 0.6294 and parameters: {'n_estimators': 336, 'max_depth': 8.566327719898807, 'max_features': 3}. Best is trial 41 with value: 0.6296.[0m
[32m[I 2023-04-17 17:45:53,600][0m Trial 44 finished with value: 0.6288 and parameters: {'n_estimators': 246, 'max_depth': 8.303564693220276, 'max_features': 3}. Best is trial 41 

[32m[I 2023-04-17 17:47:30,340][0m Trial 80 finished with value: 0.6264 and parameters: {'n_estimators': 374, 'max_depth': 12.923344002881072, 'max_features': 10}. Best is trial 41 with value: 0.6296.[0m
[32m[I 2023-04-17 17:47:32,781][0m Trial 81 finished with value: 0.6302 and parameters: {'n_estimators': 342, 'max_depth': 7.96187454979366, 'max_features': 4}. Best is trial 81 with value: 0.6302.[0m
[32m[I 2023-04-17 17:47:35,495][0m Trial 82 finished with value: 0.6288 and parameters: {'n_estimators': 347, 'max_depth': 8.291651986175589, 'max_features': 4}. Best is trial 81 with value: 0.6302.[0m
[32m[I 2023-04-17 17:47:37,426][0m Trial 83 finished with value: 0.6276 and parameters: {'n_estimators': 301, 'max_depth': 6.492815592223175, 'max_features': 4}. Best is trial 81 with value: 0.6302.[0m
[32m[I 2023-04-17 17:47:38,964][0m Trial 84 finished with value: 0.6304 and parameters: {'n_estimators': 257, 'max_depth': 7.725820147961449, 'max_features': 3}. Best is trial 8

In [47]:
"""OPTUNA BASED ON MAXIMIZED ACCURACY"""

rf = RandomForestClassifier(max_depth=7.9399781768472515 ,max_features=3 ,n_estimators=360 )
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6274
Precision: 0.6286990508096035
Recall: 0.4845094664371773


# Emissions Data (Regression)

In [7]:
emissions = pd.read_csv("emissions_cleaned.csv")

# Split dataset into X and Y
X = emissions.drop('co2_emissions', axis=1)
y = emissions["co2_emissions"]

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=3, test_size=0.2)

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 2, 500)
    max_depth = trial.suggest_float('max_depth', 1, 50, log=True)
    max_features = trial.suggest_int('max_features',3,10)
    
    rf = RandomForestRegressor(n_estimators, max_depth= max_depth,\
                                                 max_features= max_features)
    
    return cross_val_score(rf,X_train,y_train, n_jobs=-1, cv=3).mean()

start = time.time()                              
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
end = time.time()

[32m[I 2023-04-17 16:13:17,391][0m A new study created in memory with name: no-name-e5e9c5bf-8d31-435a-97a9-af7fe3c8b02d[0m
[32m[I 2023-04-17 16:13:21,343][0m Trial 0 finished with value: 0.9972168992349771 and parameters: {'n_estimators': 416, 'max_depth': 49.38191237192396, 'max_features': 8}. Best is trial 0 with value: 0.9972168992349771.[0m
[32m[I 2023-04-17 16:13:22,211][0m Trial 1 finished with value: 0.6524493849627261 and parameters: {'n_estimators': 334, 'max_depth': 1.0188093115530075, 'max_features': 4}. Best is trial 0 with value: 0.9972168992349771.[0m
[32m[I 2023-04-17 16:13:23,282][0m Trial 2 finished with value: 0.9168346526041383 and parameters: {'n_estimators': 335, 'max_depth': 3.2697051855478505, 'max_features': 8}. Best is trial 0 with value: 0.9972168992349771.[0m
[32m[I 2023-04-17 16:13:24,262][0m Trial 3 finished with value: 0.668934828871539 and parameters: {'n_estimators': 436, 'max_depth': 1.1553813397119719, 'max_features': 6}. Best is trial 0

[32m[I 2023-04-17 16:13:53,403][0m Trial 36 finished with value: 0.9970897015109914 and parameters: {'n_estimators': 144, 'max_depth': 28.160475478360944, 'max_features': 5}. Best is trial 33 with value: 0.9973700968879861.[0m
[32m[I 2023-04-17 16:13:53,981][0m Trial 37 finished with value: 0.9973105985431077 and parameters: {'n_estimators': 109, 'max_depth': 20.364327635193074, 'max_features': 10}. Best is trial 33 with value: 0.9973700968879861.[0m
[32m[I 2023-04-17 16:13:55,324][0m Trial 38 finished with value: 0.9972995707583233 and parameters: {'n_estimators': 321, 'max_depth': 12.787081762913099, 'max_features': 9}. Best is trial 33 with value: 0.9973700968879861.[0m
[32m[I 2023-04-17 16:13:55,804][0m Trial 39 finished with value: 0.9968820881187361 and parameters: {'n_estimators': 147, 'max_depth': 49.827475845311845, 'max_features': 4}. Best is trial 33 with value: 0.9973700968879861.[0m
[32m[I 2023-04-17 16:13:56,663][0m Trial 40 finished with value: 0.9971010517

[32m[I 2023-04-17 16:14:26,921][0m Trial 72 finished with value: 0.9972988881266193 and parameters: {'n_estimators': 241, 'max_depth': 18.79893167966114, 'max_features': 10}. Best is trial 33 with value: 0.9973700968879861.[0m
[32m[I 2023-04-17 16:14:28,440][0m Trial 73 finished with value: 0.9973721494531569 and parameters: {'n_estimators': 295, 'max_depth': 25.18403427761425, 'max_features': 10}. Best is trial 73 with value: 0.9973721494531569.[0m
[32m[I 2023-04-17 16:14:30,122][0m Trial 74 finished with value: 0.9973088917239815 and parameters: {'n_estimators': 326, 'max_depth': 25.425691699087245, 'max_features': 10}. Best is trial 73 with value: 0.9973721494531569.[0m
[32m[I 2023-04-17 16:14:31,731][0m Trial 75 finished with value: 0.997317380407383 and parameters: {'n_estimators': 311, 'max_depth': 27.19231884656238, 'max_features': 10}. Best is trial 73 with value: 0.9973721494531569.[0m
[32m[I 2023-04-17 16:14:33,162][0m Trial 76 finished with value: 0.99725483031

In [25]:
#OPTUNA EXPERIMENT

#CROSS VALIDATION EXPERIMENTS

rf = RandomForestRegressor(random_state=3 , max_depth =25.044094298506465, max_features = 10, n_estimators =234 )
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Get performance metrics
MSE = -cross_val_score(rf,X_train,y_train, cv=3, scoring='neg_mean_squared_error').mean()
MAE = -cross_val_score(rf,X_train,y_train, cv=3, scoring='neg_mean_absolute_error').mean()
MAPE = -cross_val_score(rf,X_train,y_train, cv=3, scoring='neg_mean_absolute_percentage_error').mean()
Accuracy = cross_val_score(rf,X_train,y_train, cv=3).mean()

print('MSE: ', MSE, ', MAE: ', MAE, ', MAPE: ', MAPE, 'Accuracy: ', Accuracy)

In [28]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 2, 500)
    max_depth = trial.suggest_float('max_depth', 1, 50, log=True)
    max_features = trial.suggest_int('max_features',3,10)
    
    rf = RandomForestRegressor(n_estimators, max_depth= max_depth,\
                                                 max_features= max_features)
    rf.fit(X_train, y_train)
    
    return rf.score(X_test, y_test)

start = time.time()
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
end = time.time()

[32m[I 2023-04-17 17:03:08,554][0m A new study created in memory with name: no-name-d19e1c4f-6ce0-4756-8ca7-ad13e127b9a2[0m
[32m[I 2023-04-17 17:03:10,203][0m Trial 0 finished with value: 0.9969827591727806 and parameters: {'n_estimators': 258, 'max_depth': 31.30336123652682, 'max_features': 9}. Best is trial 0 with value: 0.9969827591727806.[0m
[32m[I 2023-04-17 17:03:13,026][0m Trial 1 finished with value: 0.9969669623396261 and parameters: {'n_estimators': 496, 'max_depth': 18.485178668420282, 'max_features': 8}. Best is trial 0 with value: 0.9969827591727806.[0m
[32m[I 2023-04-17 17:03:13,325][0m Trial 2 finished with value: 0.6441422936944865 and parameters: {'n_estimators': 341, 'max_depth': 1.8070434460881768, 'max_features': 4}. Best is trial 0 with value: 0.9969827591727806.[0m
[32m[I 2023-04-17 17:03:13,780][0m Trial 3 finished with value: 0.8439216673334926 and parameters: {'n_estimators': 399, 'max_depth': 2.076262893319509, 'max_features': 5}. Best is trial 0

[32m[I 2023-04-17 17:04:05,487][0m Trial 36 finished with value: 0.9970498690364087 and parameters: {'n_estimators': 493, 'max_depth': 35.43716229417687, 'max_features': 10}. Best is trial 34 with value: 0.9970955150215156.[0m
[32m[I 2023-04-17 17:04:07,599][0m Trial 37 finished with value: 0.9966530272824335 and parameters: {'n_estimators': 444, 'max_depth': 38.18310765782636, 'max_features': 6}. Best is trial 34 with value: 0.9970955150215156.[0m
[32m[I 2023-04-17 17:04:10,539][0m Trial 38 finished with value: 0.9969758050715862 and parameters: {'n_estimators': 476, 'max_depth': 25.301381721044642, 'max_features': 9}. Best is trial 34 with value: 0.9970955150215156.[0m
[32m[I 2023-04-17 17:04:12,906][0m Trial 39 finished with value: 0.9969346670059875 and parameters: {'n_estimators': 413, 'max_depth': 36.31763563910012, 'max_features': 8}. Best is trial 34 with value: 0.9970955150215156.[0m
[32m[I 2023-04-17 17:04:14,465][0m Trial 40 finished with value: 0.9961910881982

[32m[I 2023-04-17 17:05:29,720][0m Trial 72 finished with value: 0.9969937050593517 and parameters: {'n_estimators': 481, 'max_depth': 31.050548854918492, 'max_features': 10}. Best is trial 42 with value: 0.9970962758252477.[0m
[32m[I 2023-04-17 17:05:31,181][0m Trial 73 finished with value: 0.9971363741921494 and parameters: {'n_estimators': 218, 'max_depth': 21.1642148629646, 'max_features': 10}. Best is trial 73 with value: 0.9971363741921494.[0m
[32m[I 2023-04-17 17:05:32,568][0m Trial 74 finished with value: 0.997038939793522 and parameters: {'n_estimators': 207, 'max_depth': 20.35190589107587, 'max_features': 10}. Best is trial 73 with value: 0.9971363741921494.[0m
[32m[I 2023-04-17 17:05:34,043][0m Trial 75 finished with value: 0.9970057527689499 and parameters: {'n_estimators': 269, 'max_depth': 13.03679876336475, 'max_features': 9}. Best is trial 73 with value: 0.9971363741921494.[0m
[32m[I 2023-04-17 17:05:35,336][0m Trial 76 finished with value: 0.9967803949315

In [32]:

"""OPTUNA BASED ON MAXIMIZED ACCURACY"""


rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print('Mean Absolute Error (MAE):', mean_absolute_error(y_test, y_pred))
print('Mean Absolute Percentage Error (MAPE):', mean_absolute_percentage_error(y_test, y_pred))
print('Mean Squared Error (MSE):', mean_squared_error(y_test, y_pred))

Mean Absolute Error (MAE): 1.65476548346164
Mean Absolute Percentage Error (MAPE): 0.006863157926103628
Mean Squared Error (MSE): 9.97977532917433


In [33]:
"""OPTUNA BASED ON MINIMIZED LOSS"""


rfr = RandomForestRegressor(max_depth=21.1642148629646,max_features= 10,n_estimators=218  )
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

print('Mean Absolute Error (MAE):', mean_absolute_error(y_test, y_pred))
print('Mean Absolute Percentage Error (MAPE):', mean_absolute_percentage_error(y_test, y_pred))
print('Mean Squared Error (MSE):', mean_squared_error(y_test, y_pred))

Mean Absolute Error (MAE): 1.6590444377205071
Mean Absolute Percentage Error (MAPE): 0.006890220778268816
Mean Squared Error (MSE): 9.820637612422997


# GRID SEARCH CV EXPERIMENTS


In [None]:
# Split into X and Y
X = readmissions.drop('readmitted', axis=1)
y = readmissions["readmitted"]

# Split into train-test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 3)

# Fit Random Forest
rf = RandomForestClassifier(random_state=3)

# Choose a parameter grid for grid search
param_grid = {
    'max_depth': np.arange(1,51),
    'max_features': np.arange(3, 11),
    'n_estimators': np.arange(2,501)
}

# Time grid search
import time
start = time.time()

grid_search_classifier = GridSearchCV(estimator = rf, param_grid = param_grid)

grid_search_classifier.fit(X_train, y_train)

end = time.time()

print("Grid search execution time:",
      (end-start), "s")

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])