In [1]:
# MODIFY! 
# use RobustScaler! 
model_name = 'RF'

# Import Libraries & Data 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('./data/dl-pfe-ns.csv')
X = df.drop('throughput',axis=1)
y = df['throughput']

---

# Scale Data

In [4]:
# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Scale the data
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaler.fit(X_train)

RobustScaler()

In [5]:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Determine Hyperparameters

In [6]:
# MODIFY!
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(warm_start=True)
model

RandomForestRegressor(warm_start=True)

In [7]:
p_grid = {
    'n_estimators':[100,200,500],
    'max_features':['auto', 'sqrt', 'log2']
}

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
grid_model = GridSearchCV(
    estimator=model,
    param_grid =p_grid, 
    scoring='neg_mean_squared_error',
    cv=3, 
    verbose=1
)

In [10]:
grid_model.fit(X_train,y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [None]:
grid_model.best_params_

In [None]:
p_grid = {
    'n_estimators':[500,1000,2000],
    'max_features':['auto']
}

In [None]:
grid_model = GridSearchCV(
    estimator=model,
    param_grid =p_grid, 
    scoring='neg_mean_squared_error',
    cv=3, 
    verbose=1
)

In [None]:
grid_model.fit(X_train,y_train)

In [None]:
grid_model.best_params_

In [None]:
p_grid = {
    'n_estimators':[2000,5000],
    'max_features':['auto']
}

In [None]:
grid_model = GridSearchCV(
    estimator=model,
    param_grid =p_grid, 
    scoring='neg_mean_squared_error',
    cv=3, 
    verbose=1
)

In [None]:
grid_model.fit(X_train,y_train)

In [None]:
grid_model.best_params_

In [None]:
# hp = pd.Series(name=f'{model_name} HP', data=grid_model.best_params_)

In [None]:
# hp

In [None]:
# hp.to_csv(f'./hyperparameters/{model_name}-hp.csv')

In [None]:
p_grid = {
    'n_estimators':[5000,10000,20000],
    'max_features':['auto']
}

In [None]:
grid_model = GridSearchCV(
    estimator=model,
    param_grid =p_grid, 
    scoring='neg_mean_squared_error',
    cv=3, 
    verbose=1
)

In [None]:
grid_model.fit(X_train,y_train)

In [None]:
grid_model.best_params_

# Score Models

In [None]:
estimators = [5,10,25,50,100]
scores_list = []

from sklearn.ensemble import RandomForestRegressor

for e in estimators: 
    print(f'Starting {e} estimators')
    score_model = RandomForestRegressor(n_estimators=e)
    
    from sklearn.model_selection import cross_validate

    scores = cross_validate(
        score_model, 
        X_train, 
        y_train,
        scoring=[
            'neg_mean_absolute_error',
            'neg_mean_squared_error',
            'neg_root_mean_squared_error'
        ],
        cv=10
    )
    
    scores = pd.DataFrame(scores)
    scores
    mean_scores = scores.mean()
    scores_list.append(mean_scores)
    print(f'Done with {e} estimators')

In [None]:
scores_df = pd.DataFrame(scores_list)

In [None]:
scores_df

In [None]:
scores_df['Est'] = estimators

In [None]:
scores_df

# Export Scores

In [None]:
# mean_scores

In [None]:
scores_df = scores_df.rename(columns={
    'fit_time':'Fit Time',
    'score_time':'Score Time',
    'test_neg_mean_absolute_error':'MAE',
    'test_neg_mean_squared_error':'MSE',
    'test_neg_root_mean_squared_error':'RMSE'
})

In [None]:
scores_df

In [None]:
scores_df[['MAE','MSE','RMSE']] = scores_df[['MAE','MSE','RMSE']].apply(lambda x : abs(x))

In [None]:
scores_df
# we go for 25 estimations. Note that from 25 -> 50 estimations the error metrics worsen

In [None]:
scores_df.to_csv(f'./scores/{model_name}-score.csv')

# Export Optimized Model (25 Estimators)

In [None]:
est = 25

In [None]:
# MODIFY! 
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaler.fit(X)

X_rf = scaler.transform(X)

rf_model = RandomForestRegressor(n_estimators=est)
rf_model.fit(X_rf, y)

In [None]:
y_pred_rf = rf_model.predict(X_rf)

In [None]:
sns.histplot(y_pred_rf,bins=40,kde=True)

In [None]:
fig,ax = plt.subplots()

sns.kdeplot(y, ax=ax, shade=True, label='Observations')
sns.kdeplot(y_pred_rf, ax=ax,shade=True,label='Predictions')

ax.legend(loc='best')

In [None]:
fig,ax = plt.subplots()

sns.ecdfplot(y, ax=ax, label='Observations')
sns.ecdfplot(y_pred_rf, ax=ax,label='Predictions')
plt.axvline(x=y.mean(),color='grey',linestyle='--')

ax.legend(loc='best')

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y,y_pred_rf)
mse = mean_squared_error(y,y_pred_rf)
rmse = np.sqrt(mse)

In [None]:
err_df = pd.DataFrame(data=[mae, mse, rmse],index=['MAE','MSE','RMSE'],columns=[f'{model_name}'])
err_df

In [None]:
err_df.to_csv(f'./model-err/{model_name}-{est}-err.csv')
# mean_scores.to_csv(f'./opt-model-err/{model_name}-err.csv')

In [None]:
from joblib import dump, load
dump(rf_model, f'./opt-models/{model_name}-{est}-model.joblib')

DONE!