In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error
from sklearn.ensemble import  RandomForestRegressor


def get_ranges(df):
    ranges = pd.DataFrame(columns=df.columns)
    ranges.loc['min'] = df.min()
    ranges.loc['max'] = df.max()
    ranges.loc['range'] = df.max() - df.min()
    return ranges

In [14]:
data = pd.read_csv('./initial_data.csv')
data.drop(['PM 2'], axis=1, inplace=True)
X = data.iloc[:, :8]
y = data.iloc[:, 8:12]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
f_reg = RandomForestRegressor(random_state=42)

In [23]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=f_reg, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters:", best_params)
print("Best Estimator:", best_estimator)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Estimator: RandomForestRegressor(n_estimators=300, random_state=42)


In [24]:
best_estimator.fit(X_train, y_train)
best_forest = RandomForestRegressor(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300, random_state=42)
pred_rf = best_estimator.predict(X_test)
pred_df = pd.DataFrame(pred_rf, columns=y_test.columns)

In [29]:
mse = mean_squared_error(y_test, pred_rf, multioutput='raw_values')


Normalized MSE values: [1.         0.         0.05242162 0.05884346]


In [22]:
real_ranges = pd.DataFrame()
real_ranges['min'] = y_test.min(axis=0)
real_ranges['max'] = y_test.max(axis=0)
real_ranges['range'] = real_ranges['max'] - real_ranges['min']
real_ranges = real_ranges.transpose()
real_ranges, get_ranges(pred_df)


(              NOx      PM 1         CO2  Pressure cylinder
 min     17.977838  0.151139    2.891628          44.946302
 max    465.211587  4.095885  104.665828         107.390162
 range  447.233749  3.944747  101.774199          62.443860,
               NOx      PM 1         CO2  Pressure cylinder
 min     26.245347  0.324763    7.493821          50.985520
 max    445.250855  3.187267  105.734629         113.005537
 range  419.005509  2.862504   98.240808          62.020018)

In [30]:
best_forest = RandomForestRegressor(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300, random_state=42)
best_forest.fit(X_train, y_train)

In [32]:
best_forest_pred = best_forest.predict(X_test)
best_forest_pred_df = pd.DataFrame(best_forest_pred, columns=y_test.columns)
best_forest_pred_df

Unnamed: 0,NOx,PM 1,CO2,Pressure cylinder
0,57.018741,0.662527,19.470886,56.264969
1,147.082675,2.828863,65.678758,74.684987
2,61.580637,1.5638,34.818845,60.833836
3,83.626573,2.428746,38.393437,63.091914
4,35.212873,0.991552,21.754456,55.882035
5,124.750126,2.457359,52.390839,68.700538
6,447.709048,1.403639,74.748467,104.028354
7,194.418582,3.316747,98.153912,101.344741
8,26.818674,0.335939,7.882922,51.206957
9,27.778951,0.346067,8.19546,51.604162


In [34]:
get_ranges(best_forest_pred_df), get_ranges(pred_df), real_ranges

(              NOx      PM 1         CO2  Pressure cylinder
 min     26.818674  0.335939    7.882922          51.206957
 max    447.709048  3.316747  103.930267         112.038075
 range  420.890374  2.980808   96.047345          60.831118,
               NOx      PM 1         CO2  Pressure cylinder
 min     26.818674  0.335939    7.882922          51.206957
 max    447.709048  3.316747  103.930267         112.038075
 range  420.890374  2.980808   96.047345          60.831118,
               NOx      PM 1         CO2  Pressure cylinder
 min     17.977838  0.151139    2.891628          44.946302
 max    465.211587  4.095885  104.665828         107.390162
 range  447.233749  3.944747  101.774199          62.443860)

In [35]:
normal_forest = RandomForestRegressor(random_state=42)
normal_forest.fit(X_train, y_train)
normal_forest_pred = normal_forest.predict(X_test)
normal_forest_pred_df = pd.DataFrame(normal_forest_pred, columns=y_test.columns)
normal_forest_pred_df

Unnamed: 0,NOx,PM 1,CO2,Pressure cylinder
0,57.115847,0.66277,19.779086,56.438896
1,144.94518,2.797546,65.596756,74.133233
2,62.029861,1.596748,36.306143,61.041218
3,84.490702,2.478613,38.174836,63.37873
4,34.68624,0.966439,21.594664,55.673871
5,123.596813,2.495382,52.287342,68.227548
6,445.250855,1.459917,76.205244,104.782694
7,199.274195,3.187267,98.79766,102.642585
8,26.245347,0.324763,7.493821,50.98552
9,26.294324,0.33886,7.823929,51.136788


In [37]:
get_ranges(best_forest_pred_df), get_ranges(normal_forest_pred_df), real_ranges

(              NOx      PM 1         CO2  Pressure cylinder
 min     26.818674  0.335939    7.882922          51.206957
 max    447.709048  3.316747  103.930267         112.038075
 range  420.890374  2.980808   96.047345          60.831118,
               NOx      PM 1         CO2  Pressure cylinder
 min     26.245347  0.324763    7.493821          50.985520
 max    445.250855  3.187267  105.734629         113.005537
 range  419.005509  2.862504   98.240808          62.020018,
               NOx      PM 1         CO2  Pressure cylinder
 min     17.977838  0.151139    2.891628          44.946302
 max    465.211587  4.095885  104.665828         107.390162
 range  447.233749  3.944747  101.774199          62.443860)

In [38]:
print('Metrics for Best Random Forest Model:')
print('Testing R2 Score: ', r2_score(y_test, best_forest_pred_df)*100)
print('Testing RMSE: ', np.sqrt(mean_squared_error(y_test, best_forest_pred_df)))
print('Testing MAE: ', mean_absolute_error(y_test, best_forest_pred_df))
print('Testing MSE: ', mean_squared_error(y_test, best_forest_pred_df))

print('Metrics for Normal Random Forest Model:')
print('Testing R2 Score: ', r2_score(y_test, normal_forest_pred_df)*100)
print('Testing RMSE: ', np.sqrt(mean_squared_error(y_test, normal_forest_pred_df)))
print('Testing MAE: ', mean_absolute_error(y_test, normal_forest_pred_df))
print('Testing MSE: ', mean_squared_error(y_test, normal_forest_pred_df))

Metrics for Best Random Forest Model:
Testing R2 Score:  91.41433347793777
Testing RMSE:  8.345304184136653
Testing MAE:  4.6604879259154375
Testing MSE:  69.64410192576874
Metrics for Normal Random Forest Model:
Testing R2 Score:  91.66637937466038
Testing RMSE:  8.272078897401236
Testing MAE:  4.460436728582102
Testing MSE:  68.42728928483086


In [42]:
mse_forest = mean_squared_error(y_test, normal_forest_pred_df, multioutput='raw_values')
normal_forest_pred_ranges = get_ranges(normal_forest_pred_df)
normalized_mse_forest = mse_forest / normal_forest_pred_ranges.loc['range']
normalized_mse_forest




NOx                  0.584900
PM 1                 0.155139
CO2                  0.145767
Pressure cylinder    0.223615
Name: range, dtype: float64

In [47]:
normalized_squared_mse_forest = mse_forest / normal_forest_pred_ranges.loc['range'] ** 2
normalized_squared_mse_forest

NOx                  0.001396
PM 1                 0.054197
CO2                  0.001484
Pressure cylinder    0.003606
Name: range, dtype: float64

In [48]:
from sklearn.model_selection import RandomizedSearchCV

parameters={'n_estimators':[int(x) for x in np.linspace(50,2000,200)],
            'max_depth':[int(x) for x in np.linspace(1,50,30)],
            'criterion':["mse", "mae"],
            'min_samples_split': [int(x) for x in np.linspace(1,50,30)],
            'min_samples_leaf': [int(x) for x in np.linspace(1,50,30)]}


In [50]:
parameters['criterion'] = ['poisson', 'absolute_error', 'squared_error', 'friedman_mse']
rfm = RandomizedSearchCV(normal_forest, parameters, cv=5, n_iter=30, n_jobs=-1, verbose=5, random_state=2)
rfm.fit(X_train, y_train)
rfm.best_score_

Fitting 5 folds for each of 30 candidates, totalling 150 fits


0.7116334436292081

In [51]:
rfmod=rfm.best_estimator_
rfmod

In [52]:
rfmod.fit(X_train,y_train)
ypred1=rfmod.predict(X_test)

In [54]:
ypred1_df = pd.DataFrame(ypred1, columns=y_test.columns)
get_ranges(ypred1_df), get_ranges(y_test)

(              NOx      PM 1        CO2  Pressure cylinder
 min     41.190529  0.684208  17.260089          55.213949
 max    394.387911  2.863192  98.698640         110.172741
 range  353.197382  2.178984  81.438550          54.958792,
               NOx      PM 1         CO2  Pressure cylinder
 min     17.977838  0.151139    2.891628          44.946302
 max    465.211587  4.095885  104.665828         107.390162
 range  447.233749  3.944747  101.774199          62.443860)

In [55]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [56]:
model_gb = MultiOutputRegressor(GradientBoostingRegressor(random_state=2))

model_gb.fit(X_train, y_train)
pred_gb = model_gb.predict(X_test)

In [58]:
pred_gb_df = pd.DataFrame(pred_gb, columns=y_test.columns)
pred_gb_df

Unnamed: 0,NOx,PM 1,CO2,Pressure cylinder
0,53.147647,1.229001,15.220903,54.275418
1,155.421566,3.125265,60.01274,74.845097
2,63.643263,1.533189,38.136944,61.084866
3,80.031135,2.523401,31.512994,62.568576
4,33.034279,1.056158,24.890362,55.811063
5,127.112519,2.253601,42.200113,65.357933
6,485.649814,1.184093,75.336411,107.853122
7,209.815524,2.790833,96.254564,105.25965
8,24.123863,0.27943,7.128551,50.02333
9,29.076389,0.265805,6.743109,50.222107


In [61]:
get_ranges(pred_gb_df), get_ranges(normal_forest_pred_df), get_ranges(y_test)

(              NOx      PM 1        CO2  Pressure cylinder
 min     24.123863  0.265805   6.743109          49.737686
 max    485.649814  3.125265  99.003045         109.599965
 range  461.525951  2.859459  92.259936          59.862279,
               NOx      PM 1         CO2  Pressure cylinder
 min     26.245347  0.324763    7.493821          50.985520
 max    445.250855  3.187267  105.734629         113.005537
 range  419.005509  2.862504   98.240808          62.020018,
               NOx      PM 1         CO2  Pressure cylinder
 min     17.977838  0.151139    2.891628          44.946302
 max    465.211587  4.095885  104.665828         107.390162
 range  447.233749  3.944747  101.774199          62.443860)