# Random Forest Regressor

While experimenting with neural networks, we also tried, on the side, the random forest algorithm we had seen in class to see whether it would give us better results, and it did not let us down.

### 1. Imports

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

### 2. Data

The data was not normalised, as tree-based algorithms such as random forests work just as well on raw, unscaled values.

In [None]:
# Read the pre-split train / validation / test CSV files, in that order.
train_df, val_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/train_set.csv",
        "Data/val_set.csv",
        "Data/test_set.csv",
    )
]

In [None]:
# Name of the column to predict; every other training column is used as a feature.
y_col = 'sat1_col'
X_col = train_df.columns.tolist()
X_col.remove(y_col)  # raises ValueError if the target column is absent

In [None]:
# Separate each frame into its feature matrix and its target vector.
X_train, y_train = train_df[X_col], train_df[y_col]
X_val, y_val = val_df[X_col], val_df[y_col]


### 3. First attempt

The code is pretty simple; what you see below is all we needed.

In [None]:
# Baseline model: no hyperparameters are passed, so the library defaults apply.
rg = RandomForestRegressor()

In [None]:
# Train on the training split and predict the validation split
# (fit() returns the estimator itself, so the calls can be chained).
y_pred = rg.fit(X_train, y_train).predict(X_val)
# Last expression of the cell: the validation MSE is displayed by the notebook.
mean_squared_error(y_val, y_pred)

0.15858289797333333

In [None]:
# Predict the target for the test set with the fitted forest.
# NOTE(review): test_df is passed whole, so it presumably contains exactly the
# feature columns listed in X_col (no target column) — confirm against the CSV.
y_pred_test = rg.predict(test_df)

In [None]:
# Wrap the test predictions in a single-column frame named after the target.
df_test = pd.DataFrame({y_col: y_pred_test})

In [None]:
# Write the predictions to disk (the DataFrame index is written too, as before).
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
df_test.to_csv("submission_RandomForestRegressor.csv")

### 4. Loops on different hyperparameters

We ran a few loops to get a better understanding of how the hyperparameters worked.

In [None]:
# Candidate values for the three hyperparameters explored in this grid.
n_estimators = [100, 150, 200, 250]
min_samples_split = [2, 5, 10, 15, 20]
min_samples_leaf = [1, 2, 5, 10, 15]

# Exhaustive search: fit one forest per combination and print its validation MSE.
for n in n_estimators:
    print(f"n_estimators {n}")

    for s in min_samples_split:
        print(f"min_samples_split {s}")

        for l in min_samples_leaf:
            print(f"min_samples_leaf {l}")
            # Pass the current combination to the model; without these
            # arguments every iteration would train the same default forest
            # and the printed scores would differ only by random noise.
            rg = RandomForestRegressor(
                n_estimators=n,
                min_samples_split=s,
                min_samples_leaf=l,
            )
            rg.fit(X_train, y_train)
            y_pred = rg.predict(X_val)
            print(mean_squared_error(y_val, y_pred))
            
            

n_estimators 100
min_samples_split 2
min_samples_leaf 1
0.1564254356466667
min_samples_leaf 2
0.15850893312666667
min_samples_leaf 5
0.1576008927
min_samples_leaf 10
0.15809342262
min_samples_leaf 15
0.15830940366666665
min_samples_split 5
min_samples_leaf 1
0.15561293918000002
min_samples_leaf 2
0.15792479882000002
min_samples_leaf 5
0.15672262444666668
min_samples_leaf 10
0.16047825062666668
min_samples_leaf 15
0.15737699965333335
min_samples_split 10
min_samples_leaf 1
0.1601695434333333
min_samples_leaf 2
0.15747908258666668
min_samples_leaf 5
0.15784565166666667
min_samples_leaf 10
0.15561664305999998
min_samples_leaf 15
0.15760014868000002
min_samples_split 15
min_samples_leaf 1
0.15746965688000003
min_samples_leaf 2
0.15517196216666665
min_samples_leaf 5
0.15884902788000002
min_samples_leaf 10
0.15817159333333333
min_samples_leaf 15
0.1576528558266667
min_samples_split 20
min_samples_leaf 1
0.15646207898666667
min_samples_leaf 2
0.15480792163333332
min_samples_leaf 5
0.157762817

KeyboardInterrupt: 

After tuning these three hyperparameters we tried some combinations and one of them gave us our best score.

In [None]:
# Hand-picked combination of the three hyperparameters.
n_estimators = 100
min_samples_split = 18
min_samples_leaf = 5
#0.15517196216666665

# Build the forest with the values chosen above; without these arguments the
# model would silently fall back to the library defaults.
rg = RandomForestRegressor(
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

rg.fit(X_train, y_train)

# Validation MSE first, then training MSE to show how far the model over-fits.
y_pred = rg.predict(X_val)
print(mean_squared_error(y_val, y_pred))
y_pred = rg.predict(X_train)
print(mean_squared_error(y_train, y_pred))

0.15587586209333332
0.021684604683694448


In [None]:
# Test-set predictions of the current forest as a single-column frame.
df_test_6 = pd.DataFrame(rg.predict(test_df), columns=[y_col])

### 5. Final code

Our final code looked like what you see below and we mostly worked on these five hyperparameters: n_estimators, max_features, min_samples_split, min_samples_leaf and random_state.

We kept tuning these hyperparameters to get the lowest losses, but in the end we observed that, when it came to submitting the test predictions, the model reached at best an MSE loss of 0.156 and we weren't able to go lower than that.

In [None]:
# Forest with 150 trees; fit() returns the estimator, so rg is the fitted model.
rg = RandomForestRegressor(
    n_estimators=150,
    max_features=60,
    min_samples_split=15,
    min_samples_leaf=14,
    random_state=15,
).fit(X_train, y_train)

# Validation MSE, followed by training MSE (y_pred ends up holding the
# training-set predictions).
print(mean_squared_error(y_val, rg.predict(X_val)))
y_pred = rg.predict(X_train)
print(mean_squared_error(y_train, y_pred))

0.15172984433790218
0.10156318719784832


In [None]:
# Predict on the test set and export the predictions as a one-column CSV.
df_test_27 = pd.DataFrame(rg.predict(test_df), columns=[y_col])
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
df_test_27.to_csv("Not_RandomForestRegressor_27.csv")

In [None]:
# Forest with 350 trees; fit() returns the estimator, so rg is the fitted model.
rg = RandomForestRegressor(
    n_estimators=350,
    max_features=70,
    min_samples_split=15,
    min_samples_leaf=11,
    random_state=15,
).fit(X_train, y_train)

# Validation MSE, followed by training MSE (y_pred ends up holding the
# training-set predictions).
print(mean_squared_error(y_val, rg.predict(X_val)))
y_pred = rg.predict(X_train)
print(mean_squared_error(y_train, y_pred))

0.15143310677835078
0.09040294558366802


In [None]:
# Test-set predictions rounded to 10 decimal places, stored under the target name.
df_test_35 = pd.DataFrame({y_col: rg.predict(test_df).round(10)})
#df_test_35.to_csv(f"Not_RandomForestRegressor_35_rounded.csv")

In [None]:
# Forest with 340 trees; fit() returns the estimator, so rg is the fitted model.
rg = RandomForestRegressor(
    n_estimators=340,
    max_features=70,
    min_samples_split=15,
    min_samples_leaf=11,
    random_state=15,
).fit(X_train, y_train)

# Validation MSE, followed by training MSE (y_pred ends up holding the
# training-set predictions).
print(mean_squared_error(y_val, rg.predict(X_val)))
y_pred = rg.predict(X_train)
print(mean_squared_error(y_train, y_pred))

0.15152323997536835
0.09042741539157478


In [None]:
# Forest with 500 trees; fit() returns the estimator, so rg is the fitted model.
rg = RandomForestRegressor(
    n_estimators=500,
    max_features=70,
    min_samples_split=15,
    min_samples_leaf=11,
    random_state=15,
).fit(X_train, y_train)

# Validation MSE, followed by training MSE (y_pred ends up holding the
# training-set predictions).
print(mean_squared_error(y_val, rg.predict(X_val)))
y_pred = rg.predict(X_train)
print(mean_squared_error(y_train, y_pred))

0.15135683238446748
0.09030096343694967


In [None]:
# Predict on the test set, round to 10 decimal places and export as a one-column CSV.
df_test_36 = pd.DataFrame(np.around(rg.predict(test_df), decimals=10), columns=[y_col])
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
df_test_36.to_csv("Not_RandomForestRegressor_36_rounded.csv")

In [None]:
# Forest with 600 trees; fit() returns the estimator, so rg is the fitted model.
rg = RandomForestRegressor(
    n_estimators=600,
    max_features=70,
    min_samples_split=15,
    min_samples_leaf=10,
    random_state=15,
).fit(X_train, y_train)

# Validation MSE, followed by training MSE (y_pred ends up holding the
# training-set predictions).
print(mean_squared_error(y_val, rg.predict(X_val)))
y_pred = rg.predict(X_train)
print(mean_squared_error(y_train, y_pred))

0.15152308250854624
0.0859371486452157


In [None]:
# Predict on the test set, round to 10 decimal places and export as a one-column CSV.
df_test_37 = pd.DataFrame(np.around(rg.predict(test_df), decimals=10), columns=[y_col])
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
df_test_37.to_csv("Not_RandomForestRegressor_37_rounded.csv")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=493ee647-e437-4c81-80f8-96d4eefd9c39' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>