In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import altair as alt

In [0]:
# Location of the CSV dataset, hosted in the Packt "The-Data-Science-Workshop"
# GitHub repository (Chapter09/Dataset).
file_url = "https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter09/Dataset/phpYYZ4Qc.csv"

In [0]:
# Download the CSV at `file_url` and parse it into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_url)

In [0]:
# Detach the target column 'rej' from the frame. `pop` removes it from `df`
# in place, so `df` holds only the feature columns from here on
# (re-running this cell without reloading `df` raises KeyError).
y = df.pop('rej')

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    random_state=1,
    test_size=0.3,
)

In [0]:
# Random Forest regressor: 50 trees, each capped at depth 6 and required to
# keep at least 60 samples per leaf. The fixed seed makes training repeatable.
rf_model = RandomForestRegressor(
    n_estimators=50,
    max_depth=6,
    min_samples_leaf=60,
    random_state=1,
)

In [7]:
# Train the forest on the training split; `fit` returns the estimator itself,
# which the notebook displays as the cell output.
rf_model.fit(X=X_train, y=y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=6, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=60,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=50, n_jobs=None, oob_score=False,
                      random_state=1, verbose=0, warm_start=False)

In [0]:
# Predict on both splits so training and testing errors can be compared.
preds_train, preds_test = (
    rf_model.predict(features) for features in (X_train, X_test)
)

In [9]:
# Mean squared error of the model on the data it was trained on.
train_mse = mean_squared_error(y_true=y_train, y_pred=preds_train)
train_mse

0.007315982781336234

We achieved quite a low MSE score on the training set.

In [10]:
# Mean squared error on the held-out split, for comparison with train_mse.
test_mse = mean_squared_error(y_true=y_test, y_pred=preds_test)
test_mse

0.007489642004973965

The MSE on the testing set is also low and very close to the training one, which indicates that our model is not overfitting.

In [11]:
# Display the importance scores of the fitted forest as a raw array
# (one value per input column, in column order).
rf_model.feature_importances_

array([0.00000000e+00, 7.56405224e-04, 8.89442010e-05, 9.46275333e-04,
       4.08153931e-05, 1.97210546e-01, 5.03587073e-04, 2.31999967e-04,
       6.15222081e-03, 3.52461267e-03, 0.00000000e+00, 5.69504288e-01,
       1.13616416e-04, 4.90638284e-04, 1.87909452e-04, 3.20591202e-04,
       2.12958787e-04, 1.90764978e-01, 5.75581836e-03, 4.67864791e-04,
       0.00000000e+00, 0.00000000e+00, 1.75187909e-02, 3.51906210e-04,
       4.85916389e-04, 2.89740583e-05, 1.27170564e-03, 1.12059338e-03,
       1.97954549e-04, 3.01220348e-04, 0.00000000e+00, 1.44886927e-03])

In [0]:
# Start from an empty DataFrame; the feature-name and importance columns
# are attached to it afterwards.
varimp_df = pd.DataFrame()

In [0]:
# Pair each feature name with its importance score.
# The names are read from X_train, the exact frame the model was fitted on,
# so they are guaranteed to line up with feature_importances_ and do not
# depend on `df` having had its target column removed in place earlier.
varimp_df['feature'] = X_train.columns
varimp_df['importance'] = rf_model.feature_importances_

In [16]:
# Preview the first five feature/importance rows.
varimp_df.head(n=5)

Unnamed: 0,feature,importance
0,a1cx,0.0
1,a1cy,0.000756
2,a1sx,8.9e-05
3,a1sy,0.000946
4,a1rho,4.1e-05


In [17]:
# Horizontal bar chart of feature importances.
# Altair orders a nominal axis alphabetically by default; sort='-x' ranks the
# bars from the largest to the smallest importance so the most influential
# features can be read straight off the top of the chart.
alt.Chart(varimp_df).mark_bar().encode(
    x='importance',
    y=alt.Y('feature', sort='-x')
)

**From this output, we can see that the variables with the greatest impact on the predictions of this Random Forest model are a2pop, a1pop, a3pop, b1eff, and temp, in decreasing order of importance.**