In [1]:
# Importing the libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Dataset

In [2]:
# Read the raw CSV into a DataFrame; the path is relative to the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='data.csv')

In [3]:
# Quick look at the first five rows to check the columns that were loaded.
dataset.head(n=5)

Unnamed: 0,number_people,timestamp,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour
0,37,61211,4,0,0,71.76,0,0,8,17
1,45,62414,4,0,0,71.76,0,0,8,17
2,40,63015,4,0,0,71.76,0,0,8,17
3,44,63616,4,0,0,71.76,0,0,8,17
4,45,64217,4,0,0,71.76,0,0,8,17


### Independent and dependent variables

In [4]:
# Drop down to the underlying NumPy array so columns can be sliced by position.
data = dataset.values
# Column 0 is the label (number_people in the preview above);
# every remaining column is used as a feature.
y, X = data[:, 0], data[:, 1:]

### Train test split

In [5]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [6]:
# Standardize each feature to zero mean and unit variance. The statistics are
# learned from the training split only and then reused for the test split, so
# nothing about the test rows influences the scaling.
# Note: standardized values are not confined to a fixed interval.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train[:3]

array([[-1.47233467,  1.51458555,  1.59547406, -0.05014971, -0.24184652,
        -0.29182161,  0.71531015,  1.03592992, -1.52296989],
       [-0.17289197,  0.51147213, -0.62677296, -0.05014971, -1.29968502,
        -0.29182161,  0.71531015, -1.57891942, -0.18395637],
       [ 1.42857488,  1.51458555,  1.59547406, -0.05014971,  0.73839605,
        -0.29182161,  0.71531015,  0.74539111,  1.45261571]])

### Fit the RandomForest model

In [7]:
# Build the forest with a fixed seed. Bootstrap sampling of the training rows is
# random, so an unseeded forest yields different predictions and metrics on
# every run. The seed matches the one used for the train/test split.
model = RandomForestRegressor(random_state=42)

# Fit on the scaled training features and their targets.
model.fit(X_train, y_train)

RandomForestRegressor()

### Making predictions

In [9]:
# Predict on the held-out features, then put truth and prediction side by side
# (predictions are rounded to two decimals for display only; y_pred keeps full precision).
y_pred = model.predict(X_test)
df = pd.DataFrame(
    {
        'Real Values': y_test,
        'Predicted Values': y_pred.round(2),
    }
)
df

Unnamed: 0,Real Values,Predicted Values
0,1.0,0.78
1,27.0,42.35
2,26.0,24.13
3,71.0,58.37
4,0.0,0.38
...,...,...
15541,2.0,1.62
15542,66.0,61.38
15543,47.0,41.88
15544,11.0,11.64


### Metrics for model performance

In [10]:
# Score the held-out predictions. The residuals are computed once and reused
# by every error metric below.
residuals = y_test - y_pred
mae = np.mean(np.absolute(residuals))
mse = np.mean(residuals ** 2)
rmse = np.sqrt(mse)

print('Mean absolute error: %.2f' % mae)
# The mean of the squared residuals is the mean squared error, not the
# residual *sum* of squares, so the label says MSE only.
print('Mean squared error (MSE): %.2f' % mse)
print('R2-score: %.2f' % r2_score(y_test, y_pred))
print("RMSE value: {:.2f}".format(rmse))

Mean absolute error: 4.41
Residual sum of squares (MSE): 43.86
R2-score: 0.91
RMSE value: 6.62
