In [2]:
# Pandas is used for data manipulation
import pandas as pd

In [3]:
# Read in data and display first 5 rows
features = pd.read_csv('data.csv', sep=';')
features.head(5)

Unnamed: 0,weekday,preholiday,rainfall,temperature,manager,service,target,actual
0,5,0,0,0,1,3,56250,57292
1,4,0,0,0,1,1,56250,57222
2,3,0,0,-11,1,1,56250,58958
3,4,0,0,-3,1,1,56250,57014
4,3,1,2,1,4,2,56250,60625


In [4]:
print('The shape of our features is:', features.shape)


The shape of our features is: (43, 8)


In [5]:
# Descriptive statistics for each column
features.describe()

Unnamed: 0,weekday,preholiday,rainfall,temperature,manager,service,target,actual
count,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0
mean,3.093023,0.046512,1.697674,-7.186047,1.488372,1.27907,56250.0,57062.27907
std,1.460821,0.213083,2.712712,6.366788,1.0773,0.590624,0.0,1891.902121
min,1.0,0.0,0.0,-17.0,1.0,1.0,56250.0,53611.0
25%,2.0,0.0,0.0,-12.0,1.0,1.0,56250.0,55868.0
50%,3.0,0.0,0.0,-9.0,1.0,1.0,56250.0,57222.0
75%,4.0,0.0,2.0,-1.5,1.0,1.0,56250.0,58437.5
max,6.0,1.0,11.0,3.0,4.0,3.0,56250.0,60625.0


In [6]:
# One-hot encode the data using pandas get_dummies
features = pd.get_dummies(features)

In [7]:
# Display the first 5 rows of the last 12 columns
features.iloc[:,5:].head(5)

Unnamed: 0,service,target,actual
0,3,56250,57292
1,1,56250,57222
2,1,56250,58958
3,1,56250,57014
4,2,56250,60625


In [8]:
# Use numpy to convert to arrays
import numpy as np

# Labels are the values we want to predict
labels = np.array(features['actual'])

# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('actual', axis = 1)

# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)

In [9]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.05, random_state = 42)

In [10]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)


Training Features Shape: (40, 7)
Training Labels Shape: (40,)
Testing Features Shape: (3, 7)
Testing Labels Shape: (3,)


In [11]:
# The baseline predictions are the historical averages
baseline_preds = test_features[:, feature_list.index('target')]
# Baseline errors, and display average baseline error
baseline_errors = abs(baseline_preds - test_labels)
print('Average baseline error: ', round(np.mean(baseline_errors), 2))


Average baseline error:  1180.33


In [12]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels);

In [13]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# =TIME(ROUNDDOWN(C3*24,0),ROUNDDOWN(C3*1440,0)-ROUNDDOWN(C3*24,0)*60,0)

import math

# math.floor(12.75)
# Returns: 12

def convertTimeValueToTime(timevalue):
    return '' + str(math.floor(timevalue/100000*24)) + ':' + str(math.floor(timevalue/100000*1440)-math.floor(timevalue/100000*24)*60).zfill(2)

for i in range(len(predictions)):
    print(test_features[i], convertTimeValueToTime(predictions[i]), '→', convertTimeValueToTime(labels[i]))

# print(test_features, predictions)
# print(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors)*24*60/100000, 0), 'minutes')

[    1     0     1    -7     1     1 56250] 13:48 → 13:45
[    4     0     1   -10     4     1 56250] 14:13 → 13:43
[    3     0     6    -4     1     1 56250] 13:34 → 14:08
Mean Absolute Error: 24.0 minutes


In [14]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 97.02 %.


In [15]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot

# Pull out one tree from the forest
tree = rf.estimators_[5]
# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')
# Write graph to a png file
graph.write_png('tree.png')

In [16]:
# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: temperature          Importance: 0.33
Variable: manager              Importance: 0.24
Variable: weekday              Importance: 0.15
Variable: rainfall             Importance: 0.14
Variable: service              Importance: 0.13
Variable: preholiday           Importance: 0.01
Variable: target               Importance: 0.0


In [17]:
print(test_features)

[[    1     0     1    -7     1     1 56250]
 [    4     0     1   -10     4     1 56250]
 [    3     0     6    -4     1     1 56250]]


In [18]:

today = [[1,0,10,3, 1,1, 56250]]
prediction = rf.predict(today)

for i in range(len(prediction)):
    print(today[i], convertTimeValueToTime(prediction[i]), )

[1, 0, 10, 3, 1, 1, 56250] 13:20
