In [1]:
# Importing packages

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import os
from sklearn.tree import export_graphviz
import pydot
from prettytable import PrettyTable

In [2]:
# Load the 2016 Seattle temperature records from the CSV in the working directory
features = pd.read_csv('Seattle Temperatures 2016.csv')

# Preview the first five rows (five is the default for head())
features.head()

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
0,2016,1,1,Fri,45,45,45.6,45,43,50,44,29
1,2016,1,2,Sat,44,45,45.7,44,41,50,44,61
2,2016,1,3,Sun,45,44,45.8,41,43,46,47,56
3,2016,1,4,Mon,44,41,45.9,40,44,48,46,53
4,2016,1,5,Tues,41,40,46.0,44,46,46,46,41


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
features.describe()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,2016.0,6.477011,15.514368,62.652299,62.701149,59.760632,62.543103,57.238506,62.373563,59.772989,60.034483
std,0.0,3.49838,8.772982,12.165398,12.120542,10.527306,11.794146,10.605746,10.549381,10.705256,15.626179
min,2016.0,1.0,1.0,35.0,35.0,45.1,35.0,41.0,46.0,44.0,28.0
25%,2016.0,3.0,8.0,54.0,54.0,49.975,54.0,48.0,53.0,50.0,47.75
50%,2016.0,6.0,15.0,62.5,62.5,58.2,62.5,56.0,61.0,58.0,60.0
75%,2016.0,10.0,23.0,71.0,71.0,69.025,71.0,66.0,72.0,69.0,71.0
max,2016.0,12.0,31.0,117.0,117.0,77.4,92.0,77.0,82.0,79.0,95.0


In [4]:
# One-hot encode the categorical columns. The only one is 'week' (the day of
# the week), which get_dummies expands into one indicator column per day name.
features = pd.get_dummies(features)

# Preview the encoded frame (head() shows five rows by default)
features.head()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2016,1,1,45,45,45.6,45,43,50,44,29,1,0,0,0,0,0,0
1,2016,1,2,44,45,45.7,44,41,50,44,61,0,0,1,0,0,0,0
2,2016,1,3,45,44,45.8,41,43,46,47,56,0,0,0,1,0,0,0
3,2016,1,4,44,41,45.9,40,44,48,46,53,0,1,0,0,0,0,0
4,2016,1,5,41,40,46.0,44,46,46,46,41,0,0,0,0,0,1,0


In [5]:
# The prediction target is the 'actual' column; copy it out of the frame
# into a standalone numpy array.
labels = np.array(features.loc[:, 'actual'])

In [6]:
# Drop the target column so that only predictor columns remain in `features`
features = features.drop('actual', axis = 'columns')

In [7]:
# Remember the column names: once `features` becomes a bare numpy array the
# names are only recoverable through this list (same order as the columns).
feature_list = features.columns.tolist()

In [8]:
# Convert to a numpy array with an explicit float dtype.
# Why the dtype is pinned: with pandas >= 2.0, get_dummies() produces boolean
# indicator columns, and a frame mixing bool and numeric columns converts to an
# object-dtype array, which makes the later arithmetic slow and fragile. On
# older pandas (uint8 indicators) the result was already float64 because of
# the float 'average' column, so this is unchanged there.
features = np.array(features, dtype = float)

In [9]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size = 0.25,
    random_state = 42
)

In [10]:
# The baseline prediction for each test row is its historical average,
# i.e. the 'average' column looked up by its position in feature_list.
baseline_preds = test_features[:, feature_list.index('average')]

# Absolute baseline error per test row, in degrees
baseline_errors = np.abs(baseline_preds - test_labels)

# Report the mean baseline error. The % formatting must happen inside the
# print() call: on Python 3, `print(...) % value` applies % to print's None
# return value and raises TypeError. This form prints the same text on Python 2.
print('Average baseline error: %.2f degrees.' % round(np.mean(baseline_errors), 2))

Average baseline error: 5.06 degrees.


In [11]:
# Build a random forest of 1000 trees; the fixed seed makes training repeatable
rf = RandomForestRegressor(
    n_estimators = 1000,
    random_state = 42
)

# Fit the forest on the training split (the trailing semicolon suppresses the
# estimator repr in the cell output)
rf.fit(train_features, train_labels);

In [12]:
# Predict the temperature for every row of the held-out test set
predictions = rf.predict(test_features)

# Absolute error of each prediction, in degrees
errors = np.abs(predictions - test_labels)

# Report the mean absolute error (MAE). The % formatting must happen inside
# the print() call: on Python 3, `print(...) % value` applies % to print's
# None return value and raises TypeError.
print('Mean Absolute Error: %.2f degrees.' % round(np.mean(errors), 2))

Mean Absolute Error: 3.87 degrees.


In [13]:
# Per-row absolute percentage error; its mean is the MAPE.
# NOTE: this divides by the actual temperature, so it is undefined for a
# label of 0 -- confirm the labels stay away from zero before reusing this.
mape = 100.0 * (errors / test_labels)

# "Accuracy" is defined here as 100% minus the MAPE
accuracy = 100 - np.mean(mape)

# Format inside the print() call: on Python 3, `print(...) % value` raises
# TypeError. The old trailing comma (a Python 2 print-statement idiom for
# suppressing the newline) is dropped; on Python 3 it only built a tuple.
print('Accuracy: %.2f %%.' % round(accuracy, 2))

Accuracy: 93.94 %.


In [14]:
# Make the Graphviz `dot` executable discoverable by pydot.
# NOTE: this is a machine-specific Windows install location; adjust it (or
# remove this step) when Graphviz is already on PATH.
graphviz_bin = 'C:/Program Files (x86)/Graphviz2.38/bin/'
path_entries = os.environ.get("PATH", "").split(os.pathsep)
# Append only once so re-running the cell does not keep growing PATH, and
# tolerate an unset PATH instead of raising KeyError.
if graphviz_bin not in path_entries:
    os.environ["PATH"] = os.pathsep.join(path_entries + [graphviz_bin])

# Pull out one tree from the forest
tree = rf.estimators_[5]

# Export the tree to a Graphviz dot file, labelling splits with the feature names
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)

# Parse the dot file; exactly one graph is expected, hence the 1-tuple unpacking
(graph, ) = pydot.graph_from_dot_file('tree.dot')

# Render the graph to a png file
graph.write_png('tree.png')

In [15]:
# Report the depth of the extracted tree. The % formatting must happen inside
# the print() call: on Python 3, `print(...) % value` raises TypeError.
print('The depth of this tree is: %d' % int(tree.tree_.max_depth))

The depth of this tree is: 13


In [16]:
# Train a deliberately tiny forest (10 trees, depth capped at 3 levels) so
# that a single tree is small enough to read as an image.
rf_small = RandomForestRegressor(
    n_estimators = 10,
    max_depth = 3,
    random_state = 42
)
rf_small.fit(train_features, train_labels)

# Pick one tree out of the small forest
tree_small = rf_small.estimators_[5]

# Write that tree to a Graphviz dot file, labelling splits with the feature names
export_graphviz(
    tree_small,
    out_file = 'small_tree.dot',
    feature_names = feature_list,
    rounded = True,
    precision = 1
)

# Parse the dot file (exactly one graph is expected) and render it as a png
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')

graph.write_png('small_tree.png')

In [17]:
# Raw importance score of every feature, in feature_list order
importances = list(rf.feature_importances_)

# Pair each feature name with its score rounded to two decimals
feature_importances = [
    (name, round(score, 2)) for name, score in zip(feature_list, importances)
]

# Order the pairs from most to least important. The sort is stable, so
# features whose rounded scores tie keep their feature_list order.
feature_importances.sort(key = lambda pair: pair[1], reverse = True)

# Split the sorted pairs back into two parallel lists
variables = [pair[0] for pair in feature_importances]
importances = [pair[1] for pair in feature_importances]

# Render the ranking as a text table, one row per feature
t = PrettyTable(['Variables', 'Importances'])
for variable, importance in zip(variables, importances):
    t.add_row([variable, importance])
print(t)

+----------------+-------------+
|   Variables    | Importances |
+----------------+-------------+
|     temp_1     |     0.66    |
|    average     |     0.15    |
| forecast_noaa  |     0.05    |
|  forecast_acc  |     0.03    |
|      day       |     0.02    |
|     temp_2     |     0.02    |
| forecast_under |     0.02    |
|     friend     |     0.02    |
|     month      |     0.01    |
|      year      |     0.0     |
|    week_Fri    |     0.0     |
|    week_Mon    |     0.0     |
|    week_Sat    |     0.0     |
|    week_Sun    |     0.0     |
|   week_Thurs   |     0.0     |
|   week_Tues    |     0.0     |
|    week_Wed    |     0.0     |
+----------------+-------------+
