# Importing Libraries

In [28]:
import pandas as pd   
import numpy as np    
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
%matplotlib inline

In [29]:
pip install pydot

Note: you may need to restart the kernel to use updated packages.


In [30]:
import pydot

# Data Exploration

In [31]:
# Read the temperature records into a DataFrame. The path is relative, so it
# is resolved against the notebook's current working directory.
temps_path = "../Random_forest/temps.csv"
df = pd.read_csv(temps_path)

In [32]:
# Preview the first rows of the raw data (head() shows 5 rows by default).
df.head()


Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,friend
0,2019,1,1,Fri,45,45,45.6,45,29
1,2019,1,2,Sat,44,45,45.7,44,61
2,2019,1,3,Sun,45,44,45.8,41,56
3,2019,1,4,Mon,44,41,45.9,40,53
4,2019,1,5,Tues,41,40,46.0,44,41


In [33]:
# Display the (row count, column count) tuple of the loaded frame.
df.shape

(348, 9)

In [34]:
# Display the column labels of the loaded frame.
df.columns

Index(['year', 'month', 'day', 'week', 'temp_2', 'temp_1', 'average', 'actual',
       'friend'],
      dtype='object')

In [35]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

year       0
month      0
day        0
week       0
temp_2     0
temp_1     0
average    0
actual     0
friend     0
dtype: int64

In [36]:
# One-hot encode categorical features (here the `week` day-name column, which
# becomes one indicator column per weekday).
# dtype=int pins the indicator columns to 0/1 integers on every pandas
# release: without it pandas >= 2.0 emits booleans (True/False) while older
# releases emit uint8, so the displayed table would differ between versions.
df = pd.get_dummies(df, dtype=int)
df.head(5)


Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2019,1,1,45,45,45.6,45,29,1,0,0,0,0,0,0
1,2019,1,2,44,45,45.7,44,61,0,0,1,0,0,0,0
2,2019,1,3,45,44,45.8,41,56,0,0,0,1,0,0,0
3,2019,1,4,44,41,45.9,40,53,0,1,0,0,0,0,0
4,2019,1,5,41,40,46.0,44,41,0,0,0,0,0,1,0


# Features and Labels

In [37]:
# Target vector: the column the model is trained to predict.
labels = df['actual']

# Feature matrix: every remaining column. `df` is rebound here, so running
# this cell a second time without re-running the cells above raises a
# KeyError because 'actual' is no longer present.
df = df.drop(columns=['actual'])

# Keep the feature names so they can be reused later in the notebook.
feature_list = df.columns.tolist()

# Train Test Split

In [38]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
split_parts = train_test_split(
    df,
    labels,
    test_size=0.20,
    random_state=42,
)
train_features, test_features, train_labels, test_labels = split_parts



In [39]:
# Print the number of rows in the training set, then in the test set.
for split in (train_features, test_features):
    print(len(split))


278
70


In [40]:
# Report the shape of each of the four split outputs, one per line.
shape_report = (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
)
for caption, part in shape_report:
    print(caption, part.shape)

Training Features Shape: (278, 14)
Training Labels Shape: (278,)
Testing Features Shape: (70, 14)
Testing Labels Shape: (70,)


# Training the Forest

In [41]:
from sklearn.ensemble import RandomForestRegressor

# Build a forest of 1000 trees with a fixed seed and fit it on the training
# split. fit() returns the estimator itself, so construction and training can
# be chained; the assignment also keeps the cell from echoing the model repr.
rf = RandomForestRegressor(
    n_estimators=1000,
    random_state=42,
).fit(train_features, train_labels)

# Make Predictions on Test Data


In [42]:
# Predict the held-out targets with the trained forest.
predictions = rf.predict(test_features)

# Absolute difference between each prediction and its true value.
errors = (predictions - test_labels).abs()

# Mean absolute error, rounded to two decimals for display.
mean_abs_error = np.mean(errors)
print('Mean Absolute Error:', round(mean_abs_error, 2), 'degrees.')

Mean Absolute Error: 3.78 degrees.


In [43]:
# Absolute percentage error of each prediction relative to the true value.
relative_errors = errors / test_labels
mape = 100 * relative_errors

# The "accuracy" reported here is defined as 100 minus the mean of those
# percentage errors; it is not a classification accuracy.
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 94.02 %.


# Visualizing a Single Decision Tree


In [46]:
# Select one fitted estimator (index 5) out of the forest.
tree = rf.estimators_[5]

# Write that tree to 'tree.dot', labelling the splits with the saved feature
# names.
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=feature_list,
    rounded=True,
    precision=1,
)

# Load the dot file back; the result is unpacked as a one-element sequence,
# so anything other than exactly one graph raises an error here.
[graph] = pydot.graph_from_dot_file('tree.dot')

# Write the graph to 'tree.png'; the trailing semicolon hides the return value.
graph.write_png('tree.png');

In [45]:
# Report the max_depth recorded on the fitted tree selected above.
print(f'The depth of this tree is: {tree.tree_.max_depth}')


The depth of this tree is: 13
