In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import os
import sys
import shutil
import urllib.request
import zipfile
import warnings
import smtplib
import re
from datetime import datetime
from email.message import EmailMessage
from pandas.plotting import scatter_matrix
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek


from sklearn.ensemble import RandomForestRegressor

  from numpy.core.umath_tests import inner1d


In [2]:
traffic_df = pd.read_csv("traffic_data.csv")

In [3]:
#Extracting features from date_time variable
traffic_df['date_time'] = pd.to_datetime(traffic_df.date_time)
traffic_df['weekday'] = traffic_df.date_time.dt.weekday
traffic_df['date'] = traffic_df.date_time.dt.date
traffic_df['hour'] = traffic_df.date_time.dt.hour
traffic_df['month'] = traffic_df.date_time.dt.month
#Monday is 0 and Sunday is 6

In [4]:
#Other holidays are very sparse compared to none holidays. 
#Hence encoding the holidays as TRUE and none Holidays as FALSE

def any_holiday(x):
    if x == 'None':
        return 0
    else:
        return 1
traffic_df['holiday'] = traffic_df['holiday'].map(any_holiday)

In [5]:
#The weather_description column contains  mostly describes cloudy, clear, rain, snow, thunderstorms, fog, mist and haze. 
# Create following new columns:
#clear - where weather description contains clear is True, else False 
#cloudy - where weather description contains clouds is True, else False
#rainstorm - where weather description contains thunderstorm and rain is True, else False
#fog - True where weather description contains fog else False
#mist - True where weather description contains mist else False
#haze - True where weather description contains haze else False

# First make to lowercase
traffic_df['weather_description'] = traffic_df['weather_description'].map(lambda x:x.lower())

#Any row containing "thunderstorm" is replaced by "thunderstorm"
traffic_df.loc[traffic_df['weather_description'].str.contains('thunderstorm'),'weather_description'] = 'rainstorm'
traffic_df.loc[traffic_df['weather_description'].str.contains('rain') & ~traffic_df['weather_description'].str.contains('snow'),'weather_description'] = 'rainstorm'
traffic_df.loc[traffic_df['weather_description'].str.contains('clear'),'weather_description'] = 'clear'
traffic_df.loc[traffic_df['weather_description'].str.contains('clouds'),'weather_description'] = 'cloudy'

In [6]:
weather = ['rainstorm','mist','fog','haze', 'cloudy', 'clear']
traffic_df.loc[np.logical_not(traffic_df['weather_description'].isin(weather)),'weather_description'] = 'other'

In [7]:
# Apply feature scaling techniques

traffic_df['temp'] = preprocessing.minmax_scale(traffic_df['temp'])
traffic_df['rain_1h'] = preprocessing.minmax_scale(traffic_df['rain_1h'])
traffic_df['snow_1h'] = preprocessing.minmax_scale(traffic_df['snow_1h'])
traffic_df['clouds_all'] = preprocessing.minmax_scale(traffic_df['clouds_all'])



In [8]:
#creating dummy variables for these newly created categories in weather description
traffic_df = pd.get_dummies(columns=['weather_description', 'hour', 'month', 'weekday'],data=traffic_df)

In [9]:
# Drop the un-needed features
traffic_df['targetVar'] = traffic_df['traffic_volume']
traffic_df = traffic_df.drop(['date_time', 'date', 'traffic_volume', 'weather_main'], axis=1)

In [10]:
# Use variable totCol to hold the number of columns in the dataframe
totCol = len(traffic_df.columns)

# Set up variable totAttr for the total number of attribute columns
totAttr = totCol-1

In [11]:
# targetCol variable indicates the column location of the target/class variable
# If the first column, set targetCol to 1. If the last column, set targetCol to totCol
# If (targetCol <> 1) and (targetCol <> totCol), be aware when slicing up the dataframes for visualization
targetCol = totCol

In [12]:
# We create attribute-only and target-only datasets (X_original and y_original) for the modeling

if targetCol == totCol:
    x_orig = traffic_df.iloc[:,0:totAttr]
    y_orig = traffic_df.iloc[:,totAttr]
else:
    x_orig = traffic_df.iloc[:,1:totCol]
    y_orig = traffic_df.iloc[:,0]

print("traffic_df.shape: {} x_orig.shape: {} y_orig.shape: {}".format(traffic_df.shape, x_orig.shape, y_orig.shape))

traffic_df.shape: (8573, 56) x_orig.shape: (8573, 55) y_orig.shape: (8573,)


In [13]:
# Create the random seed number for reproducible results
seedNum = 567

# Set up the number of CPU cores available for multi-thread processing
cpu_num = 6


# Run algorithms using 10-fold cross validation
num_folds = 10
scoring = 'neg_mean_squared_error'

In [14]:
# Use 75% of the data to train the models and the remaining for testing/validation

testdata_size = 0.25
x_train_df, x_test_df, y_train_df, y_test_df = train_test_split(x_orig, y_orig, test_size=testdata_size, random_state=seedNum)
print("x_train_df.shape: {} y_train_df.shape: {}".format(x_train_df.shape, y_train_df.shape))
print("x_test_df.shape: {} y_test_df.shape: {}".format(x_test_df.shape, y_test_df.shape))

x_train_df.shape: (6429, 55) y_train_df.shape: (6429,)
x_test_df.shape: (2144, 55) y_test_df.shape: (2144,)


In [15]:
# We finalize the training and testing datasets for the modeling activities
x_train = x_train_df.values
y_train = y_train_df.values
x_test = x_test_df.values
y_test = y_test_df.values
print("x_train.shape: {} y_train.shape: {}".format(x_train.shape, y_train.shape))
print("x_test.shape: {} y_test.shape: {}".format(x_test.shape, y_test.shape))

x_train.shape: (6429, 55) y_train.shape: (6429,)
x_test.shape: (2144, 55) y_test.shape: (2144,)


In [18]:
# Tuning algorithm - Random Forest
results = []
names = []
startTimeModule = datetime.now()
paramGrid1 = dict(n_estimators=np.array([400, 500, 600, 700, 800]))
model1 = RandomForestRegressor(n_jobs=cpu_num)
kfold = KFold(n_splits=num_folds, random_state=seedNum)
grid1 = GridSearchCV(estimator=model1, param_grid=paramGrid1, scoring=scoring, cv=kfold)
grid_result1 = grid1.fit(x_train, y_train)

print("Best: %f using %s" % (grid_result1.best_score_, grid_result1.best_params_))
results.append(grid_result1.cv_results_['mean_test_score'])
names.append('RF')
means = grid_result1.cv_results_['mean_test_score']
stds = grid_result1.cv_results_['std_test_score']
params = grid_result1.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
print ('Best RMSE for the Model is:', math.sqrt((grid_result1.best_score_*-1)))
print ('Model training time:',(datetime.now() - startTimeModule))

Best: -220862.468328 using {'n_estimators': 400}
-220862.468328 (51082.825358) with: {'n_estimators': 400}
-221348.861397 (51122.278256) with: {'n_estimators': 500}
-221589.135171 (51260.585114) with: {'n_estimators': 600}
-221163.141190 (51723.504243) with: {'n_estimators': 700}
-221470.097829 (51674.844952) with: {'n_estimators': 800}
Best RMSE for the Model is: 469.96007099331104
Model training time: 0:09:37.535190


In [21]:
traffic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8573 entries, 0 to 8572
Data columns (total 57 columns):
holiday                          8573 non-null int64
temp                             8573 non-null float64
rain_1h                          8573 non-null float64
snow_1h                          8573 non-null float64
clouds_all                       8573 non-null float64
weather_main                     8573 non-null object
weather_description_clear        8573 non-null uint8
weather_description_cloudy       8573 non-null uint8
weather_description_fog          8573 non-null uint8
weather_description_haze         8573 non-null uint8
weather_description_mist         8573 non-null uint8
weather_description_other        8573 non-null uint8
weather_description_rainstorm    8573 non-null uint8
hour_0                           8573 non-null uint8
hour_1                           8573 non-null uint8
hour_2                           8573 non-null uint8
hour_3                           857