In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
from datetime import datetime, timedelta, date
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import GradientBoostingRegressor
from sklearn import linear_model
from sklearn.metrics import r2_score, mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [3]:
import os
import functools as ft
files = os.listdir('../Dataset/meta_data_date')
dfs=[]
for file in files:
    dftmp = pd.read_csv('../Dataset/meta_data_date'+'/'+file)
    dftmp = dftmp[['time','REPORT_ID','DISTANCE_IN_METERS','vehicleCount']]
    dfs.append(dftmp)
df = pd.concat(dfs)

In [4]:
#create lag data
lag_df = df.copy()
for i in range(1,3):
    lag_df['vehicleCount_lag_'+str(i)] = lag_df.groupby(['REPORT_ID'])['vehicleCount'].shift(i)
# lag_df = lag_df.dropna()
# lag_df = lag_df.reset_index(drop=True)

In [5]:
lag_df.head(20)

Unnamed: 0,time,REPORT_ID,DISTANCE_IN_METERS,vehicleCount,vehicleCount_lag_1,vehicleCount_lag_2
0,7,158895,1505,10,,
1,8,158895,1505,72,10.0,
2,9,158895,1505,84,72.0,10.0
3,10,158895,1505,75,84.0,72.0
4,11,158895,1505,71,75.0,84.0
5,12,158895,1505,76,71.0,75.0
6,13,158895,1505,61,76.0,71.0
7,14,158895,1505,59,61.0,76.0
8,15,158895,1505,29,59.0,61.0
9,16,158895,1505,26,29.0,59.0


In [8]:
lag_df = lag_df.dropna()

In [9]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, var_red=None, value=None):
        ''' constructor ''' 
        
        # for decision node
        
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.var_red = var_red
        
        # for leaf node
        self.value = value

In [10]:
class MyDecisionTreeRegressor():
    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor '''
        
        # initialize the root of the tree 
        self.root = None
        
        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        
    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree '''
        
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        best_split = {}
        # split until stopping conditions are met
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive
            if best_split["var_red"]>0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"], 
                            left_subtree, right_subtree, best_split["var_red"])
        
        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)
    
    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''
        
        # dictionary to store the best split
        best_split = {}
        max_var_red = -float("inf")
        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute information gain
                    curr_var_red = self.variance_reduction(y, left_y, right_y)
                    # update the best split if needed
                    if curr_var_red>max_var_red:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["var_red"] = curr_var_red
                        max_var_red = curr_var_red
                        
        # return best split
        return best_split
    
    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        return dataset_left, dataset_right
    
    def variance_reduction(self, parent, l_child, r_child):
        ''' function to compute variance reduction '''
        
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        reduction = np.var(parent) - (weight_l * np.var(l_child) + weight_r * np.var(r_child))
        return reduction
    
    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        
        val = np.mean(Y)
        return val
                
    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree '''
        
        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.var_red)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)
    
    def fit(self, X, Y):
        ''' function to train the tree '''
        print('X: ', X)
        print('Y: ', Y)
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
        
    def make_prediction(self, x, tree):
        ''' function to predict new dataset '''
        
        if tree.value!=None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)
    
    def predict(self, X):
        ''' function to predict a single data point '''
        
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions

In [12]:
X = lag_df.iloc[:, :-1].values
Y = lag_df.iloc[:, -1].values.reshape(-1,1)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=41)

In [13]:
regressor = MyDecisionTreeRegressor(min_samples_split=3, max_depth=3)
regressor.fit(X_train,Y_train)

X:  [[1.60000e+01 2.09748e+05 1.07600e+03 0.00000e+00 2.00000e+00]
 [1.10000e+01 1.80952e+05 8.01000e+02 7.00000e+00 1.10000e+01]
 [1.40000e+01 1.95339e+05 8.00000e+02 0.00000e+00 2.00000e+00]
 ...
 [1.80000e+01 1.97355e+05 9.34000e+02 0.00000e+00 0.00000e+00]
 [1.00000e+01 1.85210e+05 7.46000e+02 5.30000e+01 3.40000e+01]
 [0.00000e+00 1.97572e+05 5.60000e+02 0.00000e+00 0.00000e+00]]
Y:  [[ 2.]
 [22.]
 [ 4.]
 ...
 [ 3.]
 [37.]
 [ 2.]]


In [14]:
Y_pred = regressor.predict(X_test) 
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(Y_test, Y_pred))

24.201732582795362

In [15]:
from sklearn.metrics import r2_score    
np.sqrt(r2_score(Y_test, Y_pred))

0.9185625473175068

In [16]:
#save the model to disk
import pickle
filename = 'MyDecisionTreeRegressorModel.sav'
pickle.dump(regressor, open(filename, 'wb'))