In [10]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [11]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
from datetime import datetime, timedelta, date
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn import linear_model
#import bayes regression
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [12]:
import os
import functools as ft
files = os.listdir('../Dataset/weather_merged_2')
dfs=[]
for file in files:
    dftmp = pd.read_csv('../Dataset/weather_merged_2'+'/'+file)
    # dfs.append(dftmp.sample(n=5000))
    dftmp.drop('year', axis=1, inplace=True)
    dftmp.drop('month', axis=1, inplace=True)
    # dftmp.drop('REPORT_ID', axis=1, inplace=True)
    dfs.append(dftmp)
# df = pd.concat(dfs)

In [13]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, var_red=None, value=None):
        ''' constructor ''' 
        
        # for decision node
        
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.var_red = var_red
        
        # for leaf node
        self.value = value

In [14]:
class MyDecisionTreeRegressor():
    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor '''
        
        # initialize the root of the tree 
        self.root = None
        
        # stopping conditions
        self.min_samples_split = min_samples_split  # minimum number of samples to split a node
        self.max_depth = max_depth  # maximum depth of the tree
        
    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree '''
        
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        best_split = {}
        # split until stopping conditions are met
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive
            if best_split["var_red"]>0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"], 
                            left_subtree, right_subtree, best_split["var_red"])
        
        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)
    
    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''
        
        # dictionary to store the best split
        best_split = {}
        max_var_red = -float("inf")
        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute information gain
                    curr_var_red = self.variance_reduction(y, left_y, right_y)
                    # update the best split if needed
                    if curr_var_red>max_var_red:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["var_red"] = curr_var_red
                        max_var_red = curr_var_red
                        
        # return best split
        return best_split
    
    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        return dataset_left, dataset_right
    
    def variance_reduction(self, parent, l_child, r_child):
        ''' function to compute variance reduction '''
        
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        reduction = np.var(parent) - (weight_l * np.var(l_child) + weight_r * np.var(r_child))
        return reduction
    
    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        
        val = np.mean(Y)
        return val
                
    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree '''
        
        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.var_red)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)
    
    def fit(self, X, Y):
        ''' function to train the tree '''
        print('X: ', X)
        print('Y: ', Y)
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
        
    def make_prediction(self, x, tree):
        ''' function to predict new dataset '''
        
        if tree.value!=None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)
    
    def predict(self, X):
        ''' function to predict a single data point '''
        
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions

In [15]:
def make_metrics(models):
    data = {
        'name': [model.name for model in models[1:]],
        'r2': [model.r2 for model in models[1:]],
        'rmse': [model.rmse for model in models[1:]]
    }
    data['name'] = 'average R2 and sum RMSE'
    data['r2'].append(np.mean(data['r2']))
    data['rmse'].append(np.sum(data['rmse']))
    return pd.DataFrame(data)

In [16]:
class Model:
  def __init__(self, name, data, predict_features, test_size, ml_model):
    self.name = name
    self.data = data
    self.predict_features = predict_features
    self.is_trained = False
    self.test_size = test_size
    self.ml_model = ml_model
    self.do_things()

  def cal_rmse(self):
    self.rmse = mean_squared_error(self.ytest, self.ypredict, squared=False)
    return self.rmse

  def prequisite(self, test_size):
    self.features = [i for i in self.data.columns if i != self.predict_features]
    self.X = self.data[self.features].values
    self.y = self.data[self.predict_features].values
    self.Xtrain, self.Xtest, self.ytrain, self.ytest = train_test_split(self.X, self.y, test_size=test_size)
    return None

  def fit(self):
    self.is_trained = True
    self.ml_model.fit(self.Xtrain, self.ytrain)
    self.ypredict = self.ml_model.predict(self.Xtest)
    return self.ml_model

  def cal_r2_score(self):
    self.r2 = r2_score(self.ytest, self.ypredict)
    return self.r2

  def do_things(self) -> None:
    self.prequisite(self.test_size)
    self.fit()
    self.cal_rmse()
    self.cal_r2_score()
    return None

  def __repr__(self) -> str:
    if not self.is_trained:
      return f'<{self.name}> (is not trained yet)>'
    return f'<({self.name}: [R² Score: {self.r2}], [RMSE: {self.rmse}])>'

In [17]:
def getreport_id (df):
    return df['REPORT_ID'].unique()

In [18]:
models = [None]
# list_report_id = getreport_id(df)
for i in range(0,len(dfs)):
    report_id = dfs[i]['REPORT_ID'][0]
    models += [
        Model(
            # ml_model=linear_model.LinearRegression(),
            ml_model=DecisionTreeRegressor(),   
            name=f'Dataset of report {i}',
            data=dfs[i],
            predict_features='vehicleCount',
            test_size=1/4
        )
    ]

make_metrics(models)

Unnamed: 0,name,r2,rmse
0,average R2 and sum RMSE,0.741502,24.534172
1,average R2 and sum RMSE,0.664961,30.246827
2,average R2 and sum RMSE,0.733655,19.354459
3,average R2 and sum RMSE,0.602998,19.347936
4,average R2 and sum RMSE,0.818112,70.034564
...,...,...,...
445,average R2 and sum RMSE,0.304433,2.833347
446,average R2 and sum RMSE,0.096307,2.651601
447,average R2 and sum RMSE,-2.166849,3.775293
448,average R2 and sum RMSE,-0.127550,2.437652


In [19]:
#make histogram
def make_histogram(df, feature):
    df['r2']= df['r2'].apply(lambda x: x*100)
    df = df[df['r2']>0]
    plt.figure(figsize=(12, 6))
    plt.hist(df[feature], bins=100)
    plt.title(f'Histogram of {feature}', fontsize=20)
    plt.xlabel(feature, fontsize=15)
    plt.ylabel('Frequency', fontsize=15)
    plt.show()

In [20]:
# make_histogram(make_metrics(models), 'r2')

In [21]:
df_predict= make_metrics(models)

In [22]:
df_predict.sort_values(by=['r2'], inplace=True, ascending=False)
df_predict.head(30)

Unnamed: 0,name,r2,rmse
332,average R2 and sum RMSE,0.892148,17.085163
46,average R2 and sum RMSE,0.871042,18.936558
10,average R2 and sum RMSE,0.843315,46.88862
217,average R2 and sum RMSE,0.831041,37.09641
237,average R2 and sum RMSE,0.822032,50.938682
271,average R2 and sum RMSE,0.818135,38.303054
4,average R2 and sum RMSE,0.818112,70.034564
9,average R2 and sum RMSE,0.815588,60.964448
45,average R2 and sum RMSE,0.810035,23.544837
333,average R2 and sum RMSE,0.808497,19.979906


In [23]:
df_predict['r2'].mean()

0.39398880500776434

In [24]:
df_predict['r2'].describe()

count    450.000000
mean       0.393989
std        0.361445
min       -2.832748
25%        0.228028
50%        0.482218
75%        0.625041
max        0.892148
Name: r2, dtype: float64