In [None]:
import pandas as pd
from datetime import datetime, date
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [None]:
class SFD_Set:
    """
    Holds the raw call data and the train/test split derived from it.

    The preprocessing methods replace ``self.df`` step by step; ``train_test_split``
    then fills ``df_train``/``df_test`` (features) and ``y_train``/``y_test`` (targets).
    """
    def __init__(self, filename):
        """
        Reads the CSV file into ``self.df``
        :param filename: path of the CSV file, passed to ``pd.read_csv``
        """
        self.filename = filename
        self.df = pd.read_csv(filename)
        self.df_train = None
        self.df_test = None
        self.y_train = None
        self.y_test = None

    def split_datetime(self):
        """
        Parses the "Datetime" column and adds "Year", "Month", "Day" and "Hour" columns
        """
        # Vectorised parsing: one pass instead of a Python-level strptime call per row.
        # Missing values become NaT here and can be removed with drop_null_values().
        self.df["Datetime"] = pd.to_datetime(self.df["Datetime"], format='%m/%d/%Y %I:%M:%S %p')
        parts = self.df["Datetime"].dt
        self.df["Year"] = parts.year
        self.df["Month"] = parts.month
        self.df["Day"] = parts.day
        self.df["Hour"] = parts.hour

    def start_year(self, year):
        """
        Keeps only the rows whose "Year" is greater than or equal to ``year``
        :param year: first year to keep
        """
        # Explicit copy so that later column assignments / drops act on an
        # independent frame instead of on a filtered selection.
        self.df = self.df[self.df["Year"] >= year].copy()

    def drop_null_values(self):
        """
        Removes every row that contains at least one null value
        """
        self.df = self.df.dropna()

    def sort_df(self):
        """
        Sorts the rows chronologically by the "Datetime" column
        """
        self.df = self.df.sort_values(by='Datetime')

    def drop_redundant_cols(self):
        """
        Drops the columns that are not used as model features
        """
        self.df = self.df.drop(
            columns=["Address", "Type", "Report Location", "Incident Number", "Datetime"])

    def train_test_split(self, year):
        """
        Splits ``self.df`` by year and separates the target column
        :param year: rows with "Year" == year form the test set, rows with
                     "Year" < year the training set (later years are not used)
        """
        # Copies keep the pop() below from touching a selection of self.df.
        self.df_train = self.df[self.df["Year"] < year].copy()
        self.df_test = self.df[self.df["Year"] == year].copy()
        self.y_train = self.df_train.pop("Hourly Call Volume")
        self.y_test = self.df_test.pop("Hourly Call Volume")

# Feature Engineering

In [None]:
def create_target_variable(df):
    """
    Creates the target variable: Hourly Call Volume (Number of Calls per hour)
    :param df: pd.Dataframe, containing the preprocessed data; must have the
               columns "Year", "Month", "Day", "Hour" and "Latitude"
    :return: df (with target variable); the column is added to df in place
    """
    # transform("count") broadcasts each group's number of non-null "Latitude"
    # entries back onto its rows, replacing the per-row tuple/dict lookup.
    per_hour = df.groupby(by=["Year", "Month", "Day", "Hour"])["Latitude"]
    df["Hourly Call Volume"] = per_hour.transform("count")
    return df

def add_daily_call_volume_feat(df):
    """
    Adds a new feature: Daily Call Volume (Number of calls per day)
    :param df: pd.Dataframe, containing the preprocessed data; must have the
               columns "Year", "Month", "Day" and "Hour"
    :return: df (with new feature); the column is added to df in place
    """
    # NOTE(review): the daily total includes the calls of the hour that is
    # later used as target - confirm that this is intended.
    per_day = df.groupby(by=["Year", "Month", "Day"])["Hour"]
    # Number of non-null "Hour" entries per calendar day, broadcast to every row of that day.
    df["Daily Call Volume"] = per_day.transform("count")
    return df

def get_season(in_datetime):
    """
    Maps a datetime to the season its calendar day falls in
    :param in_datetime: datetime.datetime
    :return: season (str): winter, spring, summer or fall
    """
    assert isinstance(in_datetime, datetime), "Not a datetime object!"

    Y = 2000  # leap year, so that Feb 29 stays a valid day after the year swap
    # Only month and day matter: move the input into the reference year once.
    probe = in_datetime.replace(year=Y).date()

    # (season, first day, last day) - winter wraps around the turn of the year
    boundaries = (
        ('winter', date(Y, 1, 1), date(Y, 3, 20)),
        ('spring', date(Y, 3, 21), date(Y, 6, 20)),
        ('summer', date(Y, 6, 21), date(Y, 9, 22)),
        ('fall', date(Y, 9, 23), date(Y, 12, 20)),
        ('winter', date(Y, 12, 21), date(Y, 12, 31)),
    )
    return next(name for name, first, last in boundaries
                if first <= probe <= last)

def create_features(df):
    """
    Adds the engineered columns "Season", "Weekday" and "Daily Call Volume"
    as well as the target column "Hourly Call Volume"
    :param df: pd.Dataframe, containing preprocessed data (needs the "Datetime" column)
    :return: df (with the new columns)
    """
    timestamps = df["Datetime"]
    df["Season"] = timestamps.apply(get_season)
    # datetime.weekday: Monday == 0 ... Sunday == 6
    df["Weekday"] = timestamps.apply(datetime.weekday)
    with_daily_volume = add_daily_call_volume_feat(df)
    return create_target_variable(with_daily_volume)

# Training

In [None]:
class SFD_Train:
    """
    class used for training the regressor inside a preprocessing pipeline

    Numerical columns are standardised, all remaining columns are one-hot
    encoded, and the regressor passed to the constructor is fitted on the result.
    """
    def __init__(self, df_train, y_train, classifier):
        """
        :param df_train: pd.DataFrame, training features
        :param y_train: training targets
        :param classifier: estimator used as the final pipeline step
        """
        self.classifier = classifier
        self.df_train = df_train
        self.y_train = y_train
        self.pipe = None
        self.numerical_cols = df_train.select_dtypes('number').columns
        # every column that is not numerical is treated as categorical
        self.categorical_cols = pd.Index(np.setdiff1d(df_train.columns, self.numerical_cols))

    def create_pipe(self):
        """
        Builds the pipeline (preprocessing + estimator) and stores it in ``self.pipe``
        """
        numerical_pipe = Pipeline([
            ('scaler', StandardScaler())])

        categorical_pipe = Pipeline([(
            'encoder', OneHotEncoder(drop='first', handle_unknown='error'))])

        preprocessors = ColumnTransformer(transformers=[
            ('num', numerical_pipe, self.numerical_cols),
            ('cat', categorical_pipe, self.categorical_cols)
        ])

        # Use the estimator handed to the constructor rather than a module-level
        # variable; the step keeps its name 'tree'.
        self.pipe = Pipeline([('preprocessors', preprocessors), ('tree', self.classifier)])

    def train(self):
        """
        Fits the pipeline on the training data (building it first if necessary)
        """
        if self.pipe is None:
            self.create_pipe()
        self.pipe.fit(self.df_train, self.y_train)

    def save_model(self, save_path):
        """
        Writes the pipeline to disk with joblib
        :param save_path: destination file path
        """
        joblib.dump(self.pipe, save_path)

# Testing

In [None]:
class SFD_Test:
    """
    class used to test the trained pipeline on the train and test split
    """
    def __init__(self, a):
        """
        :param a: object providing ``df_train``, ``y_train``, ``df_test`` and ``y_test``
        """
        self.df_train = a.df_train
        self.y_train = a.y_train
        self.df_test = a.df_test
        self.y_test = a.y_test
        self.pipe = None       # set by load_model
        self.test_pred = None  # set by eval_on_test / plot_results

    def load_model(self, path):
        """
        Loads a pipeline that was stored with joblib
        :param path: file path of the stored pipeline
        """
        # NOTE: joblib files are pickle based - only load files from a trusted source.
        self.pipe = joblib.load(path)

    def eval_on_train(self):
        """
        Prints the MSE and the R2 score of the pipeline on the training data
        """
        train_loss = mean_squared_error(self.y_train, self.pipe.predict(self.df_train))
        train_R2_score = self.pipe.score(self.df_train, self.y_train)
        print('Training MSE Loss: {} \n Training R2 score {}'.format(train_loss, train_R2_score))

    def eval_on_test(self):
        """
        Prints the MSE and the R2 score of the pipeline on the test data and
        keeps the predictions in ``self.test_pred``
        """
        self.test_pred = self.pipe.predict(self.df_test)
        test_loss = mean_squared_error(self.y_test, self.test_pred)
        test_R2_score  = self.pipe.score(self.df_test, self.y_test)
        print('Test MSE Loss: {} \n Test R2 score {}'.format(test_loss, test_R2_score))

    def plot_results(self):
        """
        Plots true vs. predicted hourly call volume for March 1st-7th of the
        test data, saves the figure as PNG and shows it
        :raises ValueError: if the test data holds no row for that week
        """
        if self.test_pred is None:
            # allow plotting without a preceding eval_on_test() call
            self.test_pred = self.pipe.predict(self.df_test)

        # positional index, so that row labels line up with the prediction array
        features = self.df_test.reset_index(drop=True)
        in_week = (features["Month"] == 3) & features["Day"].between(1, 7)
        # first row of every (day, hour) that occurs; hours without any call are skipped
        first_rows = (features.loc[in_week, ["Day", "Hour"]]
                      .drop_duplicates()
                      .sort_values(["Day", "Hour"]))
        if first_rows.empty:
            raise ValueError("No test data for the first week of March")

        positions = first_rows.index.to_numpy()
        # hour offset within the week (0 ... 167) keeps the x axis correct when hours are missing
        hours = ((first_rows["Day"] - 1) * 24 + first_rows["Hour"]).to_numpy()

        plt.plot(hours, self.y_test.to_numpy()[positions])
        plt.plot(hours, np.asarray(self.test_pred)[positions])
        plt.xlabel("Hour")
        plt.ylabel("Call Volume")
        plt.title("Hourly Call Volume 1st week of March'19")
        plt.legend(["True", "Prediction"])
        plt.savefig("Hourly Call Volume 1st week March'19.png")
        plt.show()

# Preprocessing

In [None]:
# Load the raw call export (the path is specific to the author's machine)
a = SFD_Set("C:/Users/phili/Downloads/Seattle_Real_Time_Fire_911_Calls.csv")
a.split_datetime()  # parse "Datetime" and add Year/Month/Day/Hour columns
a.drop_null_values()
a.start_year(2014)  # keep the calls from 2014 onwards
a.df = create_features(a.df)  # reads "Datetime", so it has to run before the column is dropped below
a.drop_redundant_cols()
a.train_test_split(2019)  # train on the years before 2019, test on 2019

# Training

In [None]:
# AdaBoost ensemble of depth-5 regression trees with at most 300 boosting rounds;
# the seeded RandomState is passed as random_state
tree = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=5), n_estimators=300, random_state=np.random.RandomState(1))
b = SFD_Train(a.df_train,a.y_train, tree)
b.create_pipe()
b.train()
b.save_model("./dec_tree_pipe.joblib")  # read again by the testing cell

# Testing

In [None]:
c = SFD_Test(a)  # takes over the train/test split held by `a`
c.load_model("./dec_tree_pipe.joblib")  # same path the training cell saved to
c.eval_on_train()
c.eval_on_test()  # also stores the test predictions that plot_results reads
c.plot_results()