In [1]:
#packages
import pandas as pd
import numpy as np
import warnings
import sqlite3

from numpy import asarray
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from xgboost import XGBRFRegressor

In [2]:

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas/sklearn/xgboost deprecation
# and chained-assignment warnings are hidden as well — consider narrowing it.
warnings.filterwarnings('ignore')

In [3]:
def create_connection(path):
    """Open the SQLite database at ``path`` and return the connection.

    The connection's ``text_factory`` is set to ``str`` so text values are
    handed back as Python strings.
    """
    db = sqlite3.connect(path)
    db.text_factory = str
    return db

In [4]:
def execute_query(connection, query):
    """Run a single SQL statement on ``connection`` and commit it.

    Returns the status string "Query Blank" when ``query`` is the empty
    string (nothing is executed), otherwise "Query executed successfully".
    Errors raised by SQLite propagate to the caller.
    """
    cur = connection.cursor()

    # Guard clause: an empty statement is reported, not executed.
    if query == "":
        return "Query Blank"

    cur.execute(query)
    connection.commit()
    return "Query executed successfully"

In [5]:

def create_table(connection, tableName, columns):
    """Drop ``tableName`` if it exists, then create it with ``columns``.

    ``columns`` is the column-definition text placed between the parentheses
    of the CREATE TABLE statement. Both values are interpolated into the SQL
    as-is. Returns the status string of the CREATE statement.
    """
    drop_statement = F"DROP TABLE IF EXISTS {tableName}; "
    create_statement = F"CREATE TABLE {tableName} ({columns}); "

    execute_query(connection, drop_statement)
    return execute_query(connection, create_statement)


In [6]:

def insert_into_table(connection, table_name, columns, records):
    """Insert one row into ``table_name`` using bound parameters.

    Parameters
    ----------
    connection : sqlite3.Connection
        Open database connection; the insert is committed before returning.
    table_name : str
        Target table. Interpolated into the statement (identifiers cannot be
        bound), so it must come from trusted code, not from the data file.
    columns : str
        Comma-separated column list, interpolated like ``table_name``.
    records : iterable of str
        Field values for the row, one per column.

    Returns
    -------
    str
        "Query executed successfully" once the row is committed.

    Raises
    ------
    sqlite3.Error
        If SQLite rejects the statement (e.g. ``records`` is empty or its
        length does not match ``columns``).

    Values are passed as ``?`` parameters instead of being spliced into the
    SQL text, so apostrophes and other special characters in the data are
    stored verbatim rather than being stripped or breaking the statement.
    """
    def _bind(field):
        # Digit-only fields are bound as integers, mirroring the unquoted
        # numeric literals the statement used to contain. Values outside the
        # signed 64-bit range stay as text because SQLite cannot bind them.
        if field.isdecimal():
            number = int(field)
            if -(2 ** 63) <= number < 2 ** 63:
                return number
        return field

    params = [_bind(field) for field in records]
    placeholders = ",".join("?" * len(params))

    sql = F"INSERT INTO {table_name} ({columns}) VALUES ({placeholders}); "
    cursor = connection.cursor()
    cursor.execute(sql, params)
    connection.commit()
    return "Query executed successfully"


In [7]:
import csv

def is_successful(result):
    """Return True when the status value ``result`` contains "success"."""
    marker = "success"
    return marker in result
    
def import_file(connection, file, table_name, columns):
    """Insert every row of a comma-delimited UTF-8 file into ``table_name``.

    Parameters
    ----------
    connection : sqlite3.Connection
        Open database connection passed through to ``insert_into_table``.
    file : str
        Path of the CSV file to read.
    table_name : str
        Target table name.
    columns : str
        Comma-separated column list matching the field order in the file.

    Returns
    -------
    str
        The status of the last insert, the first non-successful status if one
        occurs (the import stops there), or "No rows imported" when the file
        contains no non-blank rows.

    Every non-blank row is inserted as data; no header row is skipped.
    """
    # Defined up front so an empty file yields a status instead of an
    # UnboundLocalError at the final return.
    result = "No rows imported"

    # newline="" is what the csv module expects so that quoted fields with
    # embedded line breaks are parsed correctly. The handle gets its own name
    # so the ``file`` path argument is not shadowed.
    with open(file, encoding="utf8", newline="") as handle:
        for row in csv.reader(handle, delimiter=','):
            # csv.reader yields [] for blank lines; there is nothing to insert.
            if not row:
                continue

            result = insert_into_table(connection, table_name, columns, row)
            if not is_successful(result):
                return result

    return result

In [8]:
# Open the SQLite database used by the rest of the notebook.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
con = create_connection('C:\\Data\\fire_data_v4.db\\fire_data_v4.db')


In [9]:
# (Re)create the Truck_Engine_Fire_Station table; an existing table of that name is dropped first.
create_table(con, "Truck_Engine_Fire_Station", "Station text, Type text, Unit text, FireStation text, Facility_ID text")


'Query executed successfully'

In [10]:
# Load the CSV rows into the table, one INSERT per row.
# NOTE(review): the table name is lower-case here but mixed-case where it is created;
# this relies on SQLite matching table names case-insensitively. The path is machine-specific.
import_file(con, "C:\\repos\\gatech\\datavisualanalytics\\classproject\\firstrespondersites\\jyEDA\\clean\\truck_engine_fire_station.csv", "truck_engine_fire_station", "Station, Type, Unit, FireStation, Facility_ID")


'Query executed successfully'

In [20]:
def read_data(cpm):
    """Load baseline-scenario TRUCK/ENGINE call records into a DataFrame.

    Parameters
    ----------
    cpm : sqlite3.Connection or other
        When this is a SQLite connection it is used to run the query. For any
        other value the module-level ``con`` connection is used, which is what
        this function always did before.

    Returns
    -------
    pandas.DataFrame
        One row per match of the joined tables, carrying the catchment index,
        zone, accessibility score, scenario name, on-scene and response
        timestamps, facility id, ``primary_situation_index`` and case location.
    """
    # Prefer the connection handed in by the caller over the notebook global,
    # so the function does not silently depend on hidden kernel state.
    connection = cpm if isinstance(cpm, sqlite3.Connection) else con

    # The statement text is unchanged; only the line-continuation style differs.
    query = (
        "select floating_catchment_output.[index] "
        ", floating_catchment_output.zone_idx "
        ", floating_catchment_output.accessibility_score "
        ", floating_catchment_output.scenario_name "
        ", calls_for_service.on_scene_dttm "
        ", calls_for_service.response_dttm   "
        ", fire_stations.facility_id "
        ", category_mappings.[index] as primary_situation_index "
        ", calls_for_service.case_location "
        "from floating_catchment_output "
        "inner join zone_idx_to_incident on  "
        "zone_idx_to_incident.zone_idx = floating_catchment_output.zone_idx "
        "inner join calls_for_service on calls_for_service.incident_number = zone_idx_to_incident.incident_number "
        "left join fire_incidents AS fi on calls_for_service.incident_number = fi.incident_number "
        "left join category_mappings on fi.primary_situation = category_mappings.primary_situation "
        "left join Truck_Engine_Fire_Station on calls_for_service.unit_id = Truck_Engine_Fire_Station.Unit "
        "left join fire_stations on fire_stations.facility_id = Truck_Engine_Fire_Station.Facility_ID "
        "where calls_for_service.unit_type in ('TRUCK', 'ENGINE') "
        "and scenario_name = 'baseline'"
    )

    # pandas manages its own cursor, so none is opened here.
    return pd.read_sql(query, con=connection)

In [21]:
# Load the baseline-scenario call records into a DataFrame.
df = read_data(con)


In [22]:
 # Row count of the loaded frame.
 len(df)

137437

In [24]:
def format_columns(df):
    """Add arrival-time, calendar and coordinate columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``on_scene_dttm``, ``response_dttm`` and
        ``case_location``. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the two timestamp columns converted to datetimes
        and these columns added: ``arrival_time`` (on-scene minus response),
        ``minutes``, ``seconds``, ``day_of_week``, ``dayflag`` (response hour
        strictly between 5 and 18), ``point_x`` and ``point_y``.

    ``point_x`` and ``point_y`` are the text between "(" and the following
    space, and between that space and ")", in ``case_location``; they are
    left as strings.
    """
    df['on_scene_dttm'] = pd.to_datetime(df['on_scene_dttm'])
    df['response_dttm'] = pd.to_datetime(df['response_dttm'])
    df['arrival_time'] = df['on_scene_dttm'] - df['response_dttm']

    # total_seconds() is the full signed duration. The `.dt.seconds` accessor
    # is only the seconds component (0..86399) of the timedelta, so it drops
    # whole days and turns a negative duration into a large positive one.
    elapsed_seconds = df['arrival_time'].dt.total_seconds()
    df['minutes'] = elapsed_seconds / 60
    df['seconds'] = elapsed_seconds

    df['day_of_week'] = df['response_dttm'].dt.dayofweek
    response_hour = df['response_dttm'].dt.hour
    df['dayflag'] = (response_hour > 5) & (response_hour < 18)

    # Split on spaces into three pieces, then peel the parentheses off the
    # second and third pieces.
    point = df['case_location'].str.split(' ', n = 2, expand = True)
    point_x = point[1].str.split('(', n = 1, expand = True)
    point_y = point[2].str.split(')', n = 1, expand = True)

    df['point_x'] = point_x[1]
    df['point_y'] = point_y[0]

    return df

In [25]:
# Add the arrival-time, day-of-week, day/night flag and coordinate columns.
df = format_columns(df)

In [26]:
def select_columns(df):
    """Return ``df`` restricted to the modelling columns, in a fixed order.

    Raises ``KeyError`` if any of the expected columns is missing.
    """
    wanted = (
        'minutes',
        'seconds',
        'response_dttm',
        'on_scene_dttm',
        'day_of_week',
        'dayflag',
        'zone_idx',
        'accessibility_score',
        'facility_id',
        'primary_situation_index',
        'arrival_time',
        'point_x',
        'point_y',
    )
    return df[list(wanted)]

In [27]:
def prepare_data(df):
    """Coerce the id columns to numbers and drop rows with missing inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``minutes``, ``point_x``, ``point_y``, ``facility_id``
        and ``primary_situation_index``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``facility_id`` and ``primary_situation_index``
        are numeric (unparseable values become NaN) and every row with a
        missing value in any of the five columns above has been removed.
    """
    required = ['minutes', 'point_x', 'point_y', 'facility_id', 'primary_situation_index']

    # Work on a copy: the input is typically a column slice of another frame,
    # and assigning into it mutates the caller's data and triggers pandas'
    # chained-assignment warning.
    prepared = df.copy()
    for column in ('facility_id', 'primary_situation_index'):
        prepared[column] = pd.to_numeric(prepared[column], errors='coerce')

    # Ignore NaN rows
    return prepared.dropna(subset=required)

In [28]:
# Keep only the modelling columns, then drop rows with missing inputs.
df = select_columns(df)
df = prepare_data(df)
# Preview the first rows of the prepared frame.
df.head()

Unnamed: 0,minutes,seconds,response_dttm,on_scene_dttm,day_of_week,dayflag,zone_idx,accessibility_score,facility_id,primary_situation_index,arrival_time,point_x,point_y
4,1.75,105.0,2019-01-06 00:52:49,2019-01-06 00:54:34,6.0,False,8,0.000803,716.0,148.0,0 days 00:01:45,-122.507197717067,37.779525347186
5,4.116667,247.0,2019-01-06 00:53:23,2019-01-06 00:57:30,6.0,False,8,0.000803,696.0,148.0,0 days 00:04:07,-122.507197717067,37.779525347186
7,6.816667,409.0,2019-01-14 12:25:54,2019-01-14 12:32:43,0.0,True,8,0.000803,700.0,81.0,0 days 00:06:49,-122.513648358636,37.77848510937
10,2.966667,178.0,2019-01-14 12:41:12,2019-01-14 12:44:10,0.0,True,8,0.000803,698.0,81.0,0 days 00:02:58,-122.513648358636,37.77848510937
11,4.8,288.0,2019-01-14 12:24:23,2019-01-14 12:29:11,0.0,True,8,0.000803,706.0,81.0,0 days 00:04:48,-122.513648358636,37.77848510937


In [29]:
def configure_sets(md, test_size=0.8, random_state=42):
    """Attach the target, predictors and a train/test split to ``md``.

    Parameters
    ----------
    md : pandas.DataFrame or other attribute holder
        Receives the attributes ``target_to_predict``, ``predictors``, ``x``,
        ``y``, ``x_train``, ``x_test``, ``y_train`` and ``y_test``. When it is
        a DataFrame the feature and target columns are read from it; for any
        other object they are read from the module-level ``df``, which is
        where this function always read them before.
    test_size : float, optional
        Share of rows held out for testing, passed to ``train_test_split``.
        NOTE(review): the default of 0.8 keeps the original behaviour, which
        trains on 20% of the rows and tests on 80% — confirm that is intended.
    random_state : int, optional
        Seed passed to ``train_test_split``.
    """
    md.target_to_predict = 'minutes'
    md.predictors = ['day_of_week','dayflag', 'zone_idx', 'accessibility_score', 'facility_id', 'primary_situation_index', 'point_x', 'point_y']

    # Read from the object that was passed in rather than a notebook global.
    frame = md if isinstance(md, pd.DataFrame) else df

    md.x = frame[md.predictors].values
    md.y = frame[md.target_to_predict].values

    #Split the data into training and testing set
    md.x_train, md.x_test, md.y_train, md.y_test = train_test_split(
        md.x, md.y, test_size=test_size, random_state=random_state)


In [30]:
def create_model(md):
    #RegModel = XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=500, objective='reg:linear', booster='gbtree')
    md.model = XGBRFRegressor(n_estimators=4000, subsample=0.9, colsample_bynode=0.2)

    #Printing all the parameters of XGBoost
    print(md.model)
    
    #Creating the model on Training Data
    md.XGB = md.model.fit(md.x_train, md.y_train)
    
    prediction = md.XGB.predict(md.x_test)

    #Measuring Goodness of fit in Training data
    print('R2 Value:', metrics.r2_score(md.y_train, md.XGB.predict(md.x_train)))

    #Measuring accuracy on Testing Data
    print('Accuracy', 100 - (np.mean(np.abs((md.y_test - prediction) / md.y_test)) * 100))

    #Plotting the feature importance for Top 10 most important columns 
    %matplotlib inline
    feature_importances = pd.Series(md.XGB.feature_importances_, index = md.predictors)
    feature_importances.nlargest(10).plot(kind = 'barh')

In [31]:
def cross_validate_model(md):
    """Cross-validate ``md.model`` on ``md.x`` / ``md.y`` and print the MAE.

    Uses 10-fold cross-validation repeated 3 times with a fixed seed, on all
    available cores. Prints the mean absolute error across the folds followed
    by its standard deviation in parentheses.
    """
    # define the model evaluation procedure
    cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
    # evaluate the model and collect the scores
    n_scores = cross_val_score(md.model, md.x, md.y, scoring = 'neg_mean_absolute_error', cv = cv, n_jobs = -1)
    # report performance
    # The 'neg_mean_absolute_error' scorer yields negated errors (so that higher
    # is better); flip the sign so the number printed as "MAE" is the real,
    # non-negative error.
    print('MAE: %.3f (%.3f)' % (-mean(n_scores), std(n_scores)))

In [32]:
def predict_single_value(md, row=None):
    """Predict the target for a single feature row and print it.

    Parameters
    ----------
    md : object
        Holder exposing ``model`` (a fitted estimator with ``predict``) and,
        optionally, ``predictors`` (the feature names the model was fit on).
    row : sequence of numbers, optional
        Feature values in the same order as ``md.predictors``. Defaults to the
        original hard-coded two-value row.
        NOTE(review): that default has two values, while ``configure_sets``
        sets up eight predictors, so callers need to pass a full row.

    Raises
    ------
    ValueError
        If ``md.predictors`` is present and ``row`` has a different length.
    """
    if row is None:
        row = [-122.429504, 37.783009]

    # Fail with a clear message here instead of a shape error deep inside the
    # estimator when the row does not line up with the training features.
    expected = getattr(md, 'predictors', None)
    if expected is not None and len(row) != len(expected):
        raise ValueError(
            'row has %d values but the model expects %d predictors: %s'
            % (len(row), len(expected), list(expected)))

    features = asarray([row])

    prediction = md.model.predict(features)

    print('Prediction: %f' % prediction[0])

In [33]:
def predict_using_training_set(md):
    """Print the held-out rows alongside their actual and predicted target.

    Despite the name, the predictions are made on ``md.x_test`` (the test
    split), using the fitted estimator in ``md.XGB``. The printed frame has
    one column per entry of ``md.predictors``, the actual values under
    ``md.target_to_predict`` and the predictions under
    ``'Predicted' + md.target_to_predict``.
    """
    prediction = md.XGB.predict(md.x_test)

    comparison = pd.DataFrame(data = md.x_test, columns = md.predictors)
    comparison[md.target_to_predict] = md.y_test
    comparison['Predicted' + md.target_to_predict] = prediction

    # The earlier `.head()` call discarded its result and has been removed;
    # the whole frame is printed, as it always was.
    print(comparison)

In [34]:
# Attach the predictor/target arrays and the train/test split to `df` as attributes.
configure_sets(df)

In [35]:

# Fit the regressor on the training split and print its fit metrics;
# `df` doubles as the holder for the split arrays and the fitted model.
create_model(df)
# Cross-validation is left disabled here; uncomment the next line to run it.
#cross_validate_model(df)

XGBRFRegressor(base_score=None, booster=None, colsample_bylevel=None,
               colsample_bynode=0.2, colsample_bytree=None,
               enable_categorical=False, gamma=None, gpu_id=None,
               importance_type=None, interaction_constraints=None,
               max_delta_step=None, max_depth=None, min_child_weight=None,
               missing=nan, monotone_constraints=None, n_estimators=4000,
               n_jobs=None, num_parallel_tree=None,
               objective='reg:squarederror', predictor=None, random_state=None,
               reg_alpha=None, scale_pos_weight=None, subsample=0.9,
               tree_method=None, validate_parameters=None, verbosity=None)
