In [114]:
#packages
import pandas as pd
import numpy as np
import warnings
import sqlite3

from numpy import asarray
from numpy import mean
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from xgboost import XGBRFRegressor

In [115]:
# hide warnings
warnings.filterwarnings('ignore')

In [116]:
def prepare_data():
    df = read_data()
    df = format_columns(df)
    df = select_columns(df)

    #Ignore NaN rows
    df = df[df['minutes'].notna()]
    #df = df[df['point_x'].notna()]
    #df = df[df['point_y'].notna()]
    
    return df

In [117]:
def read_data():
    con = sqlite3.connect("fire_data_v3.db")
    cur = con.cursor()

    df = pd.read_sql('select * from fire_incidents',con = con)
    
    return df

In [118]:
def format_columns(df):
    df['alarm_format'] = pd.to_datetime(df['alarm_dttm'])
    df['arrival_format'] = pd.to_datetime(df['arrival_dttm'])
    df['arrival_time'] = (df['arrival_format'] - df['alarm_format'])
    df['minutes'] = (df.arrival_time.dt.seconds) / 60
    df['seconds'] = (df.arrival_time.dt.seconds)
    df['day_of_week'] = df['alarm_format'].dt.day_name().astype(str)
    df['dayflag'] = (df.alarm_format.dt.hour > 5) & (df.alarm_format.dt.hour < 18)
    
    point = df['point'].str.split(' ', n = 2, expand = True)
    point_x = point[1].str.split('(', n = 1, expand = True)
    point_y = point[2].str.split(')', n = 1, expand = True)
    
    df['point_x'] = point_x[1]
    df['point_y'] = point_y[0]
    
    return df

In [119]:
def select_columns(df):
    cols = ['minutes','seconds','alarm_format','arrival_format','day_of_week','dayflag','zipcode',\
        'battalion', 'station_area', 'ems_units', 'number_of_alarms', 'primary_situation',\
        'action_taken_primary', 'property_use', 'neighborhood_district', 'supervisor_district',\
        'arrival_time', 'point_x', 'point_y']
    
    return df[cols]

In [121]:
df = prepare_data()

df.head()

Unnamed: 0,minutes,seconds,alarm_format,arrival_format,day_of_week,dayflag,zipcode,battalion,station_area,ems_units,number_of_alarms,primary_situation,action_taken_primary,property_use,neighborhood_district,supervisor_district,arrival_time,point_x,point_y
0,8.7,522.0,2008-04-01 18:06:37,2008-04-01 18:15:19,Tuesday,False,94110,B06,11,0,1,412 - Gas leak (natural gas or LPG),86 - Investigate,"962 - Residential street, road or residential dr",Bernal Heights,9.0,0 days 00:08:42,-122.41837339,37.74208979
1,5.633333,338.0,2008-04-01 18:00:52,2008-04-01 18:06:30,Tuesday,False,94107,B10,37,0,1,552 - Police matter,76 - Provide water,"960 - Street, other",Potrero Hill,10.0,0 days 00:05:38,-122.39489,37.756291
2,3.283333,197.0,2008-04-01 18:42:06,2008-04-01 18:45:23,Tuesday,False,94105,B03,1,0,1,"210 - Steam Rupture, steam, other",86 - Investigate,429 - Multifamily dwellings,South of Market,,0 days 00:03:17,-122.407468,37.78008
3,4.783333,287.0,2008-04-01 19:03:52,2008-04-01 19:08:39,Tuesday,False,94102,B02,36,0,1,522 - Water or steam leak,64 - Shut down system,"400 - Residential, other",Hayes Valley,5.0,0 days 00:04:47,-122.42684908,37.77612642
4,7.6,456.0,2008-04-01 19:16:12,2008-04-01 19:23:48,Tuesday,False,94121,B07,14,0,1,"520 - Water problem, other","00 - Action taken, other","960 - Street, other",Outer Richmond,,0 days 00:07:36,-122.4863941,37.77428492


In [122]:
class ModelData:
    x = None
    y = None
    x_train = None
    x_test = None
    y_train = None
    y_test = None

    model = None
    target_to_predict = None
    predictors = None
    XGB = None

In [123]:
def configure_sets(md):
    md.target_to_predict = 'minutes'
    md.predictors = ['point_x', 'point_y']
    
    md.x = df[md.predictors].values
    md.y = df[md.target_to_predict].values

    #Split the data into training and testing set
    md.x_train, md.x_test, md.y_train, md.y_test = train_test_split(md.x, md.y, test_size=0.8, random_state=42)

In [124]:
def create_model(md):
    #RegModel = XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=500, objective='reg:linear', booster='gbtree')
    md.model = XGBRFRegressor(n_estimators=100, subsample=0.9, colsample_bynode=0.2)

    #Printing all the parameters of XGBoost
    print(md.model)
    
    #Creating the model on Training Data
    md.XGB = md.model.fit(md.x_train, md.y_train)

    #Measuring Goodness of fit in Training data
    #print('R2 Value:',metrics.r2_score(y_train, XGB.predict(x_train)))

    #Measuring accuracy on Testing Data
    #print('Accuracy',100- (np.mean(np.abs((y_test - prediction) / y_test)) * 100))

    #Plotting the feature importance for Top 10 most important columns 
    #%matplotlib inline
    #feature_importances = pd.Series(XGB.feature_importances_, index=Predictors)
    #feature_importances.nlargest(10).plot(kind='barh')

In [125]:
def predict_single_value(md):
    row = [-122.429504,37.783009]
    row = asarray([row])
    
    prediction = md.model.predict(row)
    
    print('Prediction: %f' % prediction[0])

In [126]:
def predict_using_training_set(md):
    prediction = md.XGB.predict(md.x_test)
    
    training_set_predictions = pd.DataFrame(data = md.x_test, columns = md.predictors)
    training_set_predictions[md.target_to_predict] = md.y_test
    training_set_predictions[('Predicted' + md.target_to_predict)] = prediction
    training_set_predictions.head()
    
    print(training_set_predictions)

In [127]:
model_data = ModelData()

configure_sets(model_data)
create_model(model_data)

XGBRFRegressor(base_score=None, booster=None, colsample_bylevel=None,
               colsample_bynode=0.2, colsample_bytree=None,
               enable_categorical=False, gamma=None, gpu_id=None,
               importance_type=None, interaction_constraints=None,
               max_delta_step=None, max_depth=None, min_child_weight=None,
               missing=nan, monotone_constraints=None, n_estimators=100,
               n_jobs=None, num_parallel_tree=None,
               objective='reg:squarederror', predictor=None, random_state=None,
               reg_alpha=None, scale_pos_weight=None, subsample=0.9,
               tree_method=None, validate_parameters=None, verbosity=None)


In [128]:
predict_single_value(model_data)

Prediction: 4.658382


In [129]:
predict_using_training_set(model_data)

            point_x    point_y   minutes  Predictedminutes
0       -122.465494  37.782889  3.166667          5.634876
1       -122.468669  37.734163  4.600000          5.955068
2        -122.38416  37.616901  4.666667          5.500071
3       -122.391185  37.733141  6.400000          5.565484
4       -122.435376   37.77148  3.050000          4.774275
...             ...        ...       ...               ...
452959  -122.384996  37.720286  5.183333          5.657396
452960  -122.405559  37.731561  0.000000          5.948973
452961  -122.445347  37.770208  4.166667          4.864791
452962  -122.431009  37.775927  5.016667          4.685983
452963  -122.489627  37.780865  4.083333          5.203097

[452964 rows x 4 columns]
