In [1]:
#packages
import pandas as pd
import numpy as np
import warnings

from numpy import asarray
from numpy import mean
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from xgboost import XGBRFRegressor

In [2]:
# hide warnings
warnings.filterwarnings('ignore')

In [3]:
def prepare_data():
    df = read_data()
    df = format_columns(df)
    df = select_columns(df)

    #Ignore NaN rows
    df = df[df['minutes'].notna()]
    df = df[df['point_x'].notna()]
    df = df[df['point_y'].notna()]
    
    return df

In [4]:
def read_data():
    df = pd.read_csv("fire_incidents.csv")

    return df

In [5]:
def format_columns(df):
    df['alarm_format'] = pd.to_datetime(df['alarm_dttm'])
    df['arrival_format'] = pd.to_datetime(df['arrival_dttm'])
    df['arrival_time'] = (df['arrival_format'] - df['alarm_format'])
    df['minutes'] = (df.arrival_time.dt.seconds) / 60
    df['seconds'] = (df.arrival_time.dt.seconds)
    df['day_of_week'] = df['alarm_format'].dt.day_name().astype(str)
    df['dayflag'] = (df.alarm_format.dt.hour > 5) & (df.alarm_format.dt.hour <18)
    
    return df

In [6]:
def select_columns(df):
    cols = ['minutes','seconds','alarm_format','arrival_format','day_of_week','dayflag','zipcode',\
        'battalion', 'station_area', 'ems_units', 'number_of_alarms', 'primary_situation',\
        'action_taken_primary', 'property_use', 'neighborhood_district', 'supervisor_district',\
        'arrival_time', 'point_x', 'point_y']
    
    return df[cols]

In [7]:
df = prepare_data()

df.head()

Unnamed: 0,minutes,seconds,alarm_format,arrival_format,day_of_week,dayflag,zipcode,battalion,station_area,ems_units,number_of_alarms,primary_situation,action_taken_primary,property_use,neighborhood_district,supervisor_district,arrival_time,point_x,point_y
0,9.0,540.0,2008-01-04 18:06:00,2008-01-04 18:15:00,Friday,False,94110,B06,11,0,1,412 - Gas leak (natural gas or LPG),86 - Investigate,"962 - Residential street, road or residential dr",Bernal Heights,9.0,0 days 00:09:00,-122.418373,37.74209
1,6.0,360.0,2008-01-04 18:00:00,2008-01-04 18:06:00,Friday,False,94107,B10,37,0,1,552 - Police matter,76 - Provide water,"960 - Street, other",Potrero Hill,10.0,0 days 00:06:00,-122.39489,37.756291
2,3.0,180.0,2008-01-04 18:42:00,2008-01-04 18:45:00,Friday,False,94105,B03,1,0,1,"210 - Steam Rupture, steam, other",86 - Investigate,429 - Multifamily dwellings,South of Market,,0 days 00:03:00,-122.407468,37.78008
3,5.0,300.0,2008-01-04 19:03:00,2008-01-04 19:08:00,Friday,False,94102,B02,36,0,1,522 - Water or steam leak,64 - Shut down system,"400 - Residential, other",Hayes Valley,5.0,0 days 00:05:00,-122.426849,37.776126
4,7.0,420.0,2008-01-04 19:16:00,2008-01-04 19:23:00,Friday,False,94121,B07,14,0,1,"520 - Water problem, other","00 - Action taken, other","960 - Street, other",Outer Richmond,,0 days 00:07:00,-122.486394,37.774285


In [8]:
class ModelData:
    x = None
    y = None
    x_train = None
    x_test = None
    y_train = None
    y_test = None

    model = None
    target_to_predict = None
    predictors = None
    XGB = None

In [33]:
def configure_sets(md):
    md.target_to_predict = 'minutes'
    md.predictors = ['point_x', 'point_y']
    
    md.x = df[md.predictors].values
    md.y = df[md.target_to_predict].values

    #Split the data into training and testing set
    md.x_train, md.x_test, md.y_train, md.y_test = train_test_split(md.x, md.y, test_size=0.8, random_state=42)

In [34]:
def create_model(md):
    #RegModel = XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=500, objective='reg:linear', booster='gbtree')
    md.model = XGBRFRegressor(n_estimators=100, subsample=0.9, colsample_bynode=0.2)

    #Printing all the parameters of XGBoost
    print(md.model)
    
    #Creating the model on Training Data
    md.XGB = md.model.fit(md.x_train, md.y_train)

    #Measuring Goodness of fit in Training data
    #print('R2 Value:',metrics.r2_score(y_train, XGB.predict(x_train)))

    #Measuring accuracy on Testing Data
    #print('Accuracy',100- (np.mean(np.abs((y_test - prediction) / y_test)) * 100))

    #Plotting the feature importance for Top 10 most important columns 
    #%matplotlib inline
    #feature_importances = pd.Series(XGB.feature_importances_, index=Predictors)
    #feature_importances.nlargest(10).plot(kind='barh')

In [35]:
def predict_single_value(md):
    row = [-122.429504,37.783009]
    row = asarray([row])
    
    prediction = md.model.predict(row)
    
    print('Prediction: %f' % prediction[0])

In [36]:
def predict_using_training_set(md):
    prediction = md.XGB.predict(md.x_test)
    
    training_set_predictions = pd.DataFrame(data = md.x_test, columns = md.predictors)
    training_set_predictions[md.target_to_predict] = md.y_test
    training_set_predictions[('Predicted' + md.target_to_predict)] = prediction
    training_set_predictions.head()
    
    print(training_set_predictions)

In [37]:
model_data = ModelData()

configure_sets(model_data)
create_model(model_data)

XGBRFRegressor(base_score=None, booster=None, colsample_bylevel=None,
               colsample_bynode=0.2, colsample_bytree=None,
               enable_categorical=False, gamma=None, gpu_id=None,
               importance_type=None, interaction_constraints=None,
               max_delta_step=None, max_depth=None, min_child_weight=None,
               missing=nan, monotone_constraints=None, n_estimators=100,
               n_jobs=None, num_parallel_tree=None,
               objective='reg:squarederror', predictor=None, random_state=None,
               reg_alpha=None, scale_pos_weight=None, subsample=0.9,
               tree_method=None, validate_parameters=None, verbosity=None)


In [38]:
predict_single_value(model_data)

Prediction: 4.637574


In [39]:
predict_using_training_set(model_data)

           point_x    point_y  minutes  Predictedminutes
0      -122.434701  37.761095      0.0          4.832744
1      -122.429504  37.783009      4.0          4.637574
2      -122.486879  37.779993      3.0          4.994109
3      -122.431999  37.786447      3.0          4.637574
4      -122.401935  37.787941      5.0          4.709226
...            ...        ...      ...               ...
452887 -122.445463  37.771122      4.0          4.986398
452888 -122.416204  37.732996      8.0          5.768368
452889 -122.461028  37.800052      3.0          6.111874
452890 -122.422544  37.798959      7.0          4.618063
452891 -122.403299  37.756381      6.0          5.100289

[452892 rows x 4 columns]
