In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from collections import namedtuple
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.metrics import fbeta_score
from sklearn.metrics.scorer import make_scorer
from sklearn.metrics import f1_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns

In [9]:
Grid = namedtuple("Grid", ['model', 'param_grid'])

In [10]:
grids = [
    Grid(LogisticRegression,
         {'estimator__solver': ('liblinear',)}),
    Grid(BaggingClassifier,
        {'estimator__n_estimators': (10, 200, 400, 800),
         'estimator__max_samples': (0.2, 0.4, 0.8, 1.0),
         'estimator__max_features': (0.2, 0.4, 0.8, 1.0)}),
    Grid(RandomForestClassifier,
        {'estimator__max_depth': (75, 100, None),
         'estimator__max_features': ('auto', 'log2', None),
         'estimator__n_estimators': (10, 200, 400, 600, 800)}),
    Grid(GradientBoostingClassifier,
         {'estimator__max_depth': (3, 4, 5),
          'estimator__max_features': ('auto', 'log2', None),
          'estimator__n_estimators': (10, 100, 200, 400, 800)}),
    Grid(SVC,
    {"estimator__C": (4, 8, 12),
     "estimator__degree": (3, 4, 5)})
]

# Notes
- Once i recieve the data - offset the target column by 1 so one year predicts the next
- Drop all polygon data & assume we have sufficient features to predict the next year
- Each row represents the status of a given polygon & the incidents that year

In [11]:
def get_next_year(year, region, df):
    if year+1 not in set(df[YEAR_COL]):
        return float('nan')
    return df.loc[(df[YEAR_COL]==year+1) & (df[REGION_COL]==region), TARGET].values[0]

In [57]:
df = pd.read_csv(r"C:\Users\olive\Documents\GitHub\MontrealFireSafetyProject\data\all_data_clean.csv", index_col=0)

TARGET = 'incident'
YEAR_COL = 'Year'
REGION_COL = 'Poly_Key'

df[REGION_COL] = df[REGION_COL].astype('str')
df['Next Year'] = df['Year'].apply(lambda x: int(x+1))

left = df.copy()
right = df.copy()

left['idx'] = left.apply(lambda row: "{}_{}".format(row['Poly_Key'], row['Year']), axis=1)
right['idx'] = right.apply(lambda row: "{}_{}".format(row['Poly_Key'], row['Next Year']), axis=1)

left = left.set_index(['idx'], drop=True)
right = right.set_index(['idx'], drop=True)

tot = left.join(right, lsuffix='_nextyear', rsuffix='_thisyear', how='inner')
tot = tot[['Poly_Key_nextyear', 'Building_fire_nextyear', 'incident_nextyear', 
           *[x for x in tot.columns if x[-9:]=='_thisyear']]]

tot = tot.drop(['Next Year_thisyear'], axis=1)
tot = tot.set_index(['Poly_Key_thisyear', 'Year_thisyear'], drop=True)

df = tot.reset_index(drop=True).copy()
X, y = df.drop([TARGET + '_nextyear'], axis=1), df[TARGET + '_nextyear']
y = y.apply(lambda x: 1 if x > 0 else 0)

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [59]:
X_train.head()

Unnamed: 0,Poly_Key_nextyear,Building_fire_nextyear,CFSAUID_thisyear,Building_fire_thisyear,incident_thisyear,Desc_gr_clean_Alarmes-incendies_thisyear,Desc_gr_clean_Autres incendies_thisyear,Desc_gr_clean_Fausses alertes/annulations_thisyear,Desc_gr_clean_Nouveau_thisyear,Desc_gr_clean_Premier répondant_thisyear,...,gr_0_to_14_years_thisyear,gr_15_to_29_ years_thisyear,gr_30_to_64_years_thisyear,gr_65_years_+_thisyear,gr_married_commomlawpartner_thisyear,income_per_capita_thisyear,Income_0_to_35K_thisyear,Income_35K_to_70K_thisyear,Income_70K+_thisyear,population tax payers_thisyear
334698,POLYGON ((-73.80030381717036 45.49517336625862...,0.0,H9B,0.0,5.0,2,0,0,0,3.0,...,0.16,0.19,0.45,0.2,0.47,41396.325783,0.58,0.27,0.15,16600
44697,"POLYGON ((-73.75406397283518 45.4462415462429,...",0.0,H9S,0.0,1.0,0,0,0,0,0.0,...,0.13,0.17,0.48,0.2,0.46,57937.509624,0.46,0.3,0.24,18235
288102,POLYGON ((-73.67302075148918 45.53332074744709...,0.0,H4N,0.0,0.0,0,0,0,0,0.0,...,0.19,0.18,0.43,0.13,0.38,27812.305899,0.72,0.2,0.07,21615
196600,"POLYGON ((-73.5838727421382 45.49549595730367,...",0.0,H3H,0.0,21.0,8,1,0,0,11.0,...,0.06,0.42,0.38,0.14,0.33,38449.708056,0.7,0.17,0.13,14585
82265,POLYGON ((-73.88462916907493 45.46855329787981...,0.0,H9J,0.0,0.0,0,0,0,0,0.0,...,0.23,0.23,0.54,0.07,0.53,53589.65464,0.49,0.27,0.24,17350


In [60]:
X_train.columns

Index(['Poly_Key_nextyear', 'Building_fire_nextyear', 'CFSAUID_thisyear',
       'Building_fire_thisyear', 'incident_thisyear',
       'Desc_gr_clean_Alarmes-incendies_thisyear',
       'Desc_gr_clean_Autres incendies_thisyear',
       'Desc_gr_clean_Fausses alertes/annulations_thisyear',
       'Desc_gr_clean_Nouveau_thisyear',
       'Desc_gr_clean_Premier répondant_thisyear',
       'Desc_gr_clean_Sans incendie_thisyear', 'Num_units_total_thisyear',
       'Num_units_mean_thisyear', 'Num_units_median_thisyear',
       'population_thisyear', 'total_dwellings_thisyear',
       'gr_0_to_14_years_thisyear', 'gr_15_to_29_ years_thisyear',
       'gr_30_to_64_years_thisyear', 'gr_65_years_+_thisyear',
       'gr_married_commomlawpartner_thisyear', 'income_per_capita_thisyear',
       'Income_0_to_35K_thisyear', 'Income_35K_to_70K_thisyear',
       'Income_70K+_thisyear', 'population tax payers_thisyear'],
      dtype='object')

In [61]:
numerical_features = [
       'Building_fire_thisyear', 'incident_thisyear',
       'Desc_gr_clean_Alarmes-incendies_thisyear',
       'Desc_gr_clean_Autres incendies_thisyear',
       'Desc_gr_clean_Fausses alertes/annulations_thisyear',
       'Desc_gr_clean_Nouveau_thisyear',
       'Desc_gr_clean_Premier répondant_thisyear',
       'Desc_gr_clean_Sans incendie_thisyear', 'Num_units_total_thisyear',
       'Num_units_mean_thisyear', 'Num_units_median_thisyear',
       'population_thisyear', 'total_dwellings_thisyear',
       'gr_0_to_14_years_thisyear', 'gr_15_to_29_ years_thisyear',
       'gr_30_to_64_years_thisyear', 'gr_65_years_+_thisyear',
       'gr_married_commomlawpartner_thisyear', 'income_per_capita_thisyear',
       'Income_0_to_35K_thisyear', 'Income_35K_to_70K_thisyear',
       'Income_70K+_thisyear', 'population tax payers_thisyear'
]

categorical_features = []

In [62]:
categorical_pipeline = Pipeline(steps=[('one_hot_encoder', OneHotEncoder(sparse=False, 
                                                                         drop='first',
                                                                         handle_unknown='error'))])

In [63]:
numerical_pipeline = Pipeline(steps=[('min_max_scalar', MinMaxScaler())])

In [64]:
prep_pipeline = ColumnTransformer([
    ("cat", categorical_pipeline, categorical_features),
    ("num", numerical_pipeline, numerical_features)
])

In [65]:
full_pipeline = Pipeline(steps=[('prep', prep_pipeline),
                                #('feature_engineering', SelectFromModel(estimator=DecisionTreeClassifier(), threshold='0.5*mean')),
                                ('estimator', LogisticRegression())])

In [66]:
full_pipeline.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('prep',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  Pipeline(memory=None,
                                                           steps=[('one_hot_encoder',
                                                                   OneHotEncoder(categorical_features=None,
                                                                                 categories=None,
                                                                                 drop='first',
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='error',
                                    

In [67]:
y_pred = full_pipeline.predict(X_test)

In [68]:
f1_score(y_test, y_pred)

0.835403704265759

In [69]:
from sklearn.metrics import accuracy_score

In [70]:
len(y_test[y_test==1])/len(y_test)

0.6599613592453688

In [71]:
accuracy_score(y_test, y_pred)

0.7842841989620033

### Baseline: Predict next year there will be an incident if there was this year

In [73]:
y_b1 = X_test['incident_thisyear']
y_b1 = y_b1.apply(lambda x: 1 if x>0 else 0)

In [74]:
accuracy_score(y_test, y_b1)

0.7849092699928022

In [75]:
feature_importances = dict(zip(numerical_features, full_pipeline.steps[-1][1].coef_[0, :]))

In [76]:
feature_importances

{'Building_fire_thisyear': -2.1405831075482626,
 'incident_thisyear': 51.1871725206734,
 'Desc_gr_clean_Alarmes-incendies_thisyear': 7.759257406685564,
 'Desc_gr_clean_Autres incendies_thisyear': 6.854819467427543,
 'Desc_gr_clean_Fausses alertes/annulations_thisyear': 1.5644352912977417,
 'Desc_gr_clean_Nouveau_thisyear': 0.07957867796472994,
 'Desc_gr_clean_Premier répondant_thisyear': 43.14346229360581,
 'Desc_gr_clean_Sans incendie_thisyear': 20.29824970382592,
 'Num_units_total_thisyear': 7.523142433618715,
 'Num_units_mean_thisyear': 16.844833390698813,
 'Num_units_median_thisyear': 6.304380799560657,
 'population_thisyear': 1.401726798933922,
 'total_dwellings_thisyear': -1.264562770257709,
 'gr_0_to_14_years_thisyear': 0.22202197938524604,
 'gr_15_to_29_ years_thisyear': -0.6710356969834613,
 'gr_30_to_64_years_thisyear': 1.0797738929734615,
 'gr_65_years_+_thisyear': 0.2176049787444417,
 'gr_married_commomlawpartner_thisyear': -1.713122886555357,
 'income_per_capita_thisyear':