In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from collections import namedtuple
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.metrics import fbeta_score
from sklearn.metrics.scorer import make_scorer
from sklearn.metrics import f1_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns

In [9]:
# Lightweight record pairing a model (`model`) with the hyper-parameter
# grid to search for it (`param_grid`).
Grid = namedtuple("Grid", "model param_grid")

In [10]:
# Candidate model classes and, for each, the hyper-parameter values to search.
# Every key carries the 'estimator__' prefix, i.e. it targets a pipeline step
# named 'estimator' (the final step of `full_pipeline` uses that name).
#
# NOTE(review): 'auto' as a `max_features` value is not accepted by recent
# scikit-learn releases for the forest / boosting models -- confirm against
# the installed version before running a search.
# NOTE(review): SVC's `degree` is only used by the polynomial kernel; no
# `kernel` is set here, so with the default kernel the three `degree` values
# presumably yield identical fits -- confirm the intended kernel.
grids = [
    Grid(
        model=LogisticRegression,
        param_grid={'estimator__solver': ('liblinear',)},
    ),
    Grid(
        model=BaggingClassifier,
        param_grid={
            'estimator__n_estimators': (10, 200, 400, 800),
            'estimator__max_samples': (0.2, 0.4, 0.8, 1.0),
            'estimator__max_features': (0.2, 0.4, 0.8, 1.0),
        },
    ),
    Grid(
        model=RandomForestClassifier,
        param_grid={
            'estimator__max_depth': (75, 100, None),
            'estimator__max_features': ('auto', 'log2', None),
            'estimator__n_estimators': (10, 200, 400, 600, 800),
        },
    ),
    Grid(
        model=GradientBoostingClassifier,
        param_grid={
            'estimator__max_depth': (3, 4, 5),
            'estimator__max_features': ('auto', 'log2', None),
            'estimator__n_estimators': (10, 100, 200, 400, 800),
        },
    ),
    Grid(
        model=SVC,
        param_grid={
            "estimator__C": (4, 8, 12),
            "estimator__degree": (3, 4, 5),
        },
    ),
]

# Notes
- Once I receive the data, offset the target column by one year so that each year's row predicts the next year's incidents
- Drop all polygon data & assume we have sufficient features to predict the next year
- Each row represents the status of a given polygon & the incidents that year

In [11]:
def get_next_year(year, region, df, year_col=None, region_col=None, target=None):
    """Return the target value recorded for ``region`` in the year after ``year``.

    Parameters
    ----------
    year : int
        Reference year; the row for ``year + 1`` is looked up.
    region : object
        Value compared (with ``==``) against the region column.
    df : pandas.DataFrame
        Frame holding the year, region and target columns.
    year_col, region_col, target : str, optional
        Column names to use. Each defaults to the module-level ``YEAR_COL``,
        ``REGION_COL`` and ``TARGET`` constants respectively, which are only
        read at call time when the corresponding argument is omitted.

    Returns
    -------
    The target value of the first matching row, or ``float('nan')`` when
    ``df`` has no row for ``region`` in ``year + 1``.
    """
    year_col = YEAR_COL if year_col is None else year_col
    region_col = REGION_COL if region_col is None else region_col
    target = TARGET if target is None else target

    is_next_year = df[year_col] == year + 1
    is_region = df[region_col] == region
    following = df.loc[is_next_year & is_region, target]

    # An empty selection covers both "year + 1 is absent from the data" and
    # "this particular region has no row for year + 1"; indexing [0] on the
    # latter would raise IndexError.
    if following.empty:
        return float('nan')
    return following.values[0]

In [156]:
# Load the cleaned yearly data and reshape it so that every row pairs the
# features of one year with the target of the FOLLOWING year for the same
# polygon.

# NOTE(review): absolute, machine-specific path -- point DATA_PATH at your own
# copy of all_data_clean.csv (ideally relative to the repository root).
DATA_PATH = r"C:\Users\olive\Documents\GitHub\MontrealFireSafetyProject\data\all_data_clean.csv"

TARGET = 'incident'
YEAR_COL = 'Year'
REGION_COL = 'Poly_Key'
NEXT_YEAR_COL = 'Next Year'

df = pd.read_csv(DATA_PATH, index_col=0)

df[REGION_COL] = df[REGION_COL].astype('str')
df[NEXT_YEAR_COL] = (df[YEAR_COL] + 1).astype('int64')

left = df.copy()
right = df.copy()

# Join keys built with vectorised string concatenation (a row-wise apply is
# very slow on a frame this size). `left` is keyed by its own year, `right`
# by the year AFTER its own, so a left row of year Y meets the right row of
# year Y-1 for the same polygon.
left['idx'] = left[REGION_COL] + '_' + left[YEAR_COL].astype(str)
right['idx'] = right[REGION_COL] + '_' + right[NEXT_YEAR_COL].astype(str)

left = left.set_index(['idx'], drop=True)
right = right.set_index(['idx'], drop=True)

# After the join, '_x' columns come from the later year and '_y' columns from
# the preceding year. Keep the later year's key / Building_fire / target and
# every preceding-year column except those same three.
# NOTE(review): Building_fire is therefore taken from the same year as the
# target; it is not in the numerical feature list used for preprocessing --
# confirm it is never meant to be used as a predictor.
tot = left.join(right, lsuffix='_x', rsuffix='_y', how='inner')
prev_year_cols = [col for col in tot.columns if col.endswith('_y')]
tot = tot[[REGION_COL + '_x', 'Building_fire_x', TARGET + '_x', *prev_year_cols]]
tot = tot.drop([REGION_COL + '_y', 'Building_fire_y', TARGET + '_y'], axis=1)

# NOTE(review): str.replace removes EVERY '_x' / '_y' occurrence, not only the
# trailing join suffix, which is presumably why X ends up with names such as
# 'gr_0_to_14ears'. Left unchanged on purpose: the feature list used for
# preprocessing is spelled with these resulting names.
tot.columns = tot.columns.str.replace('_x', '')
tot.columns = tot.columns.str.replace('_y', '')
tot = tot.drop([NEXT_YEAR_COL], axis=1)
tot = tot.set_index([REGION_COL, YEAR_COL], drop=True)

# Polygon key and year are dropped here, so they are not available as features.
df = tot.reset_index(drop=True).copy()
X, y = df.drop([TARGET], axis=1), df[TARGET]

# Binarise the target: 1 if the value is strictly positive, else 0.
y = (y > 0).astype('int64')

In [189]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the split --
# and every score computed from it -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [190]:
# Peek at the first rows of the training split to sanity-check the feature columns.
X_train.head()

Unnamed: 0,Building_fire,CFSAUID,Desc_gr_clean_Alarmes-incendies,Desc_gr_clean_Autres incendies,Desc_gr_clean_Fausses alertes/annulations,Desc_gr_clean_Nouveau,Desc_gr_clean_Premier répondant,Desc_gr_clean_Sans incendie,Num_units_total,Num_units_mean,...,gr_0_to_14ears,gr_15_to_29_ years,gr_30_to_64ears,gr_65ears_+,gr_married_commomlawpartner,income_per_capita,Income_0_to_35K,Income_35K_to_70K,Income_70K+,population tax payers
310975,0.0,H4R,2,0,0,0,4.0,0,13.0,2.166667,...,0.2,0.16,0.49,0.15,0.48,58188.244683,0.51,0.24,0.25,20925
65266,0.0,H8Z,0,0,0,0,0.0,0,0.0,0.0,...,0.17,0.2,0.48,0.15,0.48,39166.499287,0.56,0.29,0.15,11925
310292,0.0,H4R,0,2,0,0,3.0,1,24.0,3.428571,...,0.2,0.16,0.49,0.15,0.48,58188.244683,0.51,0.24,0.25,20925
159357,1.0,H2X,0,0,0,0,4.0,0,5.0,1.25,...,0.05,0.39,0.4,0.16,0.29,35957.843018,0.68,0.19,0.14,15110
29078,0.0,H1H,0,0,0,0,3.0,0,4.0,1.333333,...,0.17,0.17,0.46,0.2,0.38,26794.224829,0.73,0.22,0.05,27790


In [191]:
# Columns fed to the numeric branch of the preprocessing step, spelled exactly
# as they appear in X (including names such as 'gr_0_to_14ears').
# The order is kept as-is because it fixes the column order after preprocessing.
numerical_features = [
    # presumably per-category incident counts -- TODO confirm
    'Desc_gr_clean_Alarmes-incendies',
    'Desc_gr_clean_Autres incendies',
    'Desc_gr_clean_Fausses alertes/annulations',
    'Desc_gr_clean_Nouveau',
    'Desc_gr_clean_Premier répondant',
    'Desc_gr_clean_Sans incendie',
    # unit statistics
    'Num_units_total',
    'Num_units_mean',
    'Num_units_median',
    # population / dwellings / age groups
    'population',
    'total_dwellings',
    'gr_0_to_14ears',
    'gr_15_to_29_ years',
    'gr_30_to_64ears',
    'gr_65ears_+',
    'gr_married_commomlawpartner',
    # income
    'income_per_capita',
    'Income_0_to_35K',
    'Income_35K_to_70K',
    'Income_70K+',
    'population tax payers',
]

# No categorical columns are currently selected.
categorical_features = []

In [192]:
# Categorical branch: dense one-hot encoding that drops the first level of
# each feature and raises on categories not seen during fit.
categorical_pipeline = Pipeline(
    steps=[
        (
            'one_hot_encoder',
            OneHotEncoder(sparse=False, drop='first', handle_unknown='error'),
        ),
    ]
)

In [193]:
# Numeric branch: a single min-max scaling step with default settings.
numerical_pipeline = Pipeline(
    steps=[
        ('min_max_scalar', MinMaxScaler()),
    ]
)

In [194]:
# Route each column list through its own branch. Columns named in neither
# list are dropped (the fitted repr reports remainder='drop').
prep_pipeline = ColumnTransformer(
    transformers=[
        ("cat", categorical_pipeline, categorical_features),
        ("num", numerical_pipeline, numerical_features),
    ]
)

In [195]:
# End-to-end model: preprocessing -> tree-based feature selection -> classifier.
# The final step is named 'estimator', which is the prefix used by the keys of
# the parameter grids ('estimator__...').
full_pipeline = Pipeline(steps=[
    ('prep', prep_pipeline),
    # Keep features whose importance in a decision tree is at least half the
    # mean importance. The tree is seeded so that the selected feature set
    # does not change from one run to the next.
    ('feature_engineering',
     SelectFromModel(estimator=DecisionTreeClassifier(random_state=42),
                     threshold='0.5*mean')),
    ('estimator', LogisticRegression()),
])

In [196]:
# Fit preprocessing, feature selection and the classifier on the training split only.
full_pipeline.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('prep',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  Pipeline(memory=None,
                                                           steps=[('one_hot_encoder',
                                                                   OneHotEncoder(categorical_features=None,
                                                                                 categories=None,
                                                                                 drop='first',
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='error',
                                    

In [197]:
# Predict class labels for the held-out rows with the fitted pipeline.
y_pred = full_pipeline.predict(X_test)

In [198]:
# F1 score of the predictions on the held-out split (called with f1_score's defaults).
f1_score(y_test, y_pred)

0.8338442673206622

In [202]:
from sklearn.metrics import accuracy_score

In [201]:
# Share of positive labels in the held-out split, i.e. the accuracy obtained
# by always predicting 1 -- the baseline the model's accuracy should beat.
int(y_test.eq(1).sum()) / len(y_test)

0.6570254195552525

In [203]:
# Plain accuracy on the held-out split; read it against the share of positive
# labels in y_test (the score of always predicting 1).
accuracy_score(y_test, y_pred)

0.781840739477971