In [1]:
import yaml
import json
import pandas as pd

from transformers import *

from load_data import load_dataset
# Load the notebook configuration from config.yaml into `config`.
# The `with` block closes the file on exit, so no explicit close() call is needed.
with open("config.yaml", 'r') as stream:
    config = yaml.safe_load(stream)

In [2]:
# Display the parsed configuration to confirm what was loaded.
config

{'package_name': 'mci_model',
 'training_data_file': 'train.csv',
 'test_data_file': 'test.csv',
 'pipeline_name': 'mci_model',
 'pipeline_save_file': 'mci_model_output_v',
 'duplicate_record_key': 'event_unique_id',
 'datetime_features': {'date': 'occurrencedate',
  'day_of_year': 'occurrencedayofyear',
  'month': 'occurrencemonth',
  'day_of_week': 'occurrencedayofweek',
  'day_of_month': 'occurrenceday',
  'hour': 'occurrencehour'},
 'features_na_not_allowed': ['Lat', 'Long', 'occurrencedate'],
 'NSA_features': ['Division', 'Hood_ID', 'Neighbourhood'],
 'engineered_features': {'season': 'Season',
  'ToD': 'ToDCrimeLevel',
  'holiday': 'Holiday',
  'weekday': 'Weekday',
  'pub': 'Pub_Id',
  'park': 'Park_Id',
  'police_station': 'PS_Id'},
 'targets': ['Lat', 'Long'],
 'train_features': ['occurrencedate',
  'occurrencehour',
  'premises_type',
  'occurrencemonth',
  'occurrencedayofweek',
  'MCI',
  'Neighbourhood',
  'occurrenceday',
  'occurrencedayofyear'],
 'inference_features_to_

In [3]:
# Load '../test.csv' into `data`; the transformer checks below operate on this frame.
# NOTE(review): the saved output of this cell is a FileNotFoundError, yet later cells use
# `data` successfully. The path is relative to the working directory - confirm the notebook
# is started from a directory where '../test.csv' exists.
data = load_dataset(file_name='../test.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../test.csv'

#### Date Time Imputer 

In [4]:
# Mapping from logical date-part names ('date', 'month', 'hour', ...) to the dataset's
# column names, taken from the config. Displayed for reference.
date_features = config.get('datetime_features')
date_features

{'date': 'occurrencedate',
 'day_of_year': 'occurrencedayofyear',
 'month': 'occurrencemonth',
 'day_of_week': 'occurrencedayofweek',
 'day_of_month': 'occurrenceday',
 'hour': 'occurrencehour'}

In [5]:
# Number of rows in the loaded data.
len(data)

30000

In [6]:
# Check DateTimeImputer on one sample row: print the calendar fields as loaded,
# run the imputer, then show the same fields afterwards.
imputer = DateTimeImputer(date_features.get('date'))

# Columns inspected before and after the transform. They are defined once so that
# both views are guaranteed to look at exactly the same fields.
calendar_cols = [
    date_features.get('day_of_year'),
    date_features.get('month'),
    date_features.get('day_of_week'),
    date_features.get('day_of_month'),
]
sample_row = 10  # positional row used for the spot check

print(data[calendar_cols].iloc[sample_row].to_list())

test_object = imputer.fit_transform(data)

test_object[calendar_cols].iloc[sample_row].to_list()

[14, 'January', 'Saturday  ', 14]


[14, 1, 5, 14]

In [27]:
# Display the current contents of `data`.
# NOTE(review): the saved output shows row labels up to 210990 and un-imputed month names,
# while In [5] reports 30000 rows - this cell was evidently run out of order against a
# different `data`. Re-run the notebook top to bottom to refresh it.
data

Unnamed: 0,occurrencedate,occurrencehour,premises_type,occurrencemonth,occurrencedayofweek,MCI,Neighbourhood,occurrenceday,occurrencedayofyear,Pub_Id,Park_Id,PS_Id,Lat,Long
0,2014/03/02 05:00:00+00,8,House,March,Sunday,Assault,West Humber-Clairville,2,61,0,0,0,43.734013,-79.590332
1,2013/12/24 05:00:00+00,22,Commercial,December,Tuesday,Robbery,West Humber-Clairville,24,358,0,0,0,43.731834,-79.600701
2,2013/01/05 05:00:00+00,4,Commercial,January,Saturday,Assault,West Humber-Clairville,5,5,0,0,0,43.686423,-79.600794
3,2013/12/31 05:00:00+00,21,Commercial,December,Tuesday,Break and Enter,West Humber-Clairville,31,365,0,0,0,43.743642,-79.603876
4,2014/01/08 05:00:00+00,2,Commercial,January,Wednesday,Theft Over,West Humber-Clairville,8,8,0,0,0,43.678342,-79.584931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210987,2021/07/26 04:00:00+00,2,House,July,Monday,Auto Theft,Guildwood,26,207,0,0,0,43.744706,-79.203014
210988,2021/07/28 04:00:00+00,20,Outside,July,Wednesday,Auto Theft,Guildwood,28,209,0,0,0,43.744342,-79.197893
210989,2021/09/27 04:00:00+00,21,House,September,Monday,Auto Theft,Guildwood,27,270,0,0,0,43.756244,-79.178803
210990,2021/11/03 04:00:00+00,16,House,November,Wednesday,Auto Theft,Guildwood,3,307,0,0,0,43.744706,-79.203014


#### Feature Transformation

In [50]:
# Mapping from logical engineered-feature names ('season', 'ToD', 'holiday', ...) to the
# column names used in the data, taken from the config. Displayed for reference.
eng_features = config.get('engineered_features')
eng_features

{'season': 'Season',
 'ToD': 'ToDCrimeLevel',
 'holiday': 'Holiday',
 'weekday': 'Weekday',
 'pub': 'Pub_Id',
 'park': 'Park_Id',
 'police_station': 'PS_Id'}

Season Transformer

In [8]:
# Shallow copy of the date-column mapping; the checks below read column names from it.
dt_features = date_features.copy()

In [19]:
# Check SeasonTransformer on one sample row: print its day-of-year, then the season
# assigned to that row.
transformer = SeasonTransformer(config.get('seasons'))
# NOTE(review): a fresh DateTimeImputer is used with transform() only (no fit), and the
# result overwrites `data`. Re-running this cell therefore feeds already-transformed data
# back into the imputer - confirm the imputer is safe to apply repeatedly.
imputer = DateTimeImputer(date_features.get('date'))
data = imputer.transform(data)

val = 200  # positional row used for the spot check
print(data[dt_features.get('day_of_year')].iat[val])

test_object = transformer.fit_transform(data)


print(test_object[eng_features.get('season')].iat[val])

161
Spring


Weekday/Weekend Transformer

In [11]:
# Check WeekdayTransformer on one sample row: print its day-of-week value, then the
# weekday/weekend label assigned to that row.
# NOTE(review): `data` is passed through a new, unfitted DateTimeImputer again and
# overwritten, as in the season check - confirm repeated application is harmless.
imputer = DateTimeImputer(date_features.get('date'))
data = imputer.transform(data)

transformer = WeekdayTransformer()

day = 157  # positional row used for the spot check (not a day number)

print(data[dt_features.get('day_of_week')].iat[day])
test_object = transformer.fit_transform(data)
print(test_object[eng_features.get('weekday')].iat[day])

1
weekday


ToD Transformer

In [51]:
# Show the configured 'levels' mapping (level name -> list of values) passed to ToDTransformer below.
config.get('levels')

{'high': [0, 12, 18, 21, 20, 22, 19, 23, 17, 15],
 'med': [16, 14, 1, 2, 13, 11, 10, 9, 3, 8],
 'low': [4, 7, 5, 6]}

In [33]:
# Check ToDTransformer on one sample row: print its hour, then the crime level
# assigned to that row.
transformer = ToDTransformer(config.get('levels'))
val = 114  # positional row used for the spot check
# Other rows that appear to have been used as spot checks, written as
# "row position: hour, expected level":
# 0: 8, 'med'
# 63: 23, 'high'
# 152: 6, 'low'
print(data[dt_features.get('hour')].iat[val])
test_object = transformer.fit_transform(data)

test_object[eng_features.get('ToD')].iat[val]

7


'low'

In [44]:
# List the distinct hour values present after the transforms.
# NOTE(review): the saved output contains only 4 and 5, which matches the hour-of-day in the
# `occurrencedate` timestamps shown earlier (...04:00:00+00 / ...05:00:00+00) rather than the
# reported occurrence hours. Presumably the hour column was overwritten from the date column
# in the run that produced this output - confirm this is intended.
test_object['occurrencehour'].unique()

array([5, 4], dtype=int64)

Holiday Transformer

In [36]:
# Collect the index labels of all rows whose day-of-year is 359; the next cell uses them.
idx = data[data[dt_features.get('day_of_year')] == 359].index
# TODO(review): an unfinished, commented-out lookup (`data[data[dt_features.get('Holiday')`) was
# left here. Note that 'Holiday' is not a key of dt_features; the holiday column name is stored
# under eng_features['holiday'].

In [37]:
# Re-select the rows collected in `idx` and compare their day-of-year with 359.
# NOTE(review): `idx` was built from exactly this condition, so the result is True by
# construction and does not exercise the holiday logic.
data[data.index.isin(idx)][dt_features.get('day_of_year')] == 359

662      True
996      True
2075     True
2156     True
2421     True
         ... 
29189    True
29363    True
29426    True
29901    True
29960    True
Name: occurrencedayofyear, Length: 77, dtype: bool

In [18]:
# Check HolidayTransformer: every row on the probed day-of-year must be labelled 'non-holiday'.
transformer = HolidayTransformer(config.get('holidays'))

test_object = transformer.fit_transform(data)

# Then
# Column names come from the config mappings, as in the other cells, instead of being
# hard-coded, so the check keeps working if the dataset columns are renamed in the config.
non_holiday_day = 350
day_col = dt_features.get('day_of_year')
holiday_col = eng_features.get('holiday')
probe = test_object.loc[test_object[day_col] == non_holiday_day, holiday_col]

# all() over an empty selection is vacuously true, so require at least one row first.
assert not probe.empty, f"no rows with day-of-year {non_holiday_day} to check"
assert (probe == 'non-holiday').all()

Added Feature Transformer

In [3]:
# Names of the target columns, taken from the config.
targets = config.get('targets')

In [38]:
# Column names of the pub / park / police-station features, taken from the config mapping.
added_feature_names = [eng_features.get('pub'),
    eng_features.get('park'),
    eng_features.get('police_station')]

# Load "<name>.json" for each feature. Each file is opened in a `with` block so its
# handle is closed right after parsing instead of being left open until garbage collection.
added_features = []
for name in added_feature_names:
    with open(name+".json") as feature_file:
        added_features.append(json.load(feature_file))

In [39]:
# Run NewFeatureTransformer with the loaded feature definitions. The two target columns
# are passed as `y` alongside the full frame.
transformer = NewFeatureTransformer(added_features, added_feature_names)
test_object = transformer.fit_transform(X=data, y= data[[targets[0], targets[1]]])

In [50]:
# Row count after the transform, for comparison with len(data).
len(test_object)

30000

In [49]:
# Number of distinct PS_Id values after the transform.
len(test_object.PS_Id.unique())

23

In [41]:
# Values of the three added feature columns for the first row.
list(test_object[added_feature_names].iloc[0])

[162, 61, 18]

### Test prediction

In [12]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from typing import List
from pydantic import BaseModel

import transformers as t
# Column names of the pub / park / police-station features, taken from the config mapping.
added_feature_names = [eng_features.get('pub'),
    eng_features.get('park'),
    eng_features.get('police_station')]

# Load "<name>.json" for each feature. Each file is opened in a `with` block so its
# handle is closed right after parsing instead of being left open until garbage collection.
added_features = []
for name in added_feature_names:
    with open(name+".json") as feature_file:
        added_features.append(json.load(feature_file))

hyperparams = config.get('hyperparams')

# Full pipeline: the project transformers add the engineered features, then the frame is
# one-hot encoded, reduced with PCA and fed to a linear regression.
#
# NOTE(review): `LinearRegression(normalize=...)` was deprecated in scikit-learn 1.0 and removed
# in 1.2, and `OneHotEncoder(sparse=...)` was renamed to `sparse_output` in 1.2 (old name removed
# in 1.4). As written this cell needs scikit-learn < 1.2 - pin the version or update the arguments.
# NOTE(review): bool() of any non-empty string is True, so hyperparams['estimator']['normalize']
# must be a real YAML boolean, not a quoted string - verify in config.yaml.
mci_pipeline = Pipeline(
    [
        ("impute_date_time", t.DateTimeImputer(config['datetime_features']['date'])),
        ("add_weekday/weekend", t.WeekdayTransformer()),
        ("add_ToD_crime_level", t.ToDTransformer(levels=config['levels'])),
        ("add_seasons", t.SeasonTransformer(seasons=config['seasons'])),
        ("add_holidays", t.HolidayTransformer(holidays=config['holidays'])),
        # A later cell removes this step from the reloaded pipeline before predicting.
        ("add_new_features", t.NewFeatureTransformer(added_features, added_feature_names)),
        ("OHE", OneHotEncoder(sparse=False)),
        ("PCA", PCA(n_components=hyperparams['pca']['n_components'], svd_solver= str(hyperparams['pca']['svd_solver']))),
        ("linear_regression", LinearRegression(normalize=bool(hyperparams['estimator']['normalize']))),
    ]
)


In [13]:
# Load '../train.csv' into `train` (path is relative to the working directory).
train = load_dataset(file_name='../train.csv')

Index(['event_unique_id'], dtype='object')


In [16]:
# Fit the pipeline. Inputs are the configured train features plus the configured
# 'inference_features_to_add' columns; targets are the configured target columns.
# fit() is the last expression of the cell, so the fitted pipeline's repr is displayed.
mci_pipeline.fit(train[config.get('train_features')+config.get('inference_features_to_add')], 
                 train[config.get('targets')])



Pipeline(steps=[('impute_date_time',
                 DateTimeImputer(date_col='occurrencedate')),
                ('add_weekday/weekend', WeekdayTransformer()),
                ('add_ToD_crime_level',
                 ToDTransformer(levels={'high': [0, 12, 18, 21, 20, 22, 19, 23,
                                                 17, 15],
                                        'low': [4, 7, 5, 6],
                                        'med': [16, 14, 1, 2, 13, 11, 10, 9, 3,
                                                8]})),
                ('add_seasons',
                 SeasonTransformer(seasons={'fall': [265, 353, 353],
                                            'spring': [78, 170],
                                            'summer'...
                                                              -79.41790379999999,
                                                              -79.42569999999999,
                                                              -79.362570699999

In [17]:
import joblib
# Persist the pipeline to 'trained_pipeline.pkl' in the current working directory.
joblib.dump(mci_pipeline, 'trained_pipeline.pkl')

['trained_pipeline.pkl']

In [18]:
# Load '../test.csv' into `test` (path is relative to the working directory).
test = load_dataset(file_name='../test.csv')

Index(['event_unique_id'], dtype='object')


In [33]:
import joblib

# Reload the pipeline persisted by the training cell.
# NOTE: joblib.load unpickles the file and can therefore execute arbitrary code -
# only load artifacts produced by a trusted run.
loaded_pipeline = joblib.load('trained_pipeline.pkl')

# Drop the "add_new_features" step before predicting. It is looked up by name rather than
# by a hard-coded position, so this keeps working if steps are added or reordered, and it
# raises ValueError instead of silently removing the wrong step if the name is missing.
# NOTE(review): presumably the step is removed because that transformer is fitted with the
# target coordinates as `y`, which are not available at inference time - confirm.
step_names = [step[0] for step in loaded_pipeline.steps]
loaded_pipeline.steps.pop(step_names.index("add_new_features"));

In [34]:
# Number of feature columns passed to the pipeline (train features + inference features to add).
len(config.get('train_features')+config.get('inference_features_to_add'))

12

In [36]:
# Predict on the test set with the reloaded pipeline, using the same feature-column
# selection that was used for fitting.
preds = loaded_pipeline.predict(test[config.get('train_features')+config.get('inference_features_to_add')])

In [37]:
# Row counts of the training set, the test set and the predictions.
len(train), len(test), len(preds)

(210992, 30000, 30000)

In [38]:
# First prediction; presumably ordered like the configured targets (Lat, Long) - confirm.
preds[0]

array([ 43.7274248 , -79.37372841])

In [39]:
# First test row, shown for comparison with preds[0].
test.iloc[0]

occurrencedate         2017/02/28 05:00:00+00
occurrencehour                             11
premises_type                           House
occurrencemonth                      February
occurrencedayofweek                Tuesday   
MCI                           Break and Enter
Neighbourhood                  Highland Creek
occurrenceday                              28
occurrencedayofyear                        59
Pub_Id                                      0
Park_Id                                     0
PS_Id                                       0
Lat                                   43.7905
Long                               -79.181186
Name: 0, dtype: object