In [9]:
import yaml
import json
import pandas as pd

from manual_tests.transformers import *
from manual_tests.load_data import load_dataset
with open("config.yaml", 'r') as stream:
    config = yaml.safe_load(stream)
    stream.close()

In [10]:
data = load_dataset(file_name='Major_Crime_Indicators.csv', training = True)

  data = pd.read_csv(file_name)


Dropping NSA in:  Division
NSA index:  Int64Index([ 71282,  81184,  86231, 140139, 141629, 143279, 145559, 193028,
            193264, 199491,
            ...
            281672, 281674, 281675, 281678, 281684, 281685, 281686, 281687,
            281688, 281689],
           dtype='int64', length=829)
Dropping NSA in:  Hood_ID
NSA index:  Int64Index([277017, 277024, 277030, 277089, 277096, 277099, 277105, 277109,
            277113, 277115,
            ...
            281482, 281483, 281521, 281525, 281528, 281534, 281544, 281555,
            281583, 281673],
           dtype='int64', length=318)
Dropping NSA in:  Neighbourhood
NSA index:  Int64Index([], dtype='int64')
dropping invalid complete


In [11]:
dt_features = config.get('datetime_features')
eng_features = config.get('engineered_features')
inference_features_to_add = config.get('inference_features_to_add')
features = config.get('train_features')
targets = config.get('targets')

eng_features = config.get('engineered_features')
added_feature_names = [eng_features.get('pub'), eng_features.get('park'), eng_features.get('police_station')]

added_features = [json.load(open(f"manual_tests/{name}.json"))
                            for name in added_feature_names]

hyperparams = config.get('hyperparams')

In [12]:
sample = data.sample(30000)
sampled_idx = sample.index
data.drop(axis=0, index=sampled_idx, inplace=True)
assert len(data[data.index.isin(sampled_idx)]) == 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(axis=0, index=sampled_idx, inplace=True)


In [14]:
len(sample), len(data)

(30000, 210992)

In [15]:
feature_adder = NewFeatureTransformer(added_features, added_feature_names)
sample = feature_adder.fit_transform(X=sample, y=sample[targets])

In [16]:
len(sample)

30000

In [17]:
len(data)

210992

In [18]:
data.to_csv('train.csv', index=False)
sample.to_csv('test.csv', index=False)

In [10]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

pipeline = Pipeline(
    [
        ("impute_date_time", DateTimeImputer(dt_features.get('date'))),
        ("add_weekday/weekend", WeekdayTransformer()),
        ("add_ToD_crime_level", ToDTransformer(levels=config.get('levels'))),
        ("add_seasons", SeasonTransformer(seasons=config.get('seasons'))),
        ("add_holidays", HolidayTransformer(holidays=config.get('holidays'))),
        ("add_new_features", NewFeatureTransformer(added_features, added_feature_names)),
        ("OHE", OneHotEncoder(sparse=False)),
        ("PCA", PCA(n_components=hyperparams.get('pca_n_components'), svd_solver=hyperparams.get('pca_svd_solver'))),
        ("linear_regression", LinearRegression(normalize=hyperparams.get('lr_normalize'))),
    ]
)

In [11]:
pipeline.steps

[('impute_date_time', DateTimeImputer(date_col='occurrencedate')),
 ('add_weekday/weekend', WeekdayTransformer()),
 ('add_ToD_crime_level',
  ToDTransformer(levels={'high': [0, 12, 18, 21, 20, 22, 19, 23, 17, 15],
                         'low': [4, 7, 5, 6],
                         'med': [16, 14, 1, 2, 13, 11, 10, 9, 3, 8]})),
 ('add_seasons',
  SeasonTransformer(seasons={'fall': [265, 353], 'spring': [78, 170],
                             'summer': [171, 264], 'winter': [354, 77]})),
 ('add_holidays',
  HolidayTransformer(holidays=[1, 46, 92, 144, 182, 249, 284, 359, 360])),
 ('add_new_features',
  NewFeatureTransformer(feature_names=['Pub_Id', 'Park_Id', 'PS_Id'],
                        features=[{'Pub_Id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                              11, 12, 13, 14, 15, 16, 17, 18, 19,
                                              20, 21, 22, 23, 24, 25, 26, 27, 28,
                                              29, ...],
                     

In [13]:
data.columns, sample.columns

(Index(['X', 'Y', 'Index_', 'event_unique_id', 'Division', 'occurrencedate',
        'reporteddate', 'location_type', 'premises_type', 'ucr_code', 'ucr_ext',
        'offence', 'reportedyear', 'reportedmonth', 'reportedday',
        'reporteddayofyear', 'reporteddayofweek', 'reportedhour',
        'occurrenceyear', 'occurrencemonth', 'occurrenceday',
        'occurrencedayofyear', 'occurrencedayofweek', 'occurrencehour', 'MCI',
        'Hood_ID', 'Neighbourhood', 'Long', 'Lat', 'ObjectId', 'Pub_Id',
        'Park_Id', 'PS_Id'],
       dtype='object'),
 Index(['X', 'Y', 'Index_', 'event_unique_id', 'Division', 'occurrencedate',
        'reporteddate', 'location_type', 'premises_type', 'ucr_code', 'ucr_ext',
        'offence', 'reportedyear', 'reportedmonth', 'reportedday',
        'reporteddayofyear', 'reporteddayofweek', 'reportedhour',
        'occurrenceyear', 'occurrencemonth', 'occurrenceday',
        'occurrencedayofyear', 'occurrencedayofweek', 'occurrencehour', 'MCI',
        'H

In [12]:
features + added_feature_names

['occurrencedate',
 'occurrencehour',
 'premises_type',
 'occurrencemonth',
 'occurrencedayofweek',
 'MCI',
 'Neighbourhood',
 'occurrenceday',
 'occurrencedayofyear',
 'Pub_Id',
 'Park_Id',
 'PS_Id']

In [14]:
import joblib
trained_pipeline = pipeline.fit(X=data[features+added_feature_names], y=data[targets])
joblib.dump(trained_pipeline, 'train_pipeline')



In [17]:
import copy
inference_pipeline = copy.deepcopy(trained_pipeline)

In [22]:
# inference_pipeline.steps
# inference_pipeline.steps.pop(5)

[('impute_date_time', DateTimeImputer(date_col='occurrencedate')),
 ('add_weekday/weekend', WeekdayTransformer()),
 ('add_ToD_crime_level',
  ToDTransformer(levels={'high': [0, 12, 18, 21, 20, 22, 19, 23, 17, 15],
                         'low': [4, 7, 5, 6],
                         'med': [16, 14, 1, 2, 13, 11, 10, 9, 3, 8]})),
 ('add_seasons',
  SeasonTransformer(seasons={'fall': [265, 353], 'spring': [78, 170],
                             'summer': [171, 264], 'winter': [354, 77]})),
 ('add_holidays',
  HolidayTransformer(holidays=[1, 46, 92, 144, 182, 249, 284, 359, 360])),
 ('OHE', OneHotEncoder(sparse=False)),
 ('PCA', PCA(n_components=0.95, svd_solver='full')),
 ('linear_regression', LinearRegression(normalize=False))]

In [28]:
# inference_pipeline.score(sample[features+added_feature_names])

In [30]:
preds = inference_pipeline.predict(sample[features+inference_features_to_add])

In [51]:
from sklearn.metrics import mean_squared_error
mean_squared_error(sample[targets].values*111000, preds*111000)**0.5

596.0503640163066

In [44]:
print((min(preds[:, 0]), max(preds[:, 0]), min(preds[:, 1]), max(preds[:, 1])))


(43.58628367393166, 43.83204760291888, -79.61490377957769, -79.138369358389)


#### Date Time Imputer 

In [11]:
imputer = DateTimeImputer(date_features.get('date'))

print(
data[[
        date_features.get('day_of_year'),
        date_features.get('month'),
        date_features.get('day_of_week'),
        date_features.get('day_of_month'),
        date_features.get('hour')
        ]].iloc[10].to_list()
     )

test_object = imputer.fit_transform(data)

test_object[[
        date_features.get('day_of_year'),
        date_features.get('month'),
        date_features.get('day_of_week'),
        date_features.get('day_of_month'),
        date_features.get('hour')
        ]].iloc[10].to_list()

[12, 'January', 'Sunday    ', 12, 10]


[12, 1, 6, 12, 5]

#### Feature Transformation

Season Transformer

In [12]:
transformer = SeasonTransformer(config.get('seasons'))
print(data[dt_features.get('day_of_year')].iat[22])

test_object = transformer.fit_transform(data)


print(test_object[eng_features.get('season')].iat[22])

17
Winter


In [13]:
transformer = WeekdayTransformer()

print(data[dt_features.get('day_of_week')].iat[410].strip())
test_object = transformer.fit_transform(data)
print(test_object[eng_features.get('weekday')].iat[410])

Tuesday
weekday


ToD Transformer

In [14]:
transformer = ToDTransformer(config.get('levels'))

# 0: 8, 'med'
# 63: 23, 'high'
# 152: 6, 'low'
print(data[dt_features.get('hour')].iat[152])
test_object = transformer.fit_transform(data)

test_object[eng_features.get('ToD')].iat[152]

6


'low'

Holiday Transformer

In [15]:
transformer = HolidayTransformer(config.get('holidays'))

id = data[data[dt_features.get('day_of_year')] == 359].iloc[0].event_unique_id
test_object = transformer.fit_transform(data)

# Then
print(test_object[test_object['event_unique_id'] == id][eng_features.get('holiday')] == 'holiday')

1301    True
1324    True
1406    True
1408    True
1409    True
1699    True
Name: Holiday, dtype: bool


Added Feature Transformer

In [18]:
added_feature_names = [eng_features.get('pub'),
    eng_features.get('park'),
    eng_features.get('police_station')]

added_features = [json.load(open(f'manual_tests/{name}.json'))
                            for name in added_feature_names]

In [17]:
transformer = NewFeatureTransformer(added_features, added_feature_names)
test_object = transformer.transform(X=data[:10000], y= data[[targets[0], targets[1]]][:10000])

NameError: name 'added_features' is not defined