# Business and data understanding

## Purpose
This notebook contains the data preparation phase for the .

## Tasks
- [ ] Create a baseline model.
- [ ] Submit the test result.

# Setup

## Library import

In [2]:
from datetime import datetime 
import json
import os
from pathlib import Path

from feature_engine.encoding import CountFrequencyEncoder, RareLabelEncoder, OneHotEncoder
import folium
import humps
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, log_loss
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as ply

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

# Options for pandas and plotly
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 200
plotly.offline.init_notebook_mode(connected=True)

## Local library import

In [3]:
if Path.cwd().name == 'notebooks':
    os.chdir('../')

from src.utils.data_describe import serie_nulos, cardinalidade
from src.data.data_preprocessing import (
    TransformCordinates, address_split, create_date_based_columns, snake_case_columns,
    create_simplified_address_column, one_hot_encoding_target
)

if Path.cwd().name == 'sf_crime':
    os.chdir('./notebooks/')

## Parameter definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.

In [4]:
RAW_DATA = '../data/raw/' 
EXTERNAL_DATA = '../data/external/' 
INTERIM_DATA = '../data/interim/' 
PROCESSED_DATA = '../data/processed/'
REFERENCES = '../references/'
RANDOM_STATE = 42

lst_columns = [
        'ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
        'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING',
        'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON',
        'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY',
        'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY',
        'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS'
    ]

## Data import
We retrieve all the required data for the analysis.

In [4]:
try:
    X_train = pd.read_parquet(INTERIM_DATA + 'X_train_transformed.pqt')
    X_validation = pd.read_parquet(INTERIM_DATA + 'X_validation_transformed.pqt')
    X_kaggle_test = pd.read_parquet(INTERIM_DATA + 'X_kaggle_test_transformed_transformed.pqt')
    y_train_ohe = pd.read_parquet(INTERIM_DATA + 'y_train_ohe.pqt')
    y_validation_ohe = pd.read_parquet(INTERIM_DATA + 'y_validation_ohe.pqt')
    y_train_ordinal = pd.read_parquet(INTERIM_DATA + 'y_train_ordinal.pqt')
    y_validation_ordinal = pd.read_parquet(INTERIM_DATA + 'y_validation_ordinal.pqt')
    print('Parquet files loaded.')

except FileNotFoundError as e:
    print('Files were not found.')
    
print(f"""
X_train: {X_train.shape}
X_validation: {X_validation.shape}
X_kaggle_test: {X_kaggle_test.shape}

y_train_ohe: {y_train_ohe.shape}
y_validation_ohe: {y_validation_ohe.shape}

y_train_ordinal: {y_train_ordinal.shape}
y_validation_ordinal: {y_validation_ordinal.shape}
""")

X_train.tail(3)

Parquet files loaded.

X_train: (790244, 10)
X_validation: (87805, 10)
X_kaggle_test: (884262, 10)

y_train_ohe: (790244, 39)
y_validation_ohe: (87805, 39)

y_train_ordinal: (790244, 1)
y_validation_ordinal: (87805, 1)



Unnamed: 0,dates_year,dates_month,dates_hour,dates_day,is_daytime,day_of_week,pd_district,x,y,simplified_address
131932,2013,8,22,10,0,0.144286,0.136463,-122.426956,37.769247,0.039924
671155,2005,11,5,4,0,0.15231,0.102038,-122.386942,37.754168,0.776874
121958,2013,9,12,30,1,0.138624,0.17908,-122.408068,37.783992,0.039924


# 1st experiment: Multinomial Naive Bayes (baseline)

- Naive bayes. One model for each crime category.

In [5]:
X_train_mod = X_train.copy()
X_train_mod['x'] = X_train_mod['x'].abs()

model = MultinomialNB()
model.fit(X_train_mod, y_train_ordinal.values[:, 0])

y_pred = model.predict(X_validation)

In [6]:
# print(log_loss(y_validation_ordinal, y_pred, labels=np.arange(0,38).tolist()))

print(classification_report(y_validation_ordinal, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       168
           1       0.11      0.03      0.05      7632
           2       0.00      0.00      0.00        41
           3       0.00      0.00      0.00        28
           4       0.00      0.00      0.00      3762
           5       0.00      0.00      0.00       409
           6       0.00      0.00      0.00       212
           7       0.00      0.00      0.00      5437
           8       0.00      0.00      0.00       421
           9       0.00      0.00      0.00        91
          10       0.00      0.00      0.00        29
          11       0.00      0.00      0.00        44
          12       0.00      0.00      0.00      1043
          13       0.00      0.00      0.00      1621
          14       0.00      0.00      0.00        14
          15       0.00      0.00      0.00       232
          16       0.21      0.84      0.34     17455
          17       0.00    

## Kaggle's test dataset

In [7]:
# Predicting the probabilities from X_kaggle_test.
y_kaggle_pred = model.predict_proba(X_kaggle_test).round(2)
y_kaggle_pred = pd.DataFrame(y_kaggle_pred, columns=lst_columns).reset_index().rename(columns={'index': 'Id'})
y_kaggle_pred.head(3)

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.0,0.04,0.0,0.0,0.02,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.36,0.0,0.0,0.02,0.07,0.1,0.0,0.01,0.0,0.02,0.0,0.01,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.05,0.13,0.03,0.01
1,1,0.0,0.04,0.0,0.0,0.02,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.36,0.0,0.0,0.02,0.07,0.1,0.0,0.01,0.0,0.02,0.0,0.01,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.05,0.11,0.04,0.01
2,2,0.0,0.04,0.0,0.0,0.02,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.37,0.0,0.0,0.02,0.07,0.1,0.0,0.01,0.0,0.02,0.0,0.01,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.05,0.13,0.03,0.01


In [8]:
y_kaggle_pred.to_csv(PROCESSED_DATA + 'y_kaggle_pred_1st_prep_1st_model.zip', sep=',', index=False, compression='zip')

### Kaggle's score:

Score: 3.70134

# 2nd experiment: Random Forest with default parameters

In [9]:
model = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=2)
model.fit(X_train, y_train_ordinal.values[:, 0])

y_pred = model.predict(X_validation)

print(classification_report(y_validation_ordinal, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.10      0.04      0.05       168
           1       0.24      0.23      0.23      7632
           2       0.00      0.00      0.00        41
           3       0.00      0.00      0.00        28
           4       0.19      0.10      0.13      3762
           5       0.03      0.02      0.02       409
           6       0.04      0.02      0.03       212
           7       0.38      0.45      0.42      5437
           8       0.02      0.01      0.01       421
           9       0.00      0.00      0.00        91
          10       0.00      0.00      0.00        29
          11       0.03      0.02      0.02        44
          12       0.19      0.10      0.13      1043
          13       0.10      0.06      0.08      1621
          14       0.40      0.14      0.21        14
          15       0.04      0.03      0.04       232
          16       0.39      0.61      0.48     17455
          17       0.15    

## Kaggle's test dataset

In [10]:
# Predicting the probabilities from X_kaggle_test.
y_kaggle_pred = model.predict_proba(X_kaggle_test).round(2)
y_kaggle_pred = pd.DataFrame(y_kaggle_pred, columns=lst_columns).reset_index().rename(columns={'index': 'Id'})
y_kaggle_pred.head(3)

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.01,0.19,0.0,0.0,0.05,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.04,0.07,0.1,0.0,0.0,0.0,0.03,0.0,0.01,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.12,0.14,0.02,0.0
1,1,0.0,0.05,0.0,0.0,0.0,0.0,0.01,0.04,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.44,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.06,0.04,0.17,0.09,0.03
2,2,0.03,0.04,0.0,0.0,0.04,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.35,0.0,0.0,0.02,0.08,0.23,0.0,0.0,0.0,0.03,0.0,0.0,0.02,0.0,0.0,0.0,0.05,0.0,0.0,0.01,0.06,0.01,0.0


In [11]:
y_kaggle_pred.to_csv(PROCESSED_DATA + 'y_kaggle_pred_1st_prep_2nd_model.zip', sep=',', index=False, compression='zip')

# 3rd experiment: Multinomial Naive Bayes (baseline) with cyclical features after sine/cosine transformation

## Data import

In [7]:
try:
    X_train = pd.read_parquet(INTERIM_DATA + 'X_train_transformed_2nd_dataprep.pqt')
    X_validation = pd.read_parquet(INTERIM_DATA + 'X_validation_transformed_2nd_dataprep.pqt')
    X_kaggle_test = pd.read_parquet(INTERIM_DATA + 'X_kaggle_test_transformed_transformed_2nd_dataprep.pqt')

    y_train_ordinal = pd.read_parquet(INTERIM_DATA + 'y_train_ordinal.pqt')
    y_validation_ordinal = pd.read_parquet(INTERIM_DATA + 'y_validation_ordinal.pqt')
    print('Parquet files loaded.')

except FileNotFoundError as e:
    print('Files were not found.')
    
print(f"""
X_train: {X_train.shape}
X_validation: {X_validation.shape}
X_kaggle_test: {X_kaggle_test.shape}

y_train_ordinal: {y_train_ordinal.shape}
y_validation_ordinal: {y_validation_ordinal.shape}
""")

X_train.tail(3)

Parquet files loaded.

X_train: (790244, 13)
X_validation: (87805, 13)
X_kaggle_test: (884262, 13)

y_train_ordinal: (790244, 1)
y_validation_ordinal: (87805, 1)



Unnamed: 0,dates_year,is_daytime,day_of_week,pd_district,x,y,simplified_address,dates_month_sin,dates_month_cos,dates_day_sin,dates_day_cos,dates_hour_sin,dates_hour_cos
131932,2013,0,0.144286,0.136463,-122.426956,37.769247,0.039924,-0.866025,-0.5,0.897805,-0.440394,-0.269797,0.962917
671155,2005,0,0.15231,0.102038,-122.386942,37.754168,0.776874,-0.5,0.8660254,0.724793,0.688967,0.979084,0.203456
121958,2013,1,0.138624,0.17908,-122.408068,37.783992,0.039924,-1.0,-1.83697e-16,-0.201299,0.97953,-0.136167,-0.990686


In [9]:
X_train_mod = X_train.copy()

for column in ['x', 'dates_month_sin', 'dates_month_cos', 'dates_day_sin', 'dates_day_cos', 'dates_hour_sin', 'dates_hour_cos']:
    X_train_mod[column] = X_train_mod[column].abs()

model = MultinomialNB()
model.fit(X_train_mod, y_train_ordinal.values[:, 0])

y_pred = model.predict(X_validation)

In [10]:
# print(log_loss(y_validation_ordinal, y_pred, labels=np.arange(0,38).tolist()))

print(classification_report(y_validation_ordinal, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       168
           1       0.00      0.00      0.00      7632
           2       0.00      0.00      0.00        41
           3       0.00      0.00      0.00        28
           4       0.00      0.00      0.00      3762
           5       0.00      0.00      0.00       409
           6       0.00      0.00      0.00       212
           7       0.00      0.00      0.00      5437
           8       0.00      0.00      0.00       421
           9       0.00      0.00      0.00        91
          10       0.00      0.00      0.00        29
          11       0.00      0.00      0.00        44
          12       0.00      0.00      0.00      1043
          13       0.00      0.00      0.00      1621
          14       0.00      0.00      0.00        14
          15       0.00      0.00      0.00       232
          16       0.20      1.00      0.33     17455
          17       0.00    

## Kaggle's test dataset

In [11]:
# Predicting the probabilities from X_kaggle_test.
y_kaggle_pred = model.predict_proba(X_kaggle_test).round(2)
y_kaggle_pred = pd.DataFrame(y_kaggle_pred, columns=lst_columns).reset_index().rename(columns={'index': 'Id'})
y_kaggle_pred.head(3)

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.0,0.09,0.0,0.0,0.04,0.0,0.0,0.05,0.01,0.0,0.0,0.0,0.01,0.02,0.0,0.0,0.21,0.0,0.0,0.03,0.1,0.14,0.0,0.01,0.0,0.03,0.0,0.01,0.01,0.0,0.01,0.0,0.04,0.0,0.01,0.06,0.06,0.04,0.01
1,1,0.0,0.09,0.0,0.0,0.03,0.0,0.0,0.06,0.01,0.0,0.0,0.0,0.01,0.02,0.0,0.0,0.2,0.0,0.0,0.02,0.11,0.15,0.0,0.01,0.0,0.03,0.0,0.01,0.01,0.0,0.01,0.0,0.03,0.0,0.01,0.05,0.05,0.05,0.01
2,2,0.0,0.09,0.0,0.0,0.04,0.0,0.0,0.05,0.01,0.0,0.0,0.0,0.01,0.02,0.0,0.0,0.21,0.0,0.0,0.03,0.1,0.14,0.0,0.01,0.0,0.03,0.0,0.01,0.01,0.0,0.01,0.0,0.04,0.0,0.01,0.06,0.06,0.04,0.01


In [14]:
y_kaggle_pred.to_csv(PROCESSED_DATA + 'y_kaggle_pred_2nd_prep_1st_model.zip', sep=',', index=False, compression='zip')

### Kaggle's score:

Score: 3.70134