# Business and data understanding

## Purpose
This notebook contains the data preparation phase for the .

## Tasks
- [ ] Create a baseline model.
- [ ] Submit the test result.

# Setup

## Library import

In [35]:
from datetime import datetime 
import os
from pathlib import Path

from feature_engine.encoding import CountFrequencyEncoder, RareLabelEncoder, OneHotEncoder
import folium
import humps
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as ply

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

# Options for pandas and plotly
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 200
plotly.offline.init_notebook_mode(connected=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Local library import

In [10]:
if Path.cwd().name == 'notebooks':
    os.chdir('../')

from src.utils.data_describe import serie_nulos, cardinalidade
from src.data.data_preprocessing import (
    TransformCordinates, address_split, create_date_based_columns, snake_case_columns, create_simplified_address_column
)

if Path.cwd().name == 'sf_crime':
    os.chdir('./notebooks/')

## Parameter definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.

In [11]:
RAW_DATA = '../data/raw/' 
EXTERNAL_DATA = '../data/external/' 
INTERIM_DATA = '../data/interim/' 
PROCESSED_DATA = '../data/processed/'
REFERENCES = '../references/'
RANDOM_STATE = 42

## Data import
We retrieve all the required data for the analysis.

In [15]:
try:
    X_train = pd.read_parquet(INTERIM_DATA + 'X_train_transformed.pqt')
    X_test = pd.read_parquet(INTERIM_DATA + 'X_test_transformed.pqt')
    y_train_ohe = pd.read_parquet(INTERIM_DATA + 'y_train_ohe.pqt')
    y_test_ohe = pd.read_parquet(INTERIM_DATA + 'y_test_ohe.pqt')
    y_train_ordinal = pd.read_parquet(INTERIM_DATA + 'y_train_ordinal.pqt')
    y_test_ordinal = pd.read_parquet(INTERIM_DATA + 'y_test_ordinal.pqt')
    print('Parquet files loaded.')

except FileNotFoundError as e:
    print('Files were not found.')

print(f"""
X_train: {X_train.shape}
X_test: {X_test.shape}

y_train_ohe: {y_train.shape}
y_test_ohe: {y_test.shape}

y_train_ordinal: {y_train_ordinal.shape}
y_test_ordinal: {y_test_ordinal.shape}
""")

X_train.tail(3)

Parquet files loaded.

X_train: (790244, 10)
X_test: (87805, 10)

y_train_ohe: (790244, 39)
y_test_ohe: (87805, 39)

y_train_ordinal: (790244, 1)
y_test_ordinal: (87805, 1)



Unnamed: 0,dates_year,dates_month,dates_hour,dates_day,is_daytime,day_of_week,pd_district,x,y,simplified_address
131932,2013,8,22,10,0,0.144286,0.136463,-122.426956,37.769247,0.039924
671155,2005,11,5,4,0,0.15231,0.102038,-122.386942,37.754168,0.776874
121958,2013,9,12,30,1,0.138624,0.17908,-122.408068,37.783992,0.039924


# 1st experiment: baseline

- Naive bayes. One model for each crime category.

In [20]:
X_train['x'] = X_train['x'].abs()

In [34]:
model = MultinomialNB()
model.fit(X_train, y_train_ordinal.values[:, 0])

y_pred = model.predict(X_test)

In [40]:
print(classification_report(y_test_ordinal, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      5437
           1       0.11      0.03      0.05      7632
           2       0.00      0.00      0.00      3762
           3       0.00      0.00      0.00      1621
           4       0.00      0.00      0.00      4216
           5       0.15      0.20      0.17     12464
           6       0.00      0.00      0.00       470
           7       0.21      0.84      0.34     17455
           8       0.00      0.00      0.00       779
           9       0.00      0.00      0.00      3197
          10       0.00      0.00      0.00      5296
          11       0.00      0.00      0.00      9311
          12       0.00      0.00      0.00      1043
          13       0.00      0.00      0.00      2562
          14       0.00      0.00      0.00      4540
          15       0.00      0.00      0.00      2281
          16       0.00      0.00      0.00      1031
          17       0.00    