# Data preparation

## Library import

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline

## Parameter definition

In [41]:
RAW_DATA = '../data/raw/' 
EXTERNAL_DATA = '../data/external/' 
INTERIM_DATA = '../data/interim/' 
PROCESSED_DATA = '../data/processed/'
REFERENCES = '../references/'
RANDOM_STATE = 14

## Read data

In [42]:
df = pd.read_parquet(INTERIM_DATA + 'train.pqt')
df.head()

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,month,day,year,hour
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,5,13,2015,23
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,5,13,2015,23
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,5,13,2015,23


## First attempt

In the first attempt, which will became the baseline, I won't make any data preparation.

In [43]:
df_1 = df.copy()

In [44]:
df_1.to_parquet(PROCESSED_DATA + 'train_1.pqt', index=False)

## Second attempt

For this attempt I will:
 - Drop all duplicated rows
 - Create column "weekend" that shows if the crime happened during the week or the weekend

In [45]:
df_2 = df_1.copy()

In [46]:
df_2.duplicated().sum()

4658

In [47]:
df_2.drop_duplicates(keep='first', inplace=True)
df_2

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,month,day,year,hour
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,5,13,2015,23
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,5,13,2015,23
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,5,13,2015,23
...,...,...,...,...,...,...,...,...,...,...,...,...
878044,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056,1,6,2003,0
878045,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948,1,6,2003,0
878046,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266,1,6,2003,0
878047,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607,1,6,2003,0


In [48]:
df_2['Weekend'] = np.where((df_2['DayOfWeek']=='Sunday')|(df_2['DayOfWeek']=='Saturday'), 1, 0)
df_2.head(100)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,month,day,year,hour,Weekend
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23,0
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23,0
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,5,13,2015,23,0
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,5,13,2015,23,0
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,5,13,2015,23,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,OTHER OFFENSES,TRAFFIC VIOLATION,Wednesday,MISSION,"ARREST, BOOKED",25TH ST / SHOTWELL ST,-122.415015,37.750836,5,13,2015,18,0
97,BURGLARY,"BURGLARY OF APARTMENT HOUSE, UNLAWFUL ENTRY",Wednesday,SOUTHERN,NONE,0 Block of 6TH ST,-122.409504,37.781526,5,13,2015,18,0
98,LARCENY/THEFT,GRAND THEFT FROM UNLOCKED AUTO,Wednesday,BAYVIEW,NONE,ILLINOIS ST / 20TH ST,-122.387571,37.760550,5,13,2015,18,0
99,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,CASTRO ST / 16TH ST,-122.435318,37.764102,5,13,2015,18,0


In [49]:
df_2.to_parquet(PROCESSED_DATA + 'train_2.pqt', index=False)

## Third attempt

For this attempt I will:
 - Create column "day_period" that shows if the crime happened during dawn, morning, afternoon or night

In [50]:
df_3 = df_2.copy()

In [51]:
#Creating morning, afternoon, night and dawn
condlist = [df_3['hour']<6, df_3['hour']<12, df_3['hour']<18]
choicelist = ['dawn', 'morning', 'afternoon']
df_3['day_period'] = np.select(condlist, choicelist, 'night')
df_3['day_period'].value_counts()

afternoon    289873
night        279411
morning      178794
dawn         125313
Name: day_period, dtype: int64

In [52]:
df_3.to_parquet(PROCESSED_DATA + 'train_3.pqt', index=False)

In [53]:
df_3

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,month,day,year,hour,Weekend,day_period
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23,0,night
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23,0,night
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,5,13,2015,23,0,night
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,5,13,2015,23,0,night
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,5,13,2015,23,0,night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878044,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056,1,6,2003,0,0,dawn
878045,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948,1,6,2003,0,0,dawn
878046,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266,1,6,2003,0,0,dawn
878047,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607,1,6,2003,0,0,dawn


## Fourth attempt

For this attempt I will:
 - Replace outliers longitude and latitude for mode values

In [54]:
df_4 = df_3.copy()
df_4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 873391 entries, 0 to 878048
Data columns (total 14 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Category    873391 non-null  object 
 1   Descript    873391 non-null  object 
 2   DayOfWeek   873391 non-null  object 
 3   PdDistrict  873391 non-null  object 
 4   Resolution  873391 non-null  object 
 5   Address     873391 non-null  object 
 6   X           873391 non-null  float64
 7   Y           873391 non-null  float64
 8   month       873391 non-null  int64  
 9   day         873391 non-null  int64  
 10  year        873391 non-null  int64  
 11  hour        873391 non-null  int64  
 12  Weekend     873391 non-null  int32  
 13  day_period  873391 non-null  object 
dtypes: float64(2), int32(1), int64(4), object(7)
memory usage: 96.6+ MB


In [55]:
#df_4 = df.copy()
outliers = (df['X'] > -122.35)|(df['Y'] > 37.82)
df_4.loc[outliers, ['X', 'Y']] = np.nan
#df.info()
outliers.value_counts()

False    877982
True         67
dtype: int64

In [56]:
df_4.query('X>-122')

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,month,day,year,hour,Weekend,day_period


In [57]:
df_4.to_parquet(PROCESSED_DATA + 'train_4.pqt', index=False)

In [58]:
df_4

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,month,day,year,hour,Weekend,day_period
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23,0,night
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23,0,night
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,5,13,2015,23,0,night
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,5,13,2015,23,0,night
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,5,13,2015,23,0,night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878044,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056,1,6,2003,0,0,dawn
878045,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948,1,6,2003,0,0,dawn
878046,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266,1,6,2003,0,0,dawn
878047,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607,1,6,2003,0,0,dawn
