# San Francisco Crime Classification

## Library import

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline

# import geopandas as gpd
# from shapely.geometry import Point
# from geopandas import GeoDataFrame


## Parameter definition

In [55]:
# Project data layout, relative to the notebook directory.
# Every directory constant keeps its trailing slash so file names can be
# appended with plain string concatenation.
DATA_ROOT = '../data/'
RAW_DATA = DATA_ROOT + 'raw/'
EXTERNAL_DATA = DATA_ROOT + 'external/'
INTERIM_DATA = DATA_ROOT + 'interim/'
PROCESSED_DATA = DATA_ROOT + 'processed/'
REFERENCES = '../references/'

# Seed shared by every stochastic step, for reproducibility.
RANDOM_STATE = 14

## Data fields

Data fields
 - Dates - timestamp of the crime incident
 - **Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.**
 - **Descript - detailed description of the crime incident (only in train.csv)**
 - DayOfWeek - the day of the week
 - PdDistrict - name of the Police Department District
 - **Resolution - how the crime incident was resolved (only in train.csv)**
 - Address - the approximate street address of the crime incident 
 - X - Longitude
 - Y - Latitude

Public holidays in San Francisco

 - New Year's Day: 1st January.
 - Martin Luther King Jr Day: Third Monday of January.
 - Presidents' Day: Third Monday in February.
 - Memorial Day: Last Monday in May.
 - Independence Day: 4th July.
 - Labor Day: First Monday in September.
 - Columbus Day: Second Monday in October.
 - Veterans' Day: 11th November.
 - Thanksgiving: Fourth Thursday in November.
 - Christmas Day: 25th December.

## Learning dataset

In [78]:
# Read the raw training set from the raw-data directory and preview its first rows.
train_csv_path = RAW_DATA + 'train.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [57]:
# Column dtypes, non-null counts and memory footprint of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [58]:
# Number of missing values in each column.
df.isna().sum()

Dates         0
Category      0
Descript      0
DayOfWeek     0
PdDistrict    0
Resolution    0
Address       0
X             0
Y             0
dtype: int64

There are no null values in the training dataset.

In [59]:
# Print the frequency table of every column, each followed by a separator line.
for column_name, column_values in df.items():
    print(column_name)
    print(column_values.value_counts())
    print('--' * 40)

Dates
2011-01-01 00:01:00    185
2006-01-01 00:01:00    136
2012-01-01 00:01:00     94
2006-01-01 12:00:00     63
2007-06-01 00:01:00     61
                      ... 
2012-06-26 00:10:00      1
2012-06-26 00:16:00      1
2012-06-26 00:55:00      1
2012-06-26 01:06:00      1
2009-02-19 08:00:00      1
Name: Dates, Length: 389257, dtype: int64
--------------------------------------------------------------------------------
Category
LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  

In [60]:
# Descriptive statistics of the numeric columns (X and Y at this point).
df.describe()

Unnamed: 0,X,Y
count,878049.0,878049.0
mean,-122.422616,37.77102
std,0.030354,0.456893
min,-122.513642,37.707879
25%,-122.432952,37.752427
50%,-122.41642,37.775421
75%,-122.406959,37.784369
max,-120.5,90.0


In [61]:
# Percentage of incidents whose Resolution is the literal string 'NONE'.
# Series.sum() counts the matches in vectorized code; the builtin sum() would
# walk the boolean Series one element at a time in Python.
(df['Resolution'] == 'NONE').sum() / len(df) * 100

59.99551277889958

In [62]:
# Number of distinct values in Category, the target variable.
len(df['Category'].unique())

39

In [63]:
# Category labels ordered from most to least frequent.
# Computed straight from df so the cell runs on a fresh kernel without
# relying on any helper frame.
df['Category'].value_counts().index.tolist()

['LARCENY/THEFT',
 'OTHER OFFENSES',
 'NON-CRIMINAL',
 'ASSAULT',
 'DRUG/NARCOTIC',
 'VEHICLE THEFT',
 'VANDALISM',
 'WARRANTS',
 'BURGLARY',
 'SUSPICIOUS OCC',
 'MISSING PERSON',
 'ROBBERY',
 'FRAUD',
 'FORGERY/COUNTERFEITING',
 'SECONDARY CODES',
 'WEAPON LAWS',
 'PROSTITUTION',
 'TRESPASS',
 'STOLEN PROPERTY',
 'SEX OFFENSES FORCIBLE',
 'DISORDERLY CONDUCT',
 'DRUNKENNESS',
 'RECOVERED VEHICLE',
 'KIDNAPPING',
 'DRIVING UNDER THE INFLUENCE',
 'RUNAWAY',
 'LIQUOR LAWS',
 'ARSON',
 'LOITERING',
 'EMBEZZLEMENT',
 'SUICIDE',
 'FAMILY OFFENSES',
 'BAD CHECKS',
 'BRIBERY',
 'EXTORTION',
 'SEX OFFENSES NON FORCIBLE',
 'GAMBLING',
 'PORNOGRAPHY/OBSCENE MAT',
 'TREA']

In [64]:
# df_test3 = pd.DataFrame()
# df_test3['Prop'] = df[['Category']].value_counts()/df[['Category']].value_counts().sum()
# df_test3['Cumsum'] = df_test3['Prop'].cumsum()
# df_test3 = df_test3.reset_index()
# fig, ax = plt.subplots(figsize = (15, 6))
# df_test3['Cumsum'].plot(kind='bar')
# plt.xlabel(df_test3['Category'].values, rotation=90);
# #df_test3

In [65]:
# Cumulative share (in %) of all incidents covered by the k most frequent
# categories, for k = 1..number of categories.

# value_counts() returns the counts sorted from most to least frequent.
df_test = df['Category'].value_counts()

# cumsum() replaces the manual running total. It also avoids positional
# `df_test[i]` lookups on a label-indexed Series, which pandas has deprecated.
df_test2 = (df_test.cumsum() / len(df['Category']) * 100).tolist()

df_test2

[19.919161686876244,
 34.28988587197298,
 44.80228324387363,
 53.557603277265855,
 59.70429896281415,
 65.82935576488327,
 70.92303504701901,
 75.73073940064849,
 79.91672446526333,
 83.49442912639272,
 86.45428671976165,
 89.07373050934515,
 90.97328281223486,
 92.18152973239533,
 93.31871000365584,
 94.29302920452048,
 95.14537343587887,
 95.97972322729142,
 96.49677865358312,
 96.99652297309149,
 97.48852285009151,
 97.97596717267487,
 98.33335041666238,
 98.59996423889783,
 98.85826417432284,
 99.0798918966937,
 99.29662239806663,
 99.46893624387705,
 99.60845009788748,
 99.74124450913332,
 99.799100050225,
 99.85501948068958,
 99.90125835801874,
 99.93417223867917,
 99.96332778694583,
 99.9801833382875,
 99.99681111190833,
 99.9993166668375,
 100.0]

In [66]:
# df_test3 = df_test3.sort_values(ascending=False)
# df_test3['cumperc'] = df_test3.cumsum()/df_test3.sum()*100
# df_test3

#TODO: create a summary with first impressions on dataset

- In the Resolution column, almost 60% of the values are NONE.
- There are 39 unique values of Category.
- The 9 most frequent categories account for almost 80% of the crimes (79.9%); the top 10 account for 83.5%.

## Date in dataset

**Transforming Dates column from object to datetime**

In [79]:
# Parse the Dates strings into datetime values so the .dt accessor can be used
# to extract calendar features further down.
df['Dates'] = pd.to_datetime(df['Dates'])
df['Dates']

0        2015-05-13 23:53:00
1        2015-05-13 23:53:00
2        2015-05-13 23:33:00
3        2015-05-13 23:30:00
4        2015-05-13 23:30:00
                 ...        
878044   2003-01-06 00:15:00
878045   2003-01-06 00:01:00
878046   2003-01-06 00:01:00
878047   2003-01-06 00:01:00
878048   2003-01-06 00:01:00
Name: Dates, Length: 878049, dtype: datetime64[ns]

In [68]:
#df['month'] = df['Dates'].apply(lambda dt: dt.month)
#df['day'] = df['Dates'].apply(lambda dt: dt.day)
#df['year'] = df['Dates'].apply(lambda dt: dt.year)
#df

In [77]:
# Re-check the dtypes after the datetime conversion.
# NOTE(review): the stored output of this cell lists month/day/year/time columns
# and no Dates column, which does not match the frame at this point in a
# top-to-bottom run; it looks like it came from an out-of-order execution.
# Re-run the notebook to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Category    878049 non-null  object 
 1   Descript    878049 non-null  object 
 2   DayOfWeek   878049 non-null  object 
 3   PdDistrict  878049 non-null  object 
 4   Resolution  878049 non-null  object 
 5   Address     878049 non-null  object 
 6   X           878049 non-null  float64
 7   Y           878049 non-null  float64
 8   month       878049 non-null  int64  
 9   day         878049 non-null  int64  
 10  year        878049 non-null  int64  
 11  time        878049 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 80.4+ MB


In [80]:
# Split the timestamp into calendar features through the .dt accessor.
# Minute and time-of-day columns are not created.
date_parts = df['Dates'].dt
for part in ('month', 'day', 'year', 'hour'):
    df[part] = getattr(date_parts, part)
df

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,month,day,year,hour
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,5,13,2015,23
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,5,13,2015,23
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,5,13,2015,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...
878044,2003-01-06 00:15:00,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056,1,6,2003,0
878045,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948,1,6,2003,0
878046,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266,1,6,2003,0
878047,2003-01-06 00:01:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607,1,6,2003,0


## Day of week

In [70]:
# Number of incidents recorded on each day of the week.
df['DayOfWeek'].value_counts()

Friday       133734
Wednesday    129211
Saturday     126810
Thursday     125038
Tuesday      124965
Monday       121584
Sunday       116707
Name: DayOfWeek, dtype: int64

#TODO: what information can I get from day of week?

## Category

In [71]:
# Number of incidents per crime category, most frequent first.
df['Category'].value_counts()

LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7326
STOLEN PROPERTY                  4540
SEX OFFENSES FORCIBLE            4388
DISORDERLY CONDUCT               4320
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQUOR LAWS 

## Latitude (y) and longitude (x)

San Francisco Latitude and longitude coordinates are: 37.773972, -122.431297

 - southernmost position: 37.708086
 - northernmost position: 37.811324
 - westernmost position: -122.514675
 - easternmost position: -122.356859
 
 Source: Google Maps

In [72]:
# Summary statistics of the coordinates, to compare with the city bounds listed above.
# NOTE(review): the maxima (X = -120.5, Y = 90.0) fall outside those bounds;
# presumably invalid/placeholder coordinates. Confirm and handle before modelling.
df[['X', 'Y']].describe()

Unnamed: 0,X,Y
count,878049.0,878049.0
mean,-122.422616,37.77102
std,0.030354,0.456893
min,-122.513642,37.707879
25%,-122.432952,37.752427
50%,-122.41642,37.775421
75%,-122.406959,37.784369
max,-120.5,90.0


In [74]:
# geometry = [Point(xy) for xy in zip(train_data['X'], train_data['Y'])]
# gdf = GeoDataFrame(train_data, geometry=geometry)

# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# gdf.plot(ax=world.plot(), marker='o', color='red')
# plt.show()

## Exporting dataset

In [81]:
# Drop the raw timestamp now that month/day/year/hour have been derived from it.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='Dates', errors='ignore')
df

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,month,day,year,hour
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,5,13,2015,23
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,5,13,2015,23
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,5,13,2015,23
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,5,13,2015,23
...,...,...,...,...,...,...,...,...,...,...,...,...
878044,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056,1,6,2003,0
878045,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948,1,6,2003,0
878046,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266,1,6,2003,0
878047,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607,1,6,2003,0


In [82]:
# Persist the prepared training frame as Parquet in the interim-data directory,
# without writing the row index.
interim_train_path = INTERIM_DATA + 'train.pqt'
df.to_parquet(interim_train_path, index=False)