In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from path import Path
from uszipcode import Zipcode #pip install uszipcode



In [22]:
# Read the raw San Francisco incident export (relative path) into a DataFrame.
file_path = Path("../Resources/sanFranCrime.csv")
initial_df = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first five rows.
initial_df.iloc[:5]

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,PdId
0,120058272,WEAPON LAWS,POSS OF PROHIBITED WEAPON,Friday,01/29/2016 12:00:00 AM,11:00,SOUTHERN,"ARREST, BOOKED",800 Block of BRYANT ST,-122.403405,37.775421,"(37.775420706711, -122.403404791479)",12005827212120
1,120058272,WEAPON LAWS,"FIREARM, LOADED, IN VEHICLE, POSSESSION OR USE",Friday,01/29/2016 12:00:00 AM,11:00,SOUTHERN,"ARREST, BOOKED",800 Block of BRYANT ST,-122.403405,37.775421,"(37.775420706711, -122.403404791479)",12005827212168
2,141059263,WARRANTS,WARRANT ARREST,Monday,04/25/2016 12:00:00 AM,14:59,BAYVIEW,"ARREST, BOOKED",KEITH ST / SHAFTER AV,-122.388856,37.729981,"(37.7299809672996, -122.388856204292)",14105926363010
3,160013662,NON-CRIMINAL,LOST PROPERTY,Tuesday,01/05/2016 12:00:00 AM,23:50,TENDERLOIN,NONE,JONES ST / OFARRELL ST,-122.412971,37.785788,"(37.7857883766888, -122.412970537591)",16001366271000
4,160002740,NON-CRIMINAL,LOST PROPERTY,Friday,01/01/2016 12:00:00 AM,00:30,MISSION,NONE,16TH ST / MISSION ST,-122.419672,37.76505,"(37.7650501214668, -122.419671780296)",16000274071000


In [23]:
# Create a separate dataframe for the location data.
# Selecting with .loc raises a KeyError if a column name is misspelled (passing
# `columns=` to the DataFrame constructor would silently create an all-NaN
# column instead), and .copy() makes the new frame independent of initial_df
# so columns can be added to it later without touching the source frame.
geo_columns = ["IncidntNum", "Date", "Address", "X", "Y", "Location"]
geo_data_df = initial_df.loc[:, geo_columns].copy()
geo_data_df.head()

Unnamed: 0,IncidntNum,Date,Address,X,Y,Location
0,120058272,01/29/2016 12:00:00 AM,800 Block of BRYANT ST,-122.403405,37.775421,"(37.775420706711, -122.403404791479)"
1,120058272,01/29/2016 12:00:00 AM,800 Block of BRYANT ST,-122.403405,37.775421,"(37.775420706711, -122.403404791479)"
2,141059263,04/25/2016 12:00:00 AM,KEITH ST / SHAFTER AV,-122.388856,37.729981,"(37.7299809672996, -122.388856204292)"
3,160013662,01/05/2016 12:00:00 AM,JONES ST / OFARRELL ST,-122.412971,37.785788,"(37.7857883766888, -122.412970537591)"
4,160002740,01/01/2016 12:00:00 AM,16TH ST / MISSION ST,-122.419672,37.76505,"(37.7650501214668, -122.419671780296)"


In [24]:
# uszipcode setup; usage examples:
# https://uszipcode.readthedocs.io/index.html#example-usage
# Heads-up: this cell took about 10 minutes to run.
from uszipcode import SearchEngine, Zipcode
search = SearchEngine(simple_zipcode=True)

def convertToZip(df, engine=None):
    '''Return the zipcode of the first match for one row's coordinates.

    Parameters
    ----------
    df : mapping-like (e.g. one DataFrame row)
        Must provide 'Y' and 'X' entries; they are passed to
        ``by_coordinates`` as the first and second argument respectively.
    engine : object, optional
        Anything exposing ``by_coordinates(lat, lng)``. Defaults to the
        notebook-level ``search`` engine, so existing calls such as
        ``frame.apply(convertToZip, axis=1)`` keep working unchanged.

    Returns
    -------
    The ``zipcode`` attribute of the first result, or ``None`` when the
    lookup returns no results (previously this raised IndexError and
    aborted the whole ``apply``).
    '''
    if engine is None:
        engine = search
    result = engine.by_coordinates(df['Y'], df['X'])
    # The lookup returns several candidates; only the first one is used.
    if not result:
        return None
    return result[0].zipcode

# Many incidents share the exact same coordinates, so look each distinct
# (Y, X) pair up once instead of once per row, then map the result back.
coords = geo_data_df[['Y', 'X']]
unique_coords = coords.drop_duplicates()
zip_lookup = unique_coords.assign(Zipcode=unique_coords.apply(convertToZip, axis=1))
# A left merge keeps the rows in the order of `coords`; because the lookup keys
# are unique the result has exactly one row per original row, so the values
# can be assigned back positionally.
geo_data_df['Zipcode'] = coords.merge(zip_lookup, on=['Y', 'X'], how='left')['Zipcode'].to_numpy()


In [26]:
# Sanity check: look at the first ten rows to see whether the Zipcode values look plausible
geo_data_df.iloc[:10]

Unnamed: 0,IncidntNum,Date,Address,X,Y,Location,Zipcode
0,120058272,01/29/2016 12:00:00 AM,800 Block of BRYANT ST,-122.403405,37.775421,"(37.775420706711, -122.403404791479)",94103
1,120058272,01/29/2016 12:00:00 AM,800 Block of BRYANT ST,-122.403405,37.775421,"(37.775420706711, -122.403404791479)",94103
2,141059263,04/25/2016 12:00:00 AM,KEITH ST / SHAFTER AV,-122.388856,37.729981,"(37.7299809672996, -122.388856204292)",94124
3,160013662,01/05/2016 12:00:00 AM,JONES ST / OFARRELL ST,-122.412971,37.785788,"(37.7857883766888, -122.412970537591)",94108
4,160002740,01/01/2016 12:00:00 AM,16TH ST / MISSION ST,-122.419672,37.76505,"(37.7650501214668, -122.419671780296)",94102
5,160002869,01/01/2016 12:00:00 AM,1700 Block of BUSH ST,-122.426077,37.788019,"(37.788018555829, -122.426077177375)",94109
6,160003130,01/02/2016 12:00:00 AM,MARY ST / HOWARD ST,-122.405721,37.780879,"(37.7808789360214, -122.405721454567)",94103
7,160003259,01/02/2016 12:00:00 AM,200 Block of EDDY ST,-122.411778,37.783981,"(37.7839805592634, -122.411778295992)",94103
8,160003970,01/02/2016 12:00:00 AM,4TH ST / BERRY ST,-122.393357,37.775788,"(37.7757876218293, -122.393357241451)",94158
9,160003641,01/01/2016 12:00:00 AM,100 Block of CAMERON WY,-122.387182,37.720967,"(37.7209669615499, -122.387181635995)",94124


In [4]:
# Save geo_data_df as csv. A relative path (the same ../Resources folder the
# input CSV is read from) replaces the absolute /Users/... path, which only
# exists on one machine.
geo_data_df.to_csv("../Resources/geo_data.csv")

In [5]:
# Strip the location, date and day-of-week fields out of initial_df
initial_df = initial_df.drop(
    ["Address", "X", "Y", "Location", "Date", "DayOfWeek"],
    axis="columns",
)
initial_df.iloc[:5]

Unnamed: 0,IncidntNum,Category,Descript,Time,PdDistrict,Resolution,PdId
0,120058272,WEAPON LAWS,POSS OF PROHIBITED WEAPON,11:00,SOUTHERN,"ARREST, BOOKED",12005827212120
1,120058272,WEAPON LAWS,"FIREARM, LOADED, IN VEHICLE, POSSESSION OR USE",11:00,SOUTHERN,"ARREST, BOOKED",12005827212168
2,141059263,WARRANTS,WARRANT ARREST,14:59,BAYVIEW,"ARREST, BOOKED",14105926363010
3,160013662,NON-CRIMINAL,LOST PROPERTY,23:50,TENDERLOIN,NONE,16001366271000
4,160002740,NON-CRIMINAL,LOST PROPERTY,00:30,MISSION,NONE,16000274071000


In [6]:
# The columns that are left will be our features; the target column is Resolution.
# Count the distinct values in each remaining column.
initial_df.apply(pd.Series.nunique)

IncidntNum    116699
Category          39
Descript         726
Time            1439
PdDistrict        10
Resolution        14
PdId          150500
dtype: int64

In [7]:
# Frequency of each distinct value in the Resolution column
initial_df.loc[:, "Resolution"].value_counts()

NONE                                      107780
ARREST, BOOKED                             39416
UNFOUNDED                                   1608
JUVENILE BOOKED                             1056
EXCEPTIONAL CLEARANCE                        371
ARREST, CITED                                144
CLEARED-CONTACT JUVENILE FOR MORE INFO        58
NOT PROSECUTED                                22
LOCATED                                       20
PSYCHOPATHIC CASE                             17
JUVENILE CITED                                 3
COMPLAINANT REFUSES TO PROSECUTE               2
JUVENILE DIVERTED                              2
PROSECUTED BY OUTSIDE AGENCY                   1
Name: Resolution, dtype: int64

In [8]:
# Frequency of each distinct value in the Descript column
initial_df.loc[:, "Descript"].value_counts()

GRAND THEFT FROM LOCKED AUTO                      17741
LOST PROPERTY                                      4596
AIDED CASE, MENTAL DISTURBED                       4566
PETTY THEFT OF PROPERTY                            4416
MALICIOUS MISCHIEF, VANDALISM                      4262
                                                  ...  
POLICE BROADCAST, INTERCEPTION TO COMMIT CRIME        1
PETTY THEFT MOTORCYCLE STRIP                          1
MONEY LAUNDERING                                      1
TRANSPORTATION OF METHADONE                           1
FORCIBLE RAPE, ARMED WITH A GUN                       1
Name: Descript, Length: 726, dtype: int64

In [9]:
# Bucket the rare resolutions into "other" to prepare a binary classification
# target. Any resolution that occurs fewer than `min_resolution_count` times is
# bucketed; with the counts shown above that is everything besides NONE and
# "ARREST, BOOKED".
min_resolution_count = 1700

resolution_counts = initial_df["Resolution"].value_counts()

replace_resolution = list(resolution_counts[resolution_counts < min_resolution_count].index)

# Relabel all rare resolutions in one vectorized pass instead of rewriting the
# whole column once per rare label.
initial_df["Resolution"] = initial_df["Resolution"].mask(
    initial_df["Resolution"].isin(replace_resolution), "other"
)

initial_df["Resolution"].value_counts()

NONE              107780
ARREST, BOOKED     39416
other               3304
Name: Resolution, dtype: int64

Out of 150,500 rows, only 3,304 are "other" after bucketing the "Resolution" column. This is only about 2% of our overall data. These "other" rows will be dropped so that the "Resolution" column can be used as our binary classification target for machine learning. 

In [10]:
# Keep only the rows whose Resolution is not "other", discard the
# non-essential columns, then remove any rows with missing values.
cleaned_df = (
    initial_df.loc[initial_df["Resolution"].ne("other")]
    .drop(columns=["IncidntNum", "PdId", "Descript", "Time"])
    .dropna()
)
cleaned_df.head()

Unnamed: 0,Category,PdDistrict,Resolution
0,WEAPON LAWS,SOUTHERN,"ARREST, BOOKED"
1,WEAPON LAWS,SOUTHERN,"ARREST, BOOKED"
2,WARRANTS,BAYVIEW,"ARREST, BOOKED"
3,NON-CRIMINAL,TENDERLOIN,NONE
4,NON-CRIMINAL,MISSION,NONE


In [13]:
# Save cleaned_df as csv. A relative path (the same ../Resources folder the
# input CSV is read from) replaces the absolute /Users/... path, which only
# exists on one machine.
cleaned_df.to_csv("../Resources/cleaned_df.csv")