# Part 1: Data Cleaning

In [2]:
# Import dependencies
import pandas as pd
import os
import csv

In [3]:
# Read in source data.
# The path is assembled from its components so os.path.join actually does the
# joining (calling it with one pre-built string simply returns that string).
csv_path = os.path.join("..", "data", "Police_Department_Incident_Reports.csv")

# Load the raw incident reports into a DataFrame
crime_df = pd.read_csv(csv_path)

# Show first 10 rows of DataFrame
crime_df.head(10)

Unnamed: 0,Incident Datetime,Incident Date,Incident Time,Incident Year,Incident Day of Week,Report Datetime,Row ID,Incident ID,Incident Number,CAD Number,...,Longitude,Point,Neighborhoods,ESNCAG - Boundary File,Central Market/Tenderloin Boundary Polygon - Updated,Civic Center Harm Reduction Project Boundary,HSOC Zones as of 2018-06-05,Invest In Neighborhoods (IIN) Areas,Current Supervisor Districts,Current Police Districts
0,2023/03/13 11:41:00 PM,2023/03/13,23:41,2023,Monday,2023/03/13 11:41:00 PM,125373607041,1253736,230167874,,...,,,,,,,,,,
1,2023/03/01 05:02:00 AM,2023/03/01,05:02,2023,Wednesday,2023/03/11 03:40:00 PM,125379506374,1253795,236046151,,...,,,,,,,,,,
2,2023/03/13 01:16:00 PM,2023/03/13,13:16,2023,Monday,2023/03/13 01:17:00 PM,125357107041,1253571,220343896,,...,,,,,,,,,,
3,2023/03/13 10:59:00 AM,2023/03/13,10:59,2023,Monday,2023/03/13 11:00:00 AM,125355107041,1253551,230174885,,...,,,,,,,,,,
4,2023/03/14 06:44:00 PM,2023/03/14,18:44,2023,Tuesday,2023/03/14 06:45:00 PM,125402407041,1254024,230176728,,...,,,,,,,,,,
5,2023/02/15 03:00:00 AM,2023/02/15,03:00,2023,Wednesday,2023/03/11 04:55:00 PM,125378606372,1253786,236046123,,...,,,,,,,,,,
6,2023/03/11 12:30:00 PM,2023/03/11,12:30,2023,Saturday,2023/03/12 04:15:00 PM,125381606244,1253816,236046004,,...,,,,,,,,,,
7,2023/03/13 11:26:00 AM,2023/03/13,11:26,2023,Monday,2023/03/13 01:37:00 PM,125419506244,1254195,236046850,,...,,,,,,,,,,
8,2023/03/11 03:00:00 PM,2023/03/11,15:00,2023,Saturday,2023/03/13 08:29:00 AM,125420606244,1254206,236045937,,...,,,,,,,,,,
9,2023/03/11 02:00:00 PM,2023/03/11,14:00,2023,Saturday,2023/03/15 11:21:00 AM,125431804134,1254318,230182844,230741133.0,...,-122.454285,POINT (-122.45428511766733 37.772895177200766),,,,,,,4.0,7.0


In [4]:
# Sanity-check the import by reporting how many records were loaded
num_rows_start = crime_df.shape[0]
print("Number of rows in the DataFrame:", num_rows_start)

Number of rows in the DataFrame: 850895


In [5]:
# Inspect the dtype pandas assigned to every column
column_dtypes = crime_df.dtypes
print(column_dtypes)

Incident Datetime                                        object
Incident Date                                            object
Incident Time                                            object
Incident Year                                             int64
Incident Day of Week                                     object
Report Datetime                                          object
Row ID                                                    int64
Incident ID                                               int64
Incident Number                                           int64
CAD Number                                              float64
Report Type Code                                         object
Report Type Description                                  object
Filed Online                                             object
Incident Code                                             int64
Incident Category                                        object
Incident Subcategory                    

In [6]:
# Columns that are removed from crime_df before the rest of the cleaning steps
columns_to_drop = [
    "Report Datetime",
    "Row ID",
    "Incident ID",
    "Incident Number",
    "CAD Number",
    "Report Type Code",
    "Report Type Description",
    "Filed Online",
    "Incident Code",
    "Intersection",
    "CNN",
    "Police District",
    "Supervisor District",
    "Supervisor District 2012",
    "Point",
    "ESNCAG - Boundary File",
    "Central Market/Tenderloin Boundary Polygon - Updated",
    "Civic Center Harm Reduction Project Boundary",
    "HSOC Zones as of 2018-06-05",
    "Invest In Neighborhoods (IIN) Areas",
    "Current Supervisor Districts",
    "Current Police Districts",
]

# Print the column listing before the drop so it can be compared with the result
print("Original DataFrame:")
print("-------------------")
print(crime_df.dtypes)

# Build a new DataFrame without the listed columns (crime_df itself is left untouched)
crime_new_df = crime_df.drop(labels=columns_to_drop, axis="columns")

# Print the column listing after the drop
print("\nNew DataFrame:")
print("-------------------")
print(crime_new_df.dtypes)


Original DataFrame:
-------------------
Incident Datetime                                        object
Incident Date                                            object
Incident Time                                            object
Incident Year                                             int64
Incident Day of Week                                     object
Report Datetime                                          object
Row ID                                                    int64
Incident ID                                               int64
Incident Number                                           int64
CAD Number                                              float64
Report Type Code                                         object
Report Type Description                                  object
Filed Online                                             object
Incident Code                                             int64
Incident Category                                        object


In [7]:
# Preview the first five rows of the DataFrame with the columns removed
crime_new_df.head()

Unnamed: 0,Incident Datetime,Incident Date,Incident Time,Incident Year,Incident Day of Week,Incident Category,Incident Subcategory,Incident Description,Resolution,Analysis Neighborhood,Latitude,Longitude,Neighborhoods
0,2023/03/13 11:41:00 PM,2023/03/13,23:41,2023,Monday,Recovered Vehicle,Recovered Vehicle,"Vehicle, Recovered, Auto",Open or Active,,,,
1,2023/03/01 05:02:00 AM,2023/03/01,05:02,2023,Wednesday,Larceny Theft,Larceny Theft - Other,"Theft, Other Property, >$950",Open or Active,,,,
2,2023/03/13 01:16:00 PM,2023/03/13,13:16,2023,Monday,Recovered Vehicle,Recovered Vehicle,"Vehicle, Recovered, Auto",Open or Active,,,,
3,2023/03/13 10:59:00 AM,2023/03/13,10:59,2023,Monday,Recovered Vehicle,Recovered Vehicle,"Vehicle, Recovered, Auto",Open or Active,,,,
4,2023/03/14 06:44:00 PM,2023/03/14,18:44,2023,Tuesday,Recovered Vehicle,Recovered Vehicle,"Vehicle, Recovered, Auto",Open or Active,,,,


In [8]:
# Discard every row that is missing a value in at least one column
crime_new_df = crime_new_df.dropna(axis="index", how="any")
crime_new_df.head()

Unnamed: 0,Incident Datetime,Incident Date,Incident Time,Incident Year,Incident Day of Week,Incident Category,Incident Subcategory,Incident Description,Resolution,Analysis Neighborhood,Latitude,Longitude,Neighborhoods
11,2022/06/27 12:00:00 PM,2022/06/27,12:00,2022,Monday,Lost Property,Lost Property,Lost Property,Open or Active,Financial District/South Beach,37.787359,-122.408227,19.0
13,2023/03/16 05:30:00 PM,2023/03/16,17:30,2023,Thursday,Assault,Simple Assault,Battery,Open or Active,Potrero Hill,37.76229,-122.401324,54.0
33,2023/03/21 03:50:00 PM,2023/03/21,15:50,2023,Tuesday,Non-Criminal,Non-Criminal,Aided Case,Open or Active,Tenderloin,37.787038,-122.418271,50.0
61,2021/08/22 09:40:00 AM,2021/08/22,09:40,2021,Sunday,Warrant,Other,Probation Search,Open or Active,Pacific Heights,37.793977,-122.429804,102.0
87,2022/07/02 10:53:00 PM,2022/07/02,22:53,2022,Saturday,Assault,Simple Assault,Battery,Open or Active,Bayview Hunters Point,37.719298,-122.39002,88.0


In [9]:
# Report how many rows are left once the null-containing rows are gone
new_num_rows = crime_new_df.shape[0]
print("Number of rows in the DataFrame:", new_num_rows)

Number of rows in the DataFrame: 786835


In [18]:
# Tally how often each incident category occurs in the dataset
all_unique_counts = crime_new_df["Incident Category"].value_counts()

# Reshape the tally into a two-column table: the category and its count
all_unique_incidents_df = (
    all_unique_counts
    .rename_axis("Unique_Values")
    .reset_index(name="Counts")
)

# Display incident categories and their counts
all_unique_incidents_df

Unnamed: 0,Unique_Values,Counts
0,Larceny Theft,227727
1,Other Miscellaneous,56346
2,Malicious Mischief,53872
3,Assault,50745
4,Non-Criminal,47792
5,Burglary,45772
6,Motor Vehicle Theft,44600
7,Fraud,26416
8,Recovered Vehicle,24087
9,Warrant,23772


In [11]:
# Save the incident-category tally as a CSV in the 'data' folder (index column omitted)
incident_categories_path = "../data/incident_categories.csv"
all_unique_incidents_df.to_csv(incident_categories_path, index=False)

In [12]:
# Incident categories that are retained when the data is filtered
keep_categories = [
    "Larceny Theft",
    "Malicious Mischief",
    "Assault",
    "Burglary",
    "Motor Vehicle Theft",
    "Drug Offense",
    "Robbery",
    "Missing Person",
    "Offences Against The Family And Children",
    "Weapons Offense",
    "Arson",
    "Vandalism",
    "Sex Offense",
    "Prostitution",
    "Rape",
    "Homicide",
    "Human Trafficking (A), Commercial Sex Acts",
    "Human Trafficking, Commercial Sex Acts",
    "Human Trafficking (B), Involuntary Servitude",
]

In [17]:
# Build the final DataFrame from the rows whose incident category is one of the
# categories listed in keep_categories, printing row counts to confirm the change.

# Row count of the initial DataFrame: crime_df
num_rows = crime_df.shape[0]
print("Number of rows in the initial DataFrame:", num_rows)
print("-----------------------------------")

# Select the matching rows into the cleaned DataFrame: cleaned_crime_df
category_mask = crime_new_df['Incident Category'].isin(keep_categories)
cleaned_crime_df = crime_new_df.loc[category_mask]

# Row count of the cleaned DataFrame
cleaned_num_rows = cleaned_crime_df.shape[0]
print("Number of rows in the final DataFrame:", cleaned_num_rows)

Number of rows in the initial DataFrame: 850895
-----------------------------------
Number of rows in the final DataFrame: 503415


In [19]:
# Tally the incident categories that remain in the cleaned DataFrame
select_unique_counts = cleaned_crime_df["Incident Category"].value_counts()

# Reshape the tally into a two-column table: the category and its count
unique_incidents_df = (
    select_unique_counts
    .rename_axis("Unique_Values")
    .reset_index(name="Counts")
)
unique_incidents_df

Unnamed: 0,Unique_Values,Counts
0,Larceny Theft,227727
1,Malicious Mischief,53872
2,Assault,50745
3,Burglary,45772
4,Motor Vehicle Theft,44600
5,Drug Offense,21096
6,Robbery,18512
7,Missing Person,17893
8,Offences Against The Family And Children,10975
9,Weapons Offense,5768


In [20]:
# Preview the first five rows of cleaned_crime_df
cleaned_crime_df.head()

Unnamed: 0,Incident Datetime,Incident Date,Incident Time,Incident Year,Incident Day of Week,Incident Category,Incident Subcategory,Incident Description,Resolution,Analysis Neighborhood,Latitude,Longitude,Neighborhoods
13,2023/03/16 05:30:00 PM,2023/03/16,17:30,2023,Thursday,Assault,Simple Assault,Battery,Open or Active,Potrero Hill,37.76229,-122.401324,54.0
87,2022/07/02 10:53:00 PM,2022/07/02,22:53,2022,Saturday,Assault,Simple Assault,Battery,Open or Active,Bayview Hunters Point,37.719298,-122.39002,88.0
221,2021/07/07 08:18:00 AM,2021/07/07,08:18,2021,Wednesday,Assault,Simple Assault,Battery,Cite or Arrest Adult,Mission,37.753837,-122.418594,53.0
236,2021/06/04 09:40:00 AM,2021/06/04,09:40,2021,Friday,Assault,Simple Assault,Battery,Cite or Arrest Adult,Tenderloin,37.785893,-122.419739,20.0
291,2021/08/16 02:05:00 PM,2021/08/16,14:05,2021,Monday,Missing Person,Missing Person,Found Person,Open or Active,Potrero Hill,37.760026,-122.396284,54.0


In [21]:
# Save cleaned_crime_df as a CSV in the data folder (index column omitted)
cleaned_data_path = "../data/sf_crime_data.csv"
cleaned_crime_df.to_csv(cleaned_data_path, index=False)

In [22]:
# First step towards splitting the records by year: count the incidents in each year
year_unique_counts = cleaned_crime_df["Incident Year"].value_counts()

# Reshape the tally into a two-column table: the year and its incident count
year_unique_counts_df = (
    year_unique_counts
    .rename_axis("Unique_Values")
    .reset_index(name="Counts")
)
year_unique_counts_df

Unnamed: 0,Unique_Values,Counts
0,2018,87225
1,2019,84347
2,2022,82478
3,2023,79050
4,2021,78053
5,2020,69827
6,2024,22435


In [23]:
# Draw an equally sized, reproducible random sample from every incident year.

# Set the seed value and the sample size
random_seed = 24      # the last two digits of the current year
sample_size = 1429    # 1,429 records per year x 7 years = 10,003 records total

# Group the incident data by Incident Year
grouped = cleaned_crime_df.groupby("Incident Year")

# Sample each year's group on its own, passing the same seed to every call.
# The samples are collected in a list instead of being appended to a growing
# DataFrame, which would re-copy all previously sampled rows on each iteration.
yearly_samples = [
    group_data.sample(n=sample_size, random_state=random_seed)
    for _, group_data in grouped
]

# Combine the yearly samples in one concat; ignore_index=True assigns a fresh
# 0..n-1 index to the result. With no groups at all, fall back to an empty
# DataFrame because pd.concat rejects an empty list.
if yearly_samples:
    sampled_data = pd.concat(yearly_samples, ignore_index=True)
else:
    sampled_data = pd.DataFrame()

sampled_data.head(10)

Unnamed: 0,Incident Datetime,Incident Date,Incident Time,Incident Year,Incident Day of Week,Incident Category,Incident Subcategory,Incident Description,Resolution,Analysis Neighborhood,Latitude,Longitude,Neighborhoods
0,2018/06/15 10:05:00 PM,2018/06/15,22:05,2018,Friday,Missing Person,Missing Person,Found Person,Open or Active,Sunset/Parkside,37.7389,-122.485074,40.0
1,2018/08/24 12:00:00 AM,2018/08/24,00:00,2018,Friday,Larceny Theft,Larceny - From Vehicle,"Theft, From Locked Vehicle, >$950",Open or Active,Noe Valley,37.742764,-122.428806,84.0
2,2018/08/27 08:10:00 AM,2018/08/27,08:10,2018,Monday,Assault,Aggravated Assault,"Assault, Aggravated, W/ Other Weapon",Open or Active,Visitacion Valley,37.711915,-122.416,75.0
3,2018/07/20 06:00:00 PM,2018/07/20,18:00,2018,Friday,Larceny Theft,Larceny Theft - Shoplifting,"Theft, Shoplifting, >$950",Open or Active,North Beach,37.805825,-122.41195,99.0
4,2018/03/27 07:00:00 AM,2018/03/27,07:00,2018,Tuesday,Larceny Theft,Larceny Theft - Bicycle,"Theft, Bicycle, >$950",Open or Active,South of Market,37.774516,-122.413769,32.0
5,2018/12/01 04:15:00 PM,2018/12/01,16:15,2018,Saturday,Burglary,Burglary - Other,"Burglary, Non-residential, Unlawful Entry",Open or Active,Mission Bay,37.769866,-122.402404,33.0
6,2018/03/15 04:00:00 PM,2018/03/15,16:00,2018,Thursday,Larceny Theft,Larceny Theft - Other,"Theft, From Person, $200-$950 (other than Pick...",Open or Active,Outer Mission,37.72224,-122.443793,80.0
7,2018/10/23 08:00:00 PM,2018/10/23,20:00,2018,Tuesday,Larceny Theft,Larceny Theft - Other,"Theft, Other Property, $50-$200",Open or Active,Financial District/South Beach,37.789264,-122.401375,108.0
8,2018/12/04 04:14:00 PM,2018/12/04,16:14,2018,Tuesday,Assault,Simple Assault,Battery,Open or Active,Outer Mission,37.724588,-122.43487,90.0
9,2018/07/26 02:24:00 PM,2018/07/26,14:24,2018,Thursday,Offences Against The Family And Children,Other,Violation of Restraining Order,Open or Active,Outer Richmond,37.777435,-122.50066,8.0


In [24]:
# The sampled data is what the map will use, so save it as a CSV (index column omitted)
sample_data_path = "../data/sample_data_by_year.csv"
sampled_data.to_csv(sample_data_path, index=False)