In [1]:
#******************************************
#** Data Bootcamp January 2024 cohort    **
#**                                      **
#** Project #3: San Francisco Crime Data **
#**                                      **
#** M. Bowman, B. Lee, S. Kikuchi, M. Dai**
#** J. Wang, J. Bein                     **
#**                                      **
#******************************************

#**********************************************
#** use this file to import the source data, **
#** filter and clean it.                     **
#**********************************************

#***************************
#** last edited 20MAY2024 **
#***************************

In [2]:
#*****************************
#** import the dependencies **
#*****************************

import pandas as pd               #this will be needed to manipulate dataframes
import os                         #this module used to create file paths across operating systems
import csv                        #this module is used for reading CSV files

In [3]:
#**********************
#** Proof of concept **
#**********************

#******************************************
#** read the source data into a variable **
#******************************************

# csvpath = os.path.join("PD_test_01.csv")    #a variable to hold the path to the CSV file

# with open(csvpath) as crime_data_csv:

#     # CSV reader specifies delimiter and variable that holds contents
#     csvreader = csv.reader(crime_data_csv, delimiter=',')

#     print(csvreader)

#     # Read the header row first (skip this step if there is no header)
#     csv_header = next(csvreader)
#     print(f"CSV Header: {csv_header}")

#     # Read each row of data after the header
#     # for row in csvreader:
#     #     print(row)

In [4]:
#***************************************************************
#** load the incident-report CSV into a Pandas DataFrame      **
#***************************************************************

# For a quick trial run, point source_filename at the small sample file
# "PD_test_01.csv" instead of the incident-report export below.
source_filename = "Police_Department_Incident_Reports.csv"

# csvpath is the path handed to pandas; the file is looked up relative to
# the notebook's working directory.
csvpath = os.path.join(source_filename)

# crime_df holds the data exactly as read from the CSV; later cells derive
# new DataFrames from it.
crime_df = pd.read_csv(csvpath)

# Show the first five rows as a plain-text preview.
print(crime_df.head(5))

        Incident Datetime Incident Date Incident Time  Incident Year  \
0  2023/03/13 11:41:00 PM    2023/03/13         23:41           2023   
1  2023/03/01 05:02:00 AM    2023/03/01         05:02           2023   
2  2023/03/13 01:16:00 PM    2023/03/13         13:16           2023   
3  2023/03/13 10:59:00 AM    2023/03/13         10:59           2023   
4  2023/03/14 06:44:00 PM    2023/03/14         18:44           2023   

  Incident Day of Week         Report Datetime        Row ID  Incident ID  \
0               Monday  2023/03/13 11:41:00 PM  125373607041      1253736   
1            Wednesday  2023/03/11 03:40:00 PM  125379506374      1253795   
2               Monday  2023/03/13 01:17:00 PM  125357107041      1253571   
3               Monday  2023/03/13 11:00:00 AM  125355107041      1253551   
4              Tuesday  2023/03/14 06:45:00 PM  125402407041      1254024   

   Incident Number  CAD Number  ... Longitude Point Neighborhoods  \
0        230167874         NaN  ...

In [5]:
#**************************************************
#** confirm the import by counting the rows      **
#**************************************************

num_rows = crime_df.shape[0]     # the first entry of .shape is the row count
print("Number of rows in the DataFrame:", num_rows)

Number of rows in the DataFrame: 850895


In [6]:
#*******************************************
#** list the datatype of every column     **
#*******************************************
column_dtypes = crime_df.dtypes     # one entry per column, indexed by column name
print(column_dtypes)

Incident Datetime                                        object
Incident Date                                            object
Incident Time                                            object
Incident Year                                             int64
Incident Day of Week                                     object
Report Datetime                                          object
Row ID                                                    int64
Incident ID                                               int64
Incident Number                                           int64
CAD Number                                              float64
Report Type Code                                         object
Report Type Description                                  object
Filed Online                                             object
Incident Code                                             int64
Incident Category                                        object
Incident Subcategory                    

This section describes the columns of data. Descriptions were obtained from
https://data.sfgov.org/Public-Safety/Police-Department-Incident-Reports-2018-to-Present/wg3w-h783/about_data

Incident Datetime                       The date and time when the incident occurred.

Incident Date                           The date the incident occurred.

Incident Time                           The time the incident occurred.

Incident Year                           The year the incident occurred, provided as a convenience for filtering.

Incident Day of Week                    The day of the week the incident occurred.

Report Datetime                         Distinct from Incident Datetime, Report Datetime is when the report was filed.

Row ID                                  A unique identifier for each row of data in the dataset.

Incident ID                             This is the system generated identifier for incident reports. Incident IDs and Incident Numbers both
                                        uniquely identify reports, but Incident Numbers are used when referencing cases and report documents.

Incident Number                         The number issued on the report, sometimes interchangeably referred to as the Case Number. This number is
                                        used to reference cases and report documents.
                                        
CAD Number                              The Computer Aided Dispatch (CAD) is the system used by the Department of Emergency Management (DEM) to
                                        dispatch officers and other public safety personnel. CAD Numbers are assigned by the DEM system and
                                        linked to relevant incident reports (Incident Number). Not all Incidents will have a CAD Number. Those
                                        filed online via Coplogic (refer to “Filed Online” field) and others not filed through the DEM system
                                        will not have CAD Numbers.
                                        
Report Type Code                        A system code for report types, these have corresponding descriptions within the dataset.

Report Type Description                 The description of the report type, can be one of: Initial; Initial Supplement; Vehicle Initial; 
                                        Vehicle Supplement; Coplogic Initial; Coplogic Supplement
                                        
Filed Online                            Non-emergency police reports can be filed online by members of the public using SFPD’s self-service
                                        reporting system called Coplogic. Values in this field will be “TRUE” if Coplogic was used to file the
                                        report. Please reference the link below for additional info: (http://sanfranciscopolice.org/reports).
                                        
Incident Code	                        Incident Codes are the system codes to describe a type of incident. A single incident report can have
                                        one or more incident types associated. In those cases you will see multiple rows representing a unique
                                        combination of the Incident ID and Incident Code.
                                        
Incident Category                       A category mapped on to the Incident Code used in statistics and reporting. Mappings provided by the
                                        Crime Analysis Unit of the Police Department.
                                        
Incident Subcategory                    A subcategory mapped to the Incident Code that is used for statistics and reporting. Mappings are
                                        provided by the Crime Analysis Unit of the Police Department.

Incident Description	                The description of the incident that corresponds with the Incident Code. These are generally 
                                        self-explanatory.

Resolution                              The resolution of the incident at the time of the report. Can be one of: • Cite or Arrest Adult • Cite or
                                        Arrest Juvenile* • Exceptional Adult • Exceptional Juvenile* • Open or Active • Unfounded Note: once a
                                        report is filed, the Resolution will not change. Status changes and/or updates must be provided using a
                                        Supplemental Report *Incidents identifying juvenile information are not included in this dataset.
                                        Please see the Juvenile Data section for more information.
                                
Intersection                            The 2 or more street names that intersect closest to the original incident separated by a backward slash 
                                        (\). Note, the possible intersections will only include those that satisfy the privacy controls.
                                     
CNN                                     The unique identifier of the intersection for reference back to other related basemap datasets. For more on
                                        the Centerline Node Network see 
                                        https://datasf.gitbook.io/draft-publishing-standards/standard-reference-data/basemap/street-centerlines-nodes

Police District	                        The Police District where the incident occurred. District boundaries can be reviewed in the link below.
                                        Please note this field is entered by officers and not based on the point.
                                        Reference here: https://data.sfgov.org/d/wkhw-cjsf

Analysis Neighborhood                   This field is used to identify the neighborhood where each incident occurs. Neighborhoods and boundaries
                                        are defined by the Department of Public Health and the Mayor's Office of Housing and Community Development.
                                        Please reference the link below for additional info: https://data.sfgov.org/d/p5b7-5n3h Please note this
                                        boundary is assigned based on the intersection, it may differ from the boundary the incident actually
                                        occurred within.

Supervisor District	                    Current Supervisor District: There are 11 members elected to the Board of Supervisors in San Francisco,
                                        each representing a geographic district. The Board of Supervisors is the legislative body for 
                                        San Francisco. The districts are numbered 1 through 11. Please reference the link below for additional
                                        info: https://data.sfgov.org/d/cqbw-m5m3 Please note this boundary is assigned based on the intersection,
                                        it may differ from the boundary the incident actually occurred within.

Supervisor District 2012                Previous 2012-2022 Supervisor District: There are 11 members elected to the Board of Supervisors in
                                        San Francisco, each representing a geographic district. The Board of Supervisors is the legislative body
                                        for San Francisco. The districts are numbered 1 through 11. Please reference the link below for additional
                                        info: https://data.sfgov.org/d/keex-zmn4 Please note this boundary is assigned based on the intersection,
                                        it may differ from the boundary the incident actually occurred within.

Latitude                                The latitude coordinate in WGS84, spatial reference is EPSG:4326

Longitude                               The longitude coordinate in WGS84, spatial reference is EPSG:4326

Point                                   Geolocation in OGC WKT format (e.g., POINT(37.4,-122.3))


The following fields were not defined on the website. TBD
Point                                                    
Neighborhoods                                           
ESNCAG - Boundary File                                  
Central Market/Tenderloin Boundary Polygon - Updated    
Civic Center Harm Reduction Project Boundary            
HSOC Zones as of 2018-06-05                             
Invest In Neighborhoods (IIN) Areas                     
Current Supervisor Districts                            
Current Police Districts                                


In [7]:
#*************************************************************
#** build a trimmed copy of the data without unused columns **
#*************************************************************

# Columns treated as unneeded for this project; every name must exist in
# crime_df or drop() raises a KeyError.
columns_to_drop = [
    'CAD Number',
    'Supervisor District',
    'Supervisor District 2012',
    'ESNCAG - Boundary File',
    'Central Market/Tenderloin Boundary Polygon - Updated',
    'Civic Center Harm Reduction Project Boundary',
    'HSOC Zones as of 2018-06-05',
    'Invest In Neighborhoods (IIN) Areas',
    'Current Supervisor Districts',
    'Current Police Districts',
]

# drop() returns a new DataFrame, so crime_df itself is left untouched.
crime_new_df = crime_df.drop(labels=columns_to_drop, axis="columns")

# Print the column/dtype listing before and after the drop for comparison.
for heading, frame in (("Original DataFrame:", crime_df),
                       ("\nNew DataFrame:", crime_new_df)):
    print(heading)
    print("-------------------")
    print(frame.dtypes)


Original DataFrame:
-------------------
Incident Datetime                                        object
Incident Date                                            object
Incident Time                                            object
Incident Year                                             int64
Incident Day of Week                                     object
Report Datetime                                          object
Row ID                                                    int64
Incident ID                                               int64
Incident Number                                           int64
CAD Number                                              float64
Report Type Code                                         object
Report Type Description                                  object
Filed Online                                             object
Incident Code                                             int64
Incident Category                                        object


In [8]:
#*************************************************
#** preview the first rows of the trimmed data  **
#*************************************************
crime_new_df.head()     # head() shows five rows by default

Unnamed: 0,Incident Datetime,Incident Date,Incident Time,Incident Year,Incident Day of Week,Report Datetime,Row ID,Incident ID,Incident Number,Report Type Code,...,Incident Description,Resolution,Intersection,CNN,Police District,Analysis Neighborhood,Latitude,Longitude,Point,Neighborhoods
0,2023/03/13 11:41:00 PM,2023/03/13,23:41,2023,Monday,2023/03/13 11:41:00 PM,125373607041,1253736,230167874,VS,...,"Vehicle, Recovered, Auto",Open or Active,,,Out of SF,,,,,
1,2023/03/01 05:02:00 AM,2023/03/01,05:02,2023,Wednesday,2023/03/11 03:40:00 PM,125379506374,1253795,236046151,II,...,"Theft, Other Property, >$950",Open or Active,,,Mission,,,,,
2,2023/03/13 01:16:00 PM,2023/03/13,13:16,2023,Monday,2023/03/13 01:17:00 PM,125357107041,1253571,220343896,VS,...,"Vehicle, Recovered, Auto",Open or Active,,,Out of SF,,,,,
3,2023/03/13 10:59:00 AM,2023/03/13,10:59,2023,Monday,2023/03/13 11:00:00 AM,125355107041,1253551,230174885,VS,...,"Vehicle, Recovered, Auto",Open or Active,,,Out of SF,,,,,
4,2023/03/14 06:44:00 PM,2023/03/14,18:44,2023,Tuesday,2023/03/14 06:45:00 PM,125402407041,1254024,230176728,VS,...,"Vehicle, Recovered, Auto",Open or Active,,,Out of SF,,,,,


In [30]:
#*********************************************************
#** count the incidents in each "Incident Category"     **
#*********************************************************

# value_counts() sorts the categories from most to least frequent and leaves
# out missing values by default.
unique_counts = crime_new_df["Incident Category"].value_counts()

# Reshape the counts into a two-column table: the category label in
# "Unique_Values" and the number of incidents in "Counts".
unique_incidents_df = (
    unique_counts
    .rename_axis("Unique_Values")
    .reset_index(name="Counts")
)

unique_incidents_df

Unnamed: 0,Unique_Values,Counts
0,Larceny Theft,256082
1,Other Miscellaneous,57782
2,Malicious Mischief,57493
3,Assault,52212
4,Non-Criminal,49720
5,Burglary,47194
6,Motor Vehicle Theft,46134
7,Recovered Vehicle,34420
8,Fraud,27712
9,Lost Property,24844


In [32]:
# Save the category counts next to the notebook; index=False leaves the
# row numbers out of the file.
unique_incidents_df.to_csv(path_or_buf="incident_categories.csv", index=False)

In [33]:
#*******************************************
#** separate the data into records by year**
#*******************************************

#*********************************************************
#** step one: tally how many incidents fall in each year **
#*********************************************************

# value_counts() orders the years by descending number of incidents.
year_unique_counts = crime_new_df["Incident Year"].value_counts()

# Two-column table: the year in "Unique_Values", its incident total in "Counts".
year_unique_counts_df = (
    year_unique_counts
    .rename_axis("Unique_Values")
    .reset_index(name="Counts")
)
year_unique_counts_df

Unnamed: 0,Unique_Values,Counts
0,2018,151551
1,2019,146670
2,2022,135504
3,2023,132411
4,2021,128252
5,2020,117420
6,2024,39087


In [37]:
#***************************************************
#** second, set the seed value and the sample size**
#***************************************************

random_seed = 24      # the last two digits of the current year
sample_size = 5000    # upper limit on the number of records drawn per year

# group the incident data by Incident Year
grouped = crime_new_df.groupby("Incident Year")

# Draw the sample for each year separately, re-using the same seed for every
# group so a re-run reproduces exactly the same rows. The pieces are collected
# in a list and concatenated once, rather than growing a DataFrame inside the
# loop (which re-copies all earlier rows on every iteration).
# n is capped at the group size so a year with fewer than sample_size
# records contributes all of its rows instead of raising a ValueError.
yearly_samples = [
    group_data.sample(n=min(sample_size, len(group_data)), random_state=random_seed)
    for _, group_data in grouped
]

if yearly_samples:
    # reset_index(drop=True) discards the original row labels and renumbers
    # the combined sample from 0.
    sampled_data = pd.concat(yearly_samples).reset_index(drop=True)
else:
    # No groups at all (empty input): pd.concat() rejects an empty list, so
    # fall back to an empty frame that still carries the column names.
    sampled_data = crime_new_df.iloc[:0].copy()

sampled_data.head(10)

Unnamed: 0,Incident Datetime,Incident Date,Incident Time,Incident Year,Incident Day of Week,Report Datetime,Row ID,Incident ID,Incident Number,Report Type Code,...,Incident Description,Resolution,Intersection,CNN,Police District,Analysis Neighborhood,Latitude,Longitude,Point,Neighborhoods
0,2018/01/08 08:00:00 AM,2018/01/08,08:00,2018,Monday,2018/01/14 09:38:00 AM,62372709320,623727,180012785,IS,...,"Access Card, incl. Credit, Phone, ATM, Fraudul...",Open or Active,CLEMENT ST \ 21ST AVE,27539000.0,Richmond,Outer Richmond,37.782212,-122.48059,POINT (-122.48059001673039 37.7822123116016),8.0
1,2018/09/23 10:00:00 PM,2018/09/23,22:00,2018,Sunday,2018/10/04 10:55:00 AM,72203806301,722038,180751438,II,...,"Theft, From Building, <$50",Open or Active,HOOKER ALY \ MASON ST,25068000.0,Central,Nob Hill,37.79039,-122.410523,POINT (-122.41052277434086 37.79039016118251),50.0
2,2018/04/07 02:30:00 PM,2018/04/07,14:30,2018,Saturday,2018/04/10 10:38:00 AM,65529271000,655292,180264259,II,...,Lost Property,Open or Active,31ST AVE \ NORIEGA ST,27676000.0,Taraval,Sunset/Parkside,37.753674,-122.489324,POINT (-122.48932371166421 37.75367419233453),39.0
3,2018/10/22 08:00:00 PM,2018/10/22,20:00,2018,Monday,2022/10/22 08:57:00 PM,120697774000,1206977,220728129,II,...,Missing Adult,Open or Active,EDDY ST \ PIERCE ST,26068000.0,Northern,Western Addition,37.781009,-122.435616,POINT (-122.43561626610261 37.781009344703506),97.0
4,2018/02/10 10:00:00 PM,2018/02/10,22:00,2018,Saturday,2018/02/11 02:20:00 PM,63443007021,634430,180112971,VI,...,"Vehicle, Stolen, Auto",Open or Active,JACKSON ST \ JONES ST,25262000.0,Central,Nob Hill,37.794988,-122.414826,POINT (-122.41482563329623 37.79498815505522),16.0
5,2018/12/18 10:56:00 AM,2018/12/18,10:56,2018,Tuesday,2018/12/18 11:03:00 AM,74942672000,749426,180950981,II,...,Found Property,Open or Active,STEVELOE PL \ JONES ST,24927000.0,Tenderloin,Tenderloin,37.785354,-122.412881,POINT (-122.41288124058862 37.785353822100895),20.0
6,2018/04/08 09:36:00 AM,2018/04/08,09:36,2018,Sunday,2018/04/08 09:36:00 AM,65460162050,654601,180259056,II,...,"Warrant Arrest, Enroute To Outside Jurisdiction",Cite or Arrest Adult,17TH ST \ HOFF ST,24176000.0,Mission,Mission,37.763376,-122.420433,POINT (-122.42043281853437 37.76337555115752),53.0
7,2018/09/01 10:00:00 PM,2018/09/01,22:00,2018,Saturday,2018/09/25 04:46:00 PM,72092206244,720922,186223081,II,...,"Theft, From Locked Vehicle, >$950",Open or Active,,,Richmond,,,,,
8,2018/11/25 05:15:00 PM,2018/11/25,17:15,2018,Sunday,2018/11/25 05:46:00 PM,74141727195,741417,180892327,II,...,Trespassing,Open or Active,LURMONT TER \ LEAVENWORTH ST,25519000.0,Central,Russian Hill,37.801707,-122.417877,POINT (-122.41787748900317 37.80170674982928),107.0
9,2018/05/18 06:30:00 PM,2018/05/18,18:30,2018,Friday,2018/05/19 09:48:00 AM,67035406244,670354,186112917,II,...,"Theft, From Locked Vehicle, >$950",Open or Active,STEINER ST \ POST ST,26614000.0,Northern,Japantown,37.784944,-122.43473,POINT (-122.4347299934904 37.784944005025956),103.0


In [38]:
# Save the per-year sample next to the notebook; index=False leaves the
# row numbers out of the file.
sampled_data.to_csv(path_or_buf="sample_data_by_year.csv", index=False)