[Download The Data](https://www.kaggle.com/datasets/mchirico/philadelphiacrimedata)

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# load the raw crime incidents from the CSV file in the working directory
crime_csv = "crime.csv"
df = pd.read_csv(crime_csv)

In [3]:
# preview the first five raw incident rows
df.head(n=5)

Unnamed: 0,Dc_Dist,Psa,Dispatch_Date_Time,Dispatch_Date,Dispatch_Time,Hour,Dc_Key,Location_Block,UCR_General,Text_General_Code,Police_Districts,Month,Lon,Lat
0,18,3,2009-10-02 14:24:00,2009-10-02,14:24:00,14,200918067518,S 38TH ST / MARKETUT ST,800.0,Other Assaults,,2009-10,,
1,14,1,2009-05-10 00:55:00,2009-05-10,00:55:00,0,200914033994,8500 BLOCK MITCH,2600.0,All Other Offenses,,2009-05,,
2,25,J,2009-08-07 15:40:00,2009-08-07,15:40:00,15,200925083199,6TH CAMBRIA,800.0,Other Assaults,,2009-08,,
3,35,D,2009-07-19 01:09:00,2009-07-19,01:09:00,1,200935061008,5500 BLOCK N 5TH ST,1500.0,Weapon Violations,20.0,2009-07,-75.130477,40.036389
4,9,R,2009-06-25 00:14:00,2009-06-25,00:14:00,0,200909030511,1800 BLOCK WYLIE ST,2600.0,All Other Offenses,8.0,2009-06,-75.16635,39.969532


In [4]:
# parse both dispatch timestamp columns from text into datetime values
for timestamp_column in ("Dispatch_Date_Time", "Dispatch_Date"):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])

In [5]:
# order the incidents chronologically, using the district as tie-breaker,
# and renumber the rows to match the new order
sort_keys = ["Dispatch_Date_Time", "Dc_Dist"]
df = df.sort_values(by=sort_keys, ascending=True)
df = df.reset_index(drop=True)

In [6]:
# tally the rows recorded for every (day, district) pair;
# every row is treated as one crime, so the group size is the crime count
daily_counts = df.groupby(["Dispatch_Date", "Dc_Dist"]).size()
df = daily_counts.reset_index(name="Crimes")

In [7]:
# preview the daily crime counts per district
df.head(n=5)

Unnamed: 0,Dispatch_Date,Dc_Dist,Crimes
0,2006-01-01,1,14
1,2006-01-01,2,22
2,2006-01-01,3,37
3,2006-01-01,4,32
4,2006-01-01,5,12


In [8]:
# get the days and places when there was no crime
# build every (district, day) combination directly from the observed values;
# from_product keeps the native dtypes, so nothing is cast to strings and re-parsed
days = pd.unique(df["Dispatch_Date"])
districts = pd.unique(df["Dc_Dist"])
grid = pd.MultiIndex.from_product(
    [districts, days], names=["Dc_Dist", "Dispatch_Date"]
).to_frame(index=False)
# same column order as the merge keys: day first, then district
grid = grid[["Dispatch_Date", "Dc_Dist"]]

# join the crime counts onto the grid; combinations without any report come back as NaN
df = grid.merge(right=df, how="left", on=["Dispatch_Date", "Dc_Dist"])

# a missing count means no crime was recorded for that day and district
df["Crimes"] = df["Crimes"].fillna(0)

In [9]:
# derive calendar features from the dispatch date
# NOTE(review): Year and Week follow the ISO calendar while Month is the plain
# calendar month, so days around New Year can pair Month 1 with the previous
# ISO year -- confirm this mix is intended
iso_calendar = df["Dispatch_Date"].dt.isocalendar()
df["Year"] = iso_calendar.year
df["Month"] = df["Dispatch_Date"].dt.month
df["Week"] = iso_calendar.week

# grouping key that identifies one ISO week, e.g. "2006_4"
year_text = df["Year"].astype(str)
week_text = df["Week"].astype(str)
df["Year_Week"] = year_text + "_" + week_text

In [10]:
# collapse the daily rows into one row per week and district
weekly = df.groupby(["Year_Week", "Dc_Dist"]).agg(
    Crimes=("Crimes", "sum"),
    Year=("Year", "min"),
    Month=("Month", "min"),
    Week=("Week", "min"),
    # number of daily rows that contributed to the week
    Days=("Dispatch_Date", "count"),
)
df = weekly.reset_index()

In [11]:
# put the weekly rows in chronological order (district as tie-breaker),
# then discard the helper key that was only needed for grouping
df = (
    df.sort_values(by=["Year", "Week", "Dc_Dist"], ascending=True)
    .reset_index(drop=True)
    .drop(columns="Year_Week")
)

In [12]:
# preview the weekly crime counts per district
df.head(n=5)

Unnamed: 0,Dc_Dist,Crimes,Year,Month,Week,Days
0,1,14.0,2005,1,52,1
1,2,22.0,2005,1,52,1
2,3,37.0,2005,1,52,1
3,4,32.0,2005,1,52,1
4,5,12.0,2005,1,52,1


In [13]:
# for each district, add the crime counts of the previous N_LAGS weeks as features
N_LAGS = 4  # weeks of history (roughly one month)

lagged_frames = []
# sort=False visits the districts in order of first appearance
for dist, data in df.groupby("Dc_Dist", sort=False):
    data = data.reset_index(drop=True)
    for lag in range(1, N_LAGS + 1):
        data[f"Crimes(t-{lag})"] = data["Crimes"].shift(lag)
    # the first N_LAGS weeks lack a full history, so drop them; unlike tail() with a
    # negative count, iloc yields no rows for a district with fewer than N_LAGS weeks
    lagged_frames.append(data.iloc[N_LAGS:])

# concatenate once instead of re-copying a growing frame on every iteration
if lagged_frames:
    crimes = pd.concat(lagged_frames, axis="index", ignore_index=True)
else:
    crimes = pd.DataFrame()

In [14]:
# preview the weekly rows together with their lagged crime counts
crimes.head(n=5)

Unnamed: 0,Dc_Dist,Crimes,Year,Month,Week,Days,Crimes(t-1),Crimes(t-2),Crimes(t-3),Crimes(t-4)
0,1,97.0,2006,1,4,7,144.0,103.0,129.0,14.0
1,1,110.0,2006,1,5,7,97.0,144.0,103.0,129.0
2,1,77.0,2006,2,6,7,110.0,97.0,144.0,103.0
3,1,83.0,2006,2,7,7,77.0,110.0,97.0,144.0
4,1,110.0,2006,2,8,7,83.0,77.0,110.0,97.0


In [15]:
# replace the categorical columns with one binary indicator column per category value
categorical_columns = ["Dc_Dist", "Year", "Month", "Week", "Days"]
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded = encoder.fit_transform(crimes[categorical_columns])
categories = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())

# keep the remaining columns and append the indicator columns on the right
remaining = crimes.drop(columns=categorical_columns)
crimes = pd.concat([remaining, categories], axis="columns")

In [16]:
# write the finished feature table to disk, leaving out the row index
output_csv = "crime_rate.csv"
crimes.to_csv(output_csv, index=False)