In [2]:
import pandas as pd
import numpy as np
import re

# 

In [20]:
# Load the raw major-crime export, using the OBJECTID column as the row index,
# then preview the first rows.
crime = pd.read_csv('data/major_crime.csv', index_col='OBJECTID')
crime.head()

Unnamed: 0_level_0,X,Y,EVENT_UNIQUE_ID,REPORT_DATE,OCC_DATE,REPORT_YEAR,REPORT_MONTH,REPORT_DAY,REPORT_DOY,REPORT_DOW,...,UCR_CODE,UCR_EXT,OFFENCE,MCI_CATEGORY,HOOD_158,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140,LONG_WGS84,LAT_WGS84
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-8809036.0,5431523.0,GO-20141260264,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,Wednesday,...,1430,100,Assault,Assault,143,West Rouge,131,Rouge (131),-79.132915,43.780413
2,-8814320.0,5435514.0,GO-20141260033,2014/01/01 05:00:00+00,2013/12/31 05:00:00+00,2014,January,1,1,Wednesday,...,1430,100,Assault,Assault,144,Morningside Heights,131,Rouge (131),-79.180387,43.806289
3,-8832825.0,5419631.0,GO-20141259834,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,Wednesday,...,1420,100,Assault With Weapon,Assault,55,Thorncliffe Park,55,Thorncliffe Park (55),-79.346615,43.703234
4,-8847292.0,5429042.0,GO-20141264084,2014/01/01 05:00:00+00,2013/12/31 05:00:00+00,2014,January,1,1,Wednesday,...,1420,110,Assault Bodily Harm,Assault,27,York University Heights,27,York University Heights (27),-79.476579,43.764317
5,6.32778e-09,5.664924e-09,GO-20141260577,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,Wednesday,...,1610,180,Robbery - Swarming,Robbery,NSA,NSA,NSA,NSA,0.0,0.0


In [21]:
# List every column in the loaded data before deciding which ones to drop
crime.columns

Index(['X', 'Y', 'EVENT_UNIQUE_ID', 'REPORT_DATE', 'OCC_DATE', 'REPORT_YEAR',
       'REPORT_MONTH', 'REPORT_DAY', 'REPORT_DOY', 'REPORT_DOW', 'REPORT_HOUR',
       'OCC_YEAR', 'OCC_MONTH', 'OCC_DAY', 'OCC_DOY', 'OCC_DOW', 'OCC_HOUR',
       'DIVISION', 'LOCATION_TYPE', 'PREMISES_TYPE', 'UCR_CODE', 'UCR_EXT',
       'OFFENCE', 'MCI_CATEGORY', 'HOOD_158', 'NEIGHBOURHOOD_158', 'HOOD_140',
       'NEIGHBOURHOOD_140', 'LONG_WGS84', 'LAT_WGS84'],
      dtype='object')

In [22]:
# OFFENCE holds a very specific description of each crime; only the broader
# MCI_CATEGORY is of interest. The REPORT_* columns record when a crime was
# reported, whereas we care about when it happened (the OCC_* columns).
# The HOOD_* identifier columns are removed as well.
crime = crime.drop(
    columns=[
        'REPORT_DATE',
        'REPORT_YEAR',
        'REPORT_MONTH',
        'REPORT_DAY',
        'REPORT_DOY',
        'REPORT_DOW',
        'REPORT_HOUR',
        'OFFENCE',
        'HOOD_158',
        'HOOD_140',
    ]
)

In [23]:
# Build a lookup table: one row per NEIGHBOURHOOD_158 value, paired with the
# NEIGHBOURHOOD_140 name it appears with in the crime data
neighbourhood_mapping = (
    crime
    .groupby('NEIGHBOURHOOD_158')['NEIGHBOURHOOD_140']
    .unique()
    .reset_index()
)

# Each group holds an array of the distinct NEIGHBOURHOOD_140 names seen for it;
# only the first entry is kept (assumes there is a single name per group)
first_names = [names[0] for names in neighbourhood_mapping['NEIGHBOURHOOD_140']]

new_neighbourhood_df = pd.DataFrame({
    'past_neighbourhood_158': neighbourhood_mapping['NEIGHBOURHOOD_158'],
    'neighbourhood': first_names,
})

new_neighbourhood_df

Unnamed: 0,past_neighbourhood_158,neighbourhood
0,Agincourt North,Agincourt North (129)
1,Agincourt South-Malvern West,Agincourt South-Malvern West (128)
2,Alderwood,Alderwood (20)
3,Annex,Annex (95)
4,Avondale,Willowdale East (51)
...,...,...
154,Yonge-Doris,Willowdale East (51)
155,Yonge-Eglinton,Yonge-Eglinton (100)
156,Yonge-St.Clair,Yonge-St.Clair (97)
157,York University Heights,York University Heights (27)


In [24]:
# Remove the number and brackets at the end of the neighbourhood name
def clean_string(text):
    """Strip a trailing bracketed number such as ``' (131)'`` from a name.

    Parameters
    ----------
    text : str
        Neighbourhood label, e.g. ``'Rouge (131)'``.

    Returns
    -------
    str
        ``text`` without the trailing ``(<digits>)`` suffix and any whitespace
        directly before it. Labels without such a suffix (e.g. ``'NSA'``) are
        returned unchanged. Non-string values (e.g. ``None`` or NaN standing in
        for a missing name) are returned as-is instead of raising ``TypeError``,
        so the function can be applied safely over a column with gaps.
    """
    # Missing values reach this function as None / float NaN when it is applied
    # over a pandas column; re.sub would raise on them.
    if not isinstance(text, str):
        return text

    # Only a bracketed number anchored at the very end is removed, so brackets
    # earlier in the name are left alone.
    return re.sub(r'\s*\(\d+\)$', '', text)

In [25]:
# Strip the trailing "(number)" suffix from every name in the lookup table
new_neighbourhood_df['neighbourhood'] = new_neighbourhood_df['neighbourhood'].map(clean_string)

In [26]:
# Apply the same clean-up to the neighbourhood names in the crime records
crime['NEIGHBOURHOOD_140'] = crime['NEIGHBOURHOOD_140'].map(clean_string)

In [27]:
# The NEIGHBOURHOOD_158 names have been captured in the lookup table,
# so the column is no longer needed on the crime records
crime = crime.drop(columns='NEIGHBOURHOOD_158')

In [28]:
# Check which columns remain after the report, offence and neighbourhood columns were dropped
crime.columns

Index(['X', 'Y', 'EVENT_UNIQUE_ID', 'OCC_DATE', 'OCC_YEAR', 'OCC_MONTH',
       'OCC_DAY', 'OCC_DOY', 'OCC_DOW', 'OCC_HOUR', 'DIVISION',
       'LOCATION_TYPE', 'PREMISES_TYPE', 'UCR_CODE', 'UCR_EXT', 'MCI_CATEGORY',
       'NEIGHBOURHOOD_140', 'LONG_WGS84', 'LAT_WGS84'],
      dtype='object')

In [29]:
# Parse the occurrence date strings into datetimes so they can be filtered and
# grouped easily. The trailing "+00" is matched as literal text by the format.
crime = crime.assign(
    OCC_DATE=pd.to_datetime(crime['OCC_DATE'], format='%Y/%m/%d %H:%M:%S+00')
)

In [30]:
# Keep only crimes that occurred from 01/01/2013 through 31/12/2023, inclusive.
start_date = pd.to_datetime('2013-01-01')
end_date = pd.to_datetime('2023-12-31')

# OCC_DATE carries a time of day (e.g. 05:00:00 in the preview of the raw data),
# while end_date is midnight at the START of 31/12/2023. Comparing with
# `<= end_date` would therefore drop every crime that occurred on the last day.
# Use an exclusive upper bound at the start of the following day instead.
end_exclusive = end_date + pd.Timedelta(days=1)

crime = crime[(crime['OCC_DATE'] >= start_date) & (crime['OCC_DATE'] < end_exclusive)]

In [31]:
# Order the records chronologically by occurrence date
crime = crime.sort_values('OCC_DATE')

In [33]:
# Split the crime records into one DataFrame per occurrence year
crime_by_date = crime.groupby(crime['OCC_DATE'].dt.year)

# Iterating a groupby yields (year, frame) pairs, which dict() collects directly.
# Look up a single year with yearly_crime_dfs[x], where x is 2013, 2014, ..., 2023
yearly_crime_dfs = dict(iter(crime_by_date))

In [34]:
# Years covered by the analysis: 2013 through 2023 inclusive
years = list(range(2013, 2024))

In [18]:
# For every year, build one record per month holding the number of crimes in
# each category. monthly_summaries maps year -> list of such record dicts.
monthly_summaries = {}
crime_types = ['Assault', 'Break and Enter', 'Theft Over', 'Robbery', 'Auto Theft']

# Iterate through each yearly DataFrame
for year, df_yearly in yearly_crime_dfs.items():
    # NOTE: this adds a MONTH column to the frame stored in yearly_crime_dfs
    # itself, so the column is also present wherever those frames are used later.
    df_yearly['MONTH'] = df_yearly['OCC_DATE'].dt.month

    # Group the yearly DataFrame by month and by crime category, then count the number of crimes
    monthly_summary_yearly = df_yearly.groupby(['MONTH', 'MCI_CATEGORY']).size().unstack(fill_value=0).reset_index()

    # Add a 'YEAR' column to the monthly summary DataFrame
    monthly_summary_yearly['YEAR'] = year

    # Select the output columns with reindex rather than plain indexing: if a
    # category never occurs in a given year, plain indexing raises KeyError,
    # whereas reindex creates the column filled with a count of 0.
    monthly_summary_yearly = monthly_summary_yearly.reindex(
        columns=['YEAR', 'MONTH'] + crime_types, fill_value=0
    )

    monthly_summaries[year] = monthly_summary_yearly.to_dict(orient='records')

In [106]:
# Number of crimes per neighbourhood, computed separately for every year
yearly_neighborhood_summaries = {}

for year, df_yearly in yearly_crime_dfs.items():
    # value_counts gives one count per neighbourhood; name the index and the
    # counts explicitly while turning the Series into a two-column frame
    neighborhood_summary = (
        df_yearly['NEIGHBOURHOOD_140']
        .value_counts()
        .rename_axis('NEIGHBOURHOOD_140')
        .reset_index(name='num_crimes')
    )

    yearly_neighborhood_summaries[year] = neighborhood_summary

In [108]:
# Write the per-year outputs to CSV: the full yearly records, the
# month-by-category summary, and the per-neighbourhood counts
for year in years:
    year_tag = str(year)

    all_crimes_frame = yearly_crime_dfs[year]
    all_crimes_frame.to_csv('data/crime/by_year/all_crimes' + year_tag + '.csv')

    # The monthly summary is stored as a list of records; rebuild a frame to save it
    monthly_frame = pd.DataFrame(monthly_summaries[year])
    monthly_frame.to_csv('data/crime/by_year_month/crime_summary' + year_tag + '.csv')

    neighbourhood_frame = yearly_neighborhood_summaries[year]
    neighbourhood_frame.to_csv('data/crime/by_neighbourhood/' + year_tag + 'crimes_per_neighbourhood.csv')

In [35]:
# Save the cleaned, filtered crime records and the neighbourhood lookup table
crime.to_csv(path_or_buf='data/crime/major_crimes.csv')
new_neighbourhood_df.to_csv(path_or_buf='data/neighbourhoods.csv')