# Machine Learning Model to Predict Crime in Different Part

Using the publically available police dataset at https://data.torontopolice.on.ca/pages/major-crime-indicators

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [28]:
df = pd.read_csv('C:\ml\Major_Crime_Indicators_Open_Data.csv')

In [29]:
df.head()

Unnamed: 0,X,Y,OBJECTID,EVENT_UNIQUE_ID,REPORT_DATE,OCC_DATE,REPORT_YEAR,REPORT_MONTH,REPORT_DAY,REPORT_DOY,...,UCR_CODE,UCR_EXT,OFFENCE,MCI_CATEGORY,HOOD_158,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140,LONG_WGS84,LAT_WGS84
0,-79.425896,43.757346,1,GO-20141262074,2014/01/01 05:00:00+00,1998/06/01 04:00:00+00,2014,January,1,1,...,1480,110,Administering Noxious Thing,Assault,38,Lansing-Westgate,38,Lansing-Westgate (38),-79.425896,43.757346
1,-79.350229,43.646293,2,GO-20141260701,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,...,2120,200,B&E,Break and Enter,70,South Riverdale,70,South Riverdale (70),-79.350229,43.646293
2,-79.376497,43.666423,3,GO-20141260889,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,...,1430,100,Assault,Assault,74,North St.James Town,74,North St.James Town (74),-79.376497,43.666423
3,-85.488744,0.0,4,GO-20141260973,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,...,2130,210,Theft Over,Theft Over,NSA,NSA,NSA,NSA,-85.488744,0.0
4,-79.344839,43.678946,5,GO-20141261050,2014/01/01 05:00:00+00,2014/01/01 05:00:00+00,2014,January,1,1,...,1430,100,Assault,Assault,69,Blake-Jones,66,Danforth (66),-79.344839,43.678946


In [30]:
# Convert 'REPORT_MONTH' to numerical format
encoder = LabelEncoder()
df['REPORT_MONTH'] = encoder.fit_transform(df['REPORT_MONTH'])

features = df[['X', 'Y', 'REPORT_HOUR', 'REPORT_DAY', 'REPORT_MONTH', 'REPORT_YEAR']]

# Separate targets
target_UCR_CODE = df['UCR_CODE']
target_UCR_EXT = df['UCR_EXT']

In [31]:
# Split data into train and test sets for each target
X_train_code, X_test_code, y_train_code, y_test_code = train_test_split(features, target_UCR_CODE, test_size=0.2)
X_train_ext, X_test_ext, y_train_ext, y_test_ext = train_test_split(features, target_UCR_EXT, test_size=0.2)

# Train a RandomForest model for each target
model_UCR_CODE = RandomForestClassifier()
model_UCR_CODE.fit(X_train_code, y_train_code)

model_UCR_EXT = RandomForestClassifier()
model_UCR_EXT.fit(X_train_ext, y_train_ext)

In [32]:
# Evaluate the models
predictions_UCR_CODE = model_UCR_CODE.predict(X_test_code)
predictions_UCR_EXT = model_UCR_EXT.predict(X_test_ext)

print("Classification report for UCR_CODE:")
print(classification_report(y_test_code, predictions_UCR_CODE))

print("\nClassification report for UCR_EXT:")
print(classification_report(y_test_ext, predictions_UCR_EXT))

Classification report for UCR_CODE:
              precision    recall  f1-score   support

        1410       0.20      0.12      0.15       497
        1420       0.40      0.22      0.29      7384
        1430       0.46      0.73      0.57     23749
        1440       0.00      0.00      0.00         2
        1450       0.69      0.26      0.38       752
        1455       0.40      0.12      0.19        48
        1457       0.23      0.11      0.15       234
        1460       0.53      0.31      0.39      1114
        1461       0.49      0.43      0.46       115
        1462       0.50      0.44      0.47         9
        1470       0.56      0.23      0.32        22
        1475       0.00      0.00      0.00         1
        1480       0.41      0.19      0.26       727
        1610       0.64      0.35      0.45      6232
        1611       0.00      0.00      0.00         2
        2120       0.44      0.35      0.39     12422
        2121       1.00      0.25      0.40  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
import geopandas as gpd
import matplotlib.pyplot as plt

# Load the data with geographic coordinates (X, Y) and the predictions
# Create a copy of the test data with corresponding predictions
data = X_test_code.copy()
data['UCR_CODE'] = y_test_code
data['UCR_EXT'] = y_test_ext
data['Predicted_UCR_CODE'] = predictions_UCR_CODE
data['Predicted_UCR_EXT'] = predictions_UCR_EXT

# Create a GeoDataFrame from the data
geometry = gpd.points_from_xy(data['X'], data['Y'])
gdf = gpd.GeoDataFrame(data, geometry=geometry)

# Load the shapefile or any other spatial data that contains the boundaries
shapefile_path = "C:\ml\Major_Crime_Indicators_Open_Data.shp"
map_df = gpd.read_file(shapefile_path)

# Perform a spatial join to associate each crime point with its corresponding boundary
joined_df = gpd.sjoin(gdf, map_df, how='left', op='within')

# Group the data by the boundaries and calculate the count of crimes
grouped = joined_df.groupby(['HOOD_158', 'NEIGHBOURHOOD_158', 'geometry']).size().reset_index(name='CrimeCount')

# Merge the crime counts with the map data
merged = map_df.merge(grouped, on=['HOOD_158', 'NEIGHBOURHOOD_158', 'geometry'], how='left')

# Fill missing values with zero
merged['CrimeCount'].fillna(0, inplace=True)

# Plot the heatmap
fig, ax = plt.subplots(figsize=(12, 8))
merged.plot(column='CrimeCount', cmap='hot_r', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True)

# Set the title and axis labels
plt.title('Crime Heatmap')
plt.xlabel('Longitude')
plt.ylabel('Latitude')

# Show the plot
plt.show()


  if await self.run_code(code, result, async_=asy):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  joined_df = gpd.sjoin(gdf, map_df, how='left', op='within')


KeyError: 'NEIGHBOURHOOD_158'

In [35]:
import pandas as pd
import folium
from folium.plugins import HeatMap

# Load the data with geographic coordinates (X, Y)
data = X_test_code.copy()

# Create a folium map centered on Toronto
toronto_map = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# Filter out rows with missing latitude or longitude values
data = data.dropna(subset=['LAT_WGS84', 'LONG_WGS84'])

# Create a HeatMap layer using the latitude and longitude columns
heat_data = [[row['LAT_WGS84'], row['LONG_WGS84']] for index, row in data.iterrows()]
HeatMap(heat_data).add_to(toronto_map)

# Display the map
toronto_map


KeyError: ['LAT_WGS84', 'LONG_WGS84']