In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import datetime

import geopandas as gpd

from sklearn.cluster import KMeans

In [None]:
# Load the two LAPD crime exports (2010-2019 and 2020-present) and the station list.
# "TIME OCC" is read as a string in both files rather than being parsed as a number.

# https://data.lacity.org/Public-Safety/Crime-Data-from-2010-to-2019/63jg-8b9z
past_df = pd.read_csv("Crime_Data_from_2010_to_2019.csv", dtype={"TIME OCC": str})
# https://data.lacity.org/Public-Safety/Crime-Data-from-2020-to-Present/2nrs-mtv8
present_df = pd.read_csv("Crime_Data_from_2020_to_Present.csv", dtype={"TIME OCC": str})

LAPD_df = pd.read_csv("LAPD_Police_Stations.csv")

# The past dataset spells this column with a trailing space ("AREA "); normalise it
# so the column lines up with the present dataset when the frames are stacked.
past_df = past_df.rename(columns={"AREA ": "AREA"})

# Both datasets use the same columns. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; without it each file's row labels restart at 0 and repeat,
# which breaks any later index-aligned operation on the combined frame.
df = pd.concat([past_df, present_df], ignore_index=True)

In [None]:
# Drop the numeric-code, status and report-date columns.
df = df.drop(columns=[
    "Crm Cd",
    "Crm Cd 1", "Crm Cd 2", "Crm Cd 3", "Crm Cd 4",
    "Premis Cd",
    "Weapon Used Cd",
    "Mocodes",
    "Part 1-2",
    "Status",
    "Date Rptd",
])

# Replace the one-letter victim-descent codes with readable descriptions.
# Values that are not listed here are left unchanged by replace().
df["Vict Descent"] = df["Vict Descent"].replace(dict(
    A="Other Asian",
    B="Black",
    C="Chinese",
    D="Cambodian",
    F="Filipino",
    G="Guamanian",
    H="Hispanic/Latin/Mexican",
    I="American Indian/Alaskan Native",
    J="Japanese",
    K="Korean",
    L="Laotian",
    O="Other",
    P="Pacific Islander",
    S="Samoan",
    U="Hawaiian",
    V="Vietnamese",
    W="White",
    X="Unknown",
    Z="Asian Indian",
))

In [None]:
# The station file stores coordinates as X / Y; give them descriptive names so the
# plotting cells can refer to Longitude / Latitude.
LAPD_df = LAPD_df.rename(columns=dict(X="Longitude", Y="Latitude"))

# Top 5 Crimes Per Area (EDA Inclusion)

We would also like to explore the top crimes in the city, to see whether there appear to be any patterns in what the crimes are and where they may be occurring.

To do this we made a list of the top 5 most common crimes in the city, and filtered our df_plot frame to only include crimes that fell into those crime descriptions.

In [None]:
# Keep only rows whose coordinates fall inside the latitude/longitude window used
# for the maps (both bounds inclusive).
in_map_window = df['LAT'].between(33.7, 34.37) & df['LON'].between(-118.7, -118.15)
df_plot = df.loc[in_map_window]

# Count the crime descriptions once and reuse the result for both containers
# (the counts were previously recomputed over the full frame for each one).
top_5_names = df_plot['Crm Cd Desc'].value_counts().head(5).index

# top_5 is a 0-indexed Series (looked up by position in label_change);
# top_5_list is a plain array (used with isin to filter the frame).
top_5 = pd.Series(top_5_names)
top_5_list = np.array(top_5_names)
top_5_list

In [None]:
# Restrict the mapped rows to the five most common crime descriptions.
is_top_crime = df_plot['Crm Cd Desc'].isin(top_5_list)
top_5_plot = df_plot[is_top_crime]
top_5_plot.head()

In order to color our map based on different crimes, we needed to change the label of the crimes from categorical to numeric.

We did this by creating the function label_change, which simply returns an integer based on which crime was committed. We then added a new column to our top_5_plot dataframe by applying label_change to our crime description column.

In [None]:
def label_change(string):
    """Translate a crime description into an integer label for colouring the map.

    Returns 1-4 when `string` equals the first, second, third or fourth entry of
    the module-level `top_5`, and 5 for every other value.
    """
    for position in range(4):
        if string == top_5[position]:
            return position + 1
    return 5

In [None]:
# Add the numeric crime label used to colour the map. assign() returns a new frame,
# so we do not write a column into the boolean-filtered slice of df_plot (which can
# raise SettingWithCopyWarning), and re-running the cell gives the same result.
top_5_plot = top_5_plot.assign(**{'Num Crime': top_5_plot['Crm Cd Desc'].apply(label_change)})

The resulting map plots the occurrences of the top 5 most common crimes in the dataset, with quite the color variation throughout the city.

In [None]:
# Division outlines from the LAPD shapefile form the base layer of the map.
map_df = gpd.read_file('LAPD_Divisions.shp')

fig, ax = plt.subplots(figsize=(20, 12))
map_df.boundary.plot(ax=ax, color='black')

# One small translucent dot per incident, coloured by its numeric crime label.
top_5_plot.plot.scatter(
    x='LON',
    y='LAT',
    c='Num Crime',
    s=0.1,
    alpha=0.5,
    cmap='jet',
    ax=ax,
)

We see from the plot that the most common crimes, Simple Assault/Battery and Stolen Vehicles, appear to occur in the South East and Northern parts of Los Angeles, notably more inland than the other crimes. Burglary seems to occur towards the coast and the outskirts of districts, on the more Western side of the city.

# Serious Crimes - include this in analysis

After using k-means to analyze the reach of police stations using the whole data set, we decided that a good approach would be to narrow our crimes. Looking at crimes involving gun violence and explosive devices, we believed that our k-means clusters could create more accurate centroids when looking at a more specific set of data.

In order to filter by weapon type, we created a new dataframe from the original data and dropped any null values that appeared in that particular column. Let's take a look at the unique values that remain in the frame.

In [None]:
# Start again from the two raw exports rather than the trimmed `df`.
# ignore_index=True gives the stacked frame unique 0..n-1 row labels; without it
# the labels from each file restart at 0 and are duplicated.
df_crime = pd.concat([past_df, present_df], ignore_index=True)

# Only rows that actually record a weapon description can be filtered by weapon type.
weapons = df_crime.dropna(subset=['Weapon Desc'])
weapons['Weapon Desc'].unique()

Now that we have a better idea of what weapons appear in the dataset, we can additionally filter by the particular firearms and explosive devices.

In [None]:
# Keep incidents whose weapon description mentions a firearm or explosive device.
# One regex alternation replaces the chain of str.contains calls; the separate
# 'ASSAULT RIFLE' test is gone because 'RIFLE' already matches those rows.
severe_keywords = ['RIFLE', 'PISTOL', 'BOMB', 'FIREARM', 'SHOTGUN', 'SEMIAUTOMATIC', 'EXPLOSIVE']
# na=False treats a missing description as "no match" instead of producing a NaN mask.
is_severe = weapons['Weapon Desc'].str.contains('|'.join(severe_keywords), na=False)

# Same latitude/longitude window as the rest of the maps (both bounds inclusive).
in_map_window = weapons['LAT'].between(33.7, 34.37) & weapons['LON'].between(-118.7, -118.15)

# .copy() makes `severe` an independent frame, so adding a column to it later does
# not raise SettingWithCopyWarning.
severe = weapons.loc[is_severe & in_map_window].copy()
severe.head()['Weapon Desc']

We can now begin our analysis using k means clustering on data that only contains our specified weapons.

In [None]:
# 21 clusters, matching the 21 police stations the centroids are compared against.
# random_state pins the centroid initialisation so the clusters (and every plot and
# count derived from them) are reproducible across kernel restarts.
kmeans = KMeans(n_clusters=21, random_state=42)
kmeans.fit(severe[["LON", "LAT"]])

We set up our map the same as before and consider the centroid locations when compared to the actual placement of police stations. We have kept the stations as blue points but now the centroids are red, and have noticeably moved from their previous locations.

In [None]:
# Centroids come back in the column order used for fitting: (LON, LAT).
centers_df = pd.DataFrame(kmeans.cluster_centers_, columns=['Longitude', 'Latitude'])

map_df = gpd.read_file('LAPD_Divisions.shp')

fig, ax = plt.subplots(1, figsize=(18, 8))
map_df.boundary.plot(color='black', ax=ax)

# Label each crime with its cluster. assign() builds a new frame instead of writing
# into a filtered slice, which avoids SettingWithCopyWarning and keeps the cell
# safe to re-run.
severe = severe.assign(cluster=kmeans.predict(severe[["LON", "LAT"]]))
severe.plot(ax=ax, x='LON', y='LAT', kind='scatter', c='cluster', s=0.1, alpha=0.5, cmap='viridis')

# Stations in blue, centroids in red. The two colours were previously swapped,
# contradicting both the accompanying text and the red/blue bar charts.
LAPD_df.plot(ax=ax, x='Longitude', y='Latitude', kind='scatter', s=10, c='blue', label='Police Stations')
centers_df.plot(ax=ax, x='Longitude', y='Latitude', kind='scatter', s=10, c='red', label='Centroids')

We can now create bar plots again to compare the number of severe crimes within a centroid vs. the current police stations. We observe that the plots are more similar than in our earlier analysis, which considered all crimes in Los Angeles.

In [None]:
# Severe-crime counts per k-means cluster (top) and per LAPD area (bottom).
# plt.subplots always creates a new figure with its own axes; plt.figure(1, ...)
# would silently reuse an already-open figure 1 and ignore the requested figsize.
fig, (ax_clusters, ax_stations) = plt.subplots(2, 1, figsize=(18, 10))

severe['cluster'].value_counts().plot(kind='barh', color='red', label='Centroids', ax=ax_clusters)
ax_clusters.legend()

severe['AREA NAME'].value_counts().plot(kind='barh', color='blue', label='Police Stations', ax=ax_stations)
ax_stations.legend()

Earlier we discovered that crime appeared to be somewhat evenly distributed among the 21 different police stations in Los Angeles, with our k means centroids creating a less balanced distribution. However, when considering crimes involving gun violence and explosives, our new centroids contain more even data than the actual police stations.


# End of Analysis - Stop Here