In [1]:
# Imports
import pandas as pd
import seaborn as sns


In [2]:
# Read data
crimes = pd.read_csv(
    "https://www1.nyc.gov/assets/nypd/downloads/excel/crime_statistics/cs-en-us-city.xlsx")
crimes.head(2)

ParserError: Error tokenizing data. C error: Expected 3 fields in line 21, saw 4


In [None]:
# Drop NaN columns
crimes = crimes[["Unnamed: 0", 2020, 2019, "% Chg"]]
crimes = crimes.dropna()

In [None]:
# Rename columns
crimes = crimes.rename(columns={"Unnamed: 0": "Crime"})

In [None]:
# Set index
crimes = crimes.set_index(crimes["Crime"])
crimes = crimes.drop("Crime", axis=1)

In [None]:
# Drop total
crimes = crimes.drop("TOTAL")

In [None]:
crimes.head(2)

## Bar Plot

In [None]:
import plotly.graph_objects as go

In [None]:
crimes_plot = crimes.reset_index()
graph = crimes_plot.drop(['% Chg'], axis=1)

In [None]:
fig = go.Figure([
    go.Bar(name='2019', x=graph['Crime'], y=crimes[2019]),
    go.Bar(name='2020', x=graph['Crime'], y=crimes[2020])
])

fig.update_layout(title='Crime in NYC (2019 vs 2020)',
                  xaxis=dict({'categoryorder': 'total ascending'},
                             title='Crime Committed'),
                  yaxis=dict(title='Number of Crimes'))

fig.show()

## Histogram

In [None]:
%matplotlib inline
import geopandas as gp
import numpy as np
from scipy import ndimage

import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
pylab.rcParams['figure.figsize'] = 8, 6

In [None]:
file = "../Data/geo_export_12ba89da-ef97-439e-b420-f1f076a1c0a4.shp"
nyc = gp.read_file(file)
nyc.head(2)

In [None]:
def heatmap(d, bins=(500,500), smoothing=1.3, cmap='jet'):
    def getx(pt):
        return pt.coords[0][0]

    def gety(pt):
        return pt.coords[0][1]

    x = list(d.geometry.apply(getx))
    y = list(d.geometry.apply(gety))
    heatmap, xedges, yedges = np.histogram2d(y, x, bins=bins)
    extent = [yedges[0], yedges[-1], xedges[-1], xedges[0]]

    logheatmap = np.log(heatmap)
    logheatmap[np.isneginf(logheatmap)] = 0
    logheatmap = ndimage.filters.gaussian_filter(logheatmap, smoothing, mode='nearest')
    
    plt.imshow(logheatmap, cmap=cmap, extent=extent)
    plt.colorbar()
    plt.gca().invert_yaxis()
    plt.show()

In [None]:
heatmap(nyc,
        bins=400)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

In [None]:
X = nyc.drop("Petite Larceny")
y = nyc["Petite Larceny"]

In [None]:
rfc = RandomForestClassifier()

params = {
    "n_estimators": [100, 1000, 1500],
    "criterion": ['gini', 'entropy'],
    "max_depth": [1, 5, 10],
    "max_features": ['auto', 'sqrt', 'log2'],
    "n_jobs": [-1],
    "random_state": [1]
}

In [None]:

grid = GridSearchCV(rfc, params).fit(X_train, y_train)
grid_df = pd.DataFrame(grid.cv_results_)

In [None]:
best_model = grid_df.loc[:"mean_test_score"].max()
best_model

In [None]:
best_rfc = RandomForestClassifier(
    n_estimators = best_model.param_n_estimators,
    criterion = best_model.param_criterion,
    max_depth = best_model.param_max_depth,
    max_features = best_model.param_max_features,
    n_jobs = best_model.param_n_jobs,
    random_state = best_model.param_random_state,
).fit(X_train, y_train)


In [None]:
cm = confusion_matrix(y, best_rfc.predict(X))

sns.heatmap(cm, annot =True, fmt = 'd');


In [None]:
print(classification_report(y, best_rfc.predict(X)))


