In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Load the crime records from the working directory. The file is decoded as
# ISO-8859-1 rather than pandas' default UTF-8.
df = pd.read_csv(
    'crime.csv',
    encoding='ISO-8859-1',
)

In [None]:
# Clean-up pipeline, written as one chain so the cell yields a single new frame:
#   1. parse DISTRICT_ID as a number; values that cannot be parsed become NaN,
#   2. drop every row that has a missing value in ANY column
#      (NOTE(review): dropna() has no `subset`, so nulls in unrelated columns
#      also remove rows — confirm that is intended),
#   3. cast the district id and both coordinates to float.
df = (
    df.assign(DISTRICT_ID=pd.to_numeric(df['DISTRICT_ID'], errors='coerce'))
      .dropna()
      .astype({'DISTRICT_ID': float, 'GEO_LAT': float, 'GEO_LON': float})
)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)


In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error


features = ['GEO_LAT', 'GEO_LON', 'DISTRICT_ID']
df2 = df[df['OFFENSE_CATEGORY_ID'] .isin( ['drug-alcohol', 'public-disorder'])][features]

X = df2[features[:2]].values
y = df2[features[-1]].values
y = np.reshape(y, (df2.shape[0], 1))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
df_pred = pd.DataFrame(np.column_stack((X_test, y_pred)), columns=['GEO_LAT', 'GEO_LON', 'DISTRICT_ID'])
df_test = pd.DataFrame(np.column_stack((X_test, y_test)), columns=['GEO_LAT', 'GEO_LON', 'DISTRICT_ID'])

In [None]:
# Not used by the plots in this cell; kept so that any other cell reading
# `cmap_pred` keeps working.
cmap_pred = sns.cubehelix_palette(dark=.9, light=.1, as_cmap=True)


def _scatter_by_district(data, ax, palette, title=None, **scatter_kwargs):
    """Draw one longitude/latitude scatter panel coloured by DISTRICT_ID.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns GEO_LON, GEO_LAT and DISTRICT_ID.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    palette : str
        Seaborn palette name used for the district hue.
    title : str or None
        Panel title; when None the Axes title is left untouched.
    **scatter_kwargs
        Extra keyword arguments forwarded to ``sns.scatterplot`` (e.g. ``alpha``).

    Returns
    -------
    matplotlib.axes.Axes
        The Axes that was drawn on.
    """
    sns.scatterplot(x='GEO_LON',
                    y='GEO_LAT',
                    hue='DISTRICT_ID',
                    legend='full',
                    palette=palette,
                    ax=ax,
                    data=data,
                    **scatter_kwargs)
    if title is not None:
        ax.set_title(title)
    return ax


fig, axes = plt.subplots(2, 2, figsize=(20, 10))

# Only three panels are drawn; hide the top-left Axes instead of leaving an
# empty frame in the figure.
axes[0, 0].set_visible(False)

# Top-right: predictions (Set2) and ground truth (Set1) overlaid, both
# semi-transparent so overlapping points remain visible.
_scatter_by_district(df_pred, axes[0, 1], 'Set2', alpha=0.5)
_scatter_by_district(df_test, axes[0, 1], 'Set1',
                     'Public Disorder Occurences by District (Prediction vs Test)',
                     alpha=0.5)

# Bottom row: each dataset on its own panel. The data passed to each panel
# matches the panel title (test on the left, prediction on the right), using
# the same palette each dataset has in the overlay.
_scatter_by_district(df_test, axes[1, 0], 'Set1',
                     'Public Disorder Occurences by District (Test)')
_scatter_by_district(df_pred, axes[1, 1], 'Set2',
                     'Public Disorder Occurences by District (Prediction)')

fig.tight_layout()
plt.show()