## input: sample_df

In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing

In [24]:
# Sample input dataframe (the notebook's single input, see "## input: sample_df").
# The cells below read the columns 'DayOfWeek', 'Time', 'X' and 'Y' from it.
# NOTE(review): the assignment below is a template placeholder, not runnable
# code -- bind sample_df to a real DataFrame before executing this cell.
sample_df = # your input dataframe


# Collapse DayOfWeek into a binary flag: 'Friday'/'Saturday' -> 1, everything else -> 0.
is_fri_or_sat = sample_df['DayOfWeek'].isin(['Saturday', 'Friday'])
sample_df.loc[is_fri_or_sat, 'DayOfWeek'] = 1
# Whatever is not flagged as 1 at this point (the remaining day names) becomes 0.
# Values that are already 0/1 are left unchanged, so the cell can be re-run safely.
sample_df.loc[sample_df['DayOfWeek'] != 1, 'DayOfWeek'] = 0
sample_df

Unnamed: 0,DayOfWeek,Time,X,Y
0,0,20,-122.409530,37.765718
1,0,18,-122.495225,37.753412
2,0,21,-122.475647,37.728528
3,0,13,-122.412652,37.779513
4,0,13,-122.466205,37.772541
...,...,...,...,...
294577,0,23,-122.404360,37.788273
294578,0,17,-122.422536,37.778796
294579,0,1,-122.405863,37.798023
294580,1,20,-122.403305,37.792728


In [25]:
# Load the preprocessed records (first CSV column is the index) and renumber
# the rows 0..n-1.
df = pd.read_csv('processed_data.csv', index_col=0).reset_index(drop=True)
# Keep only the X/Y columns; these are what the clustering cells operate on.
geo_df = df.loc[:, ['X', 'Y']]
geo_df

Unnamed: 0,X,Y
0,-122.409530,37.765718
1,-122.495225,37.753412
2,-122.475647,37.728528
3,-122.412652,37.779513
4,-122.466205,37.772541
...,...,...
294577,-122.404360,37.788273
294578,-122.422536,37.778796
294579,-122.405863,37.798023
294580,-122.403305,37.792728


In [4]:
# Standardize the X/Y columns of geo_df (StandardScaler: per-column zero mean,
# unit variance) and wrap the resulting array back into a DataFrame.
from sklearn.preprocessing import StandardScaler
scaled_geo = StandardScaler().fit_transform(geo_df)
s_geo_df = pd.DataFrame(scaled_geo, columns=['X', 'Y'])
s_geo_df

Unnamed: 0,X,Y
0,0.539828,-0.194172
1,-2.765332,-0.726746
2,-2.010248,-1.803607
3,0.419417,0.402803
4,-1.646050,0.101059
...,...,...
294577,0.739242,0.781875
294578,0.038180,0.371762
294579,0.681249,1.203820
294580,0.779917,0.974665


In [26]:
# Final clustering model, trained on the standardized X/Y data (s_geo_df).
kmeans = KMeans(init='k-means++', n_clusters=6, random_state=1024)
kmeans.fit(s_geo_df)
# fit() returns the estimator itself, so `model` is the same fitted object as `kmeans`.
model = kmeans

In [27]:
# test_df: the X/Y columns of the dataframe received from the frontend.
test_df = sample_df[['X', 'Y']]
# Scale the incoming points with the statistics of the training data (geo_df),
# i.e. the same transform the KMeans model was fitted on. Fitting a fresh
# scaler on test_df instead would shift/rescale the points relative to the
# learned centroids, and for a single-row input would map every value to 0.
geo_scaler = StandardScaler().fit(geo_df)
s_test_df = pd.DataFrame(geo_scaler.transform(test_df), columns=['X', 'Y'])

In [28]:
# Predict with `kmeans` rather than `model`: after the training cell both names
# refer to the same fitted KMeans estimator, but `model` is rebound to the
# RandomForestRegressor further down, so re-running this cell after that point
# would call the wrong estimator.
cluster = kmeans.predict(s_test_df)
cluster

array([2, 5, 0, ..., 1, 1, 2])

In [29]:
# Attach the cluster labels to the input rows. `cluster` is a positional array,
# so the helper frame is given sample_df's index: concat(axis=1) aligns on
# index labels, and with a fresh 0..n-1 index any sample_df whose index is not
# 0..n-1 would come out misaligned and padded with NaN.
tmp_df = pd.DataFrame(cluster, columns=['cluster'], index=sample_df.index)
result_df = pd.concat([sample_df, tmp_df], axis=1)
result_df

Unnamed: 0,DayOfWeek,Time,X,Y,cluster
0,0,20,-122.409530,37.765718,2
1,0,18,-122.495225,37.753412,5
2,0,21,-122.475647,37.728528,0
3,0,13,-122.412652,37.779513,1
4,0,13,-122.466205,37.772541,5
...,...,...,...,...,...
294577,0,23,-122.404360,37.788273,1
294578,0,17,-122.422536,37.778796,1
294579,0,1,-122.405863,37.798023,1
294580,1,20,-122.403305,37.792728,1


In [38]:
# Feature frame passed to the regressor: the DayOfWeek flag, Time and cluster
# columns of result_df. Note that this rebinds test_df, which previously held
# the X/Y columns used for clustering.
test_df = result_df.loc[:, ['DayOfWeek', 'Time', 'cluster']]
test_df

Unnamed: 0,DayOfWeek,Time,cluster
0,0,20,2
1,0,18,5
2,0,21,0
3,0,13,1
4,0,13,5
...,...,...,...
294577,0,23,1
294578,0,17,1
294579,0,1,1
294580,1,20,1


In [34]:
# Training data for the regressor. clustered_data.csv is also required to run
# this notebook (it is in the GitHub main branch) and is read from the working
# directory. Only the four columns used below are kept.
df = pd.read_csv('clustered_data.csv', index_col=0)[['DayOfWeek', 'Time', 'crime_weight', 'cluster']]
# Store the cluster label as a string.
df['cluster'] = df['cluster'].astype(str)
# Same DayOfWeek encoding as for sample_df: 'Friday'/'Saturday' -> 1, anything else -> 0.
weekend_rows = df['DayOfWeek'].isin(['Saturday', 'Friday'])
df.loc[weekend_rows, 'DayOfWeek'] = 1
df.loc[df['DayOfWeek'] != 1, 'DayOfWeek'] = 0


Unnamed: 0,DayOfWeek,Time,crime_weight,cluster
0,0,20,13,2
1,0,18,12,5
2,0,21,12,0
3,0,13,11,1
4,0,13,12,5
...,...,...,...,...
294577,0,23,11,1
294578,0,17,13,1
294579,0,1,11,1
294580,1,20,17,1


In [36]:
# Split df into the regression target (crime_weight) and the feature frame
# (every remaining column). drop() returns a new frame, so df itself is untouched.
y_df = df['crime_weight']
X_df = df.drop(columns='crime_weight')

In [39]:
from sklearn.ensemble import RandomForestRegressor

# Fit the crime_weight regressor on the clustered training data, then score the
# frontend feature frame. Note: this rebinds `model`, which held the fitted
# KMeans estimator up to this point.
model = RandomForestRegressor(
    random_state=1024,
    min_samples_split=2,
    criterion='squared_error',
).fit(X_df, y_df)
y_test_pred = model.predict(test_df)

array([10.96797954, 10.75227171, 10.68136845, ..., 10.89091615,
       10.72662604, 10.59534703])

In [40]:
# Attach the predicted crime_weight to the result rows.
# - y_test_pred is a positional array, so the helper frame reuses result_df's
#   index; concat(axis=1) aligns on index labels and would otherwise misalign
#   rows (NaN padding) whenever the index is not 0..n-1.
# - A crime_weight column left over from a previous run of this cell is dropped
#   first, so re-running replaces the predictions instead of appending a
#   duplicate column.
tmp_df = pd.DataFrame(y_test_pred, columns=['crime_weight'], index=result_df.index)
result_df = pd.concat(
    [result_df.drop(columns='crime_weight', errors='ignore'), tmp_df],
    axis=1,
)
result_df

Unnamed: 0,DayOfWeek,Time,X,Y,cluster,crime_weight
0,0,20,-122.409530,37.765718,2,10.967980
1,0,18,-122.495225,37.753412,5,10.752272
2,0,21,-122.475647,37.728528,0,10.681368
3,0,13,-122.412652,37.779513,1,10.366853
4,0,13,-122.466205,37.772541,5,10.621858
...,...,...,...,...,...,...
294577,0,23,-122.404360,37.788273,1,10.426355
294578,0,17,-122.422536,37.778796,1,10.475791
294579,0,1,-122.405863,37.798023,1,10.890916
294580,1,20,-122.403305,37.792728,1,10.726626


## output: result_df