In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.preprocessing import StandardScaler
from kmodes.kmodes import KModes

In [2]:
df = pd.read_csv("../Data/cleanedSampleNoMidnight.csv",low_memory=False)

In [3]:
numerical_data = df[['Longitude', 'Latitude']]
categorical_data = df[['Day', 'Violation Code', 'Street',"Time Range"]]

In [4]:
scaler = StandardScaler()
numerical_data_scaled = scaler.fit_transform(numerical_data)

In [5]:
data_scaled = pd.concat([pd.DataFrame(numerical_data_scaled,columns=numerical_data.columns), categorical_data], axis=1)

In [6]:
# Elbow curve to find optimal K
cost = []
K = range(1,11)
for num_clusters in list(K):
    kmode = KModes(n_clusters=num_clusters, init = "random", n_init = 5, verbose=1)
    kmode.fit_predict(data_scaled)
    cost.append(kmode.cost_)
    
plt.plot(K, cost, 'bx-')
plt.xlabel('No. of clusters')
plt.ylabel('Cost')
plt.title('Elbow Method For Optimal k')
plt.show()

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 492999.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 492999.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 0, cost: 492999.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 0, cost: 492999.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 0, cost: 492999.0
Best run was number 1
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 3755, cost: 474418.0
Run 1, iteration: 2/100, moves: 57, cost: 474418.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 515, cost: 487559.0
Init: initia

In [None]:
kmodes = KModes(n_clusters = 9, init = "random", n_init = 5, verbose=1)
kmodes.fit_predict(data_scaled)

In [None]:
labels = kmodes.labels_
data_scaled['Cluster'] = labels
data_scaled = data_scaled.astype({"Violation Code":"int","Cluster":"int"})

In [None]:
palette = sns.color_palette('bright', np.unique(labels).max() + 1)
sns.scatterplot(x='Longitude', y='Latitude', hue='Cluster', data=data_scaled, palette=palette)

In [None]:
Q1 = data_scaled[['Longitude', 'Latitude']].quantile(0.25)
Q3 = data_scaled[['Longitude', 'Latitude']].quantile(0.75)
IQR = Q3 - Q1

# Filter out rows where either column is outside of the IQR range
df_filtered = data_scaled[~((data_scaled[['Longitude', 'Latitude']] < (Q1 - 1.5 * IQR)) | (data_scaled[['Longitude', 'Latitude']] > (Q3 + 1.5 * IQR))).any(axis=1)]


In [None]:
palette = sns.color_palette('bright', np.unique(labels).max() + 1)
sns.scatterplot(x='Longitude', y='Latitude', hue='Cluster', data=df_filtered, palette=palette)

In [None]:
cluster_modes = data_scaled.groupby('Cluster').apply(lambda x: x.mode().iloc[0])
print(cluster_modes)

In [None]:
cluster_modes.to_csv("../Results/K-Modes_Cluster_Info.csv",index=False)

In [None]:
noLocdf = df[['Day', 'Violation Code', 'Street',"Time Range"]]
noLocdf

In [None]:
# Elbow curve to find optimal K
cost = []
K = range(1,11)
for num_clusters in list(K):
    kmode = KModes(n_clusters=num_clusters, init = "random", n_init = 5, verbose=1)
    kmode.fit_predict(noLocdf)
    cost.append(kmode.cost_)
    
plt.plot(K, cost, 'bx-')
plt.xlabel('No. of clusters')
plt.ylabel('Cost')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
kmodes = KModes(n_clusters = 9, init = "random", n_init = 5, verbose=1)
kmodes.fit_predict(noLocdf)

In [None]:
labels2 = kmodes.labels_
noLocdf['Cluster'] = labels2
noLocdf = noLocdf.astype({"Violation Code":"int","Cluster":"int"})
cluster_modes2 = noLocdf.groupby('Cluster').apply(lambda x: x.mode().iloc[0])
cluster_modes2.to_csv("../Results/K-Modes_Cluster_Info2.csv",index=False)
print(cluster_modes2)

In [None]:
graphData = pd.concat([pd.DataFrame(numerical_data_scaled,columns=numerical_data.columns), noLocdf], axis=1)
palette = sns.color_palette('bright', np.unique(labels).max() + 1)
sns.scatterplot(x='Longitude', y='Latitude', hue='Cluster', data=graphData, palette=palette)

In [None]:
Q1 = graphData[['Longitude', 'Latitude']].quantile(0.25)
Q3 = graphData[['Longitude', 'Latitude']].quantile(0.75)
IQR = Q3 - Q1

# Filter out rows where either column is outside of the IQR range
df_filtered2 = graphData[~((graphData[['Longitude', 'Latitude']] < (Q1 - 1.5 * IQR)) | (graphData[['Longitude', 'Latitude']] > (Q3 + 1.5 * IQR))).any(axis=1)]


In [None]:
palette = sns.color_palette('bright', np.unique(labels).max() + 1)
sns.scatterplot(x='Longitude', y='Latitude', hue='Cluster', data=df_filtered2, palette=palette)