## Install Libraries


In [None]:
!pip install scikit-learn-extra # For K_medoids
!pip install -U imbalanced-learn # For Oversampling and Undersampling

## Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
data = pd.read_csv("../data/preprocessed_data.csv")

In [3]:
data.info

<bound method DataFrame.info of         Category  DayOfWeek  PdDistrict         X         Y       Day  \
0             37  -0.503387           4 -0.107902  0.007832 -0.292682   
1             21  -0.503387           4 -0.107902  0.007832 -0.292682   
2             21  -0.503387           4 -0.057541  0.064335 -0.292682   
3             16  -0.503387           4 -0.144262  0.065338 -0.292682   
4             16  -0.503387           5 -0.531112  0.001140 -0.292682   
...          ...        ...         ...       ...       ...       ...   
878044        25  -1.517575           8 -1.199747 -0.124677 -1.089676   
878045        16  -1.517575           2 -0.815292 -0.085518 -1.089676   
878046        16  -1.517575           7  0.633404  0.020235 -1.089676   
878047        35  -1.517575           7  1.057042  0.020983 -1.089676   
878048        12  -1.517575           0  0.912272 -0.071808 -1.089676   

           Month     Year      Hour    Minute     Block  
0      -0.418933  1.73165  1.4638

# KMeans
#### KMeans is an unsupervised algorithm, so we cannot apply GridSearchCV directly 
#### since it requires labeled data (supervised evaluation).

In [4]:
def cluster_data(df, features=None, k_range=range(2, 11), max_samples=10000, random_state=42, verbose=True):
    
    
    #data = df[features].copy()
    
    # Sample Of Data
    sample_size = min(len(df), max_samples)
    samples = df.sample(n=sample_size, random_state=random_state)  
    
    # Search for best K
    scores = []
    labels_dict = {}
    
    # LOOP
    for k in k_range:
    
        kmedoids = KMedoids(n_clusters=k, random_state=random_state, metric='euclidean')
        labels = kmedoids.fit_predict(samples)
        labels_dict[k] = labels
        medoids_dict = {k: kmedoids}
        
        #silhouette score
        score = silhouette_score(samples, labels)
        scores.append(score)
    
    # Find optimal k (skip k=1 if it's in the range)
    best_k = k_range[scores.index(max(scores))]

    df_clustered = df.copy()

    # Add Cluster Column to Data
    df_clustered.loc[samples.index, 'Cluster'] = labels_dict[best_k] 
    
    # Get the best model
    best_kmedoids = KMedoids(n_clusters=best_k, random_state=random_state).fit(samples)
    
    if len(df_clustered) > sample_size:

        # Assign remaining points to nearest medoid center
        remaining_indices = df_clustered.index.difference(samples.index)
        remaining_data = df.loc[remaining_indices]

        # Predict clusters for remaining points
        remaining_labels = best_kmedoids.predict(remaining_data)
        df_clustered.loc[remaining_indices, 'Cluster'] = remaining_labels
    
    return df_clustered, best_k, scores

In [5]:
def analyze_clusters(df, cluster_col='Cluster'):

    features = df.select_dtypes(include='number').columns.drop(cluster_col)

    print(df[cluster_col].value_counts().sort_index())
    return df.groupby(cluster_col)[features].mean()

In [6]:
X = data.drop(columns=['Category'])  # Numerical Data
Y = data['Category']

df_clustered, best_k, scores = cluster_data(X)  


# Analyze cluster characteristics
cluster_means = analyze_clusters(df_clustered)
print(cluster_means)

df_clustered.info()

#0.0    185812
#1.0    188943
#2.0    143248
#3.0    360046

Cluster
0.0    185812
1.0    188943
2.0    143248
3.0    360046
Name: count, dtype: int64
         DayOfWeek  PdDistrict         X         Y       Day     Month  \
Cluster                                                                  
0.0      -0.068696    3.318650 -0.118703 -0.012461  0.106721 -0.456932   
1.0       0.123613    0.677501  0.611560 -0.014515  0.011297 -0.123174   
2.0      -0.041644    3.317819 -0.127199 -0.013012 -0.141354  0.805412   
3.0      -0.012848    7.452589 -0.209064  0.019225 -0.004766 -0.019990   

             Year      Hour    Minute     Block  
Cluster                                          
0.0      0.419003  0.184235  0.432122 -0.083949  
1.0     -0.155452  0.022396 -0.106612  0.021070  
2.0     -0.299146 -0.310279 -0.449927 -0.048416  
3.0     -0.015643  0.016615  0.011947  0.051530  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------ 

In [8]:
df_clustered["Category"]=data["Category"]
df_clustered.columns

Index(['DayOfWeek', 'PdDistrict', 'X', 'Y', 'Day', 'Month', 'Year', 'Hour',
       'Minute', 'Block', 'Cluster', 'Category'],
      dtype='object')

In [9]:
df_clustered.to_csv("../data/Kmedoid_data.csv", index=False)