In [1]:
!pip install scikit-learn-extra # For K_medoids
!pip install -U imbalanced-learn # For Oversampling and Undersampling

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
   ---------------------------------------- 0.0/238.4 kB ? eta -:--:--
   ---------- ----------------------------- 61.4/238.4 kB 1.7 MB/s eta 0:00:01
   ---------------------------------- ----- 204.8/238.4 kB 2.5 MB/s eta 0:00:01
   ---------------------------------------- 238.4/238.4 kB 2.1 MB/s eta 0:00:00
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.12.3
    Uninstalling imbalanced-learn-0.12.3:
      Successfully uninstalled imbalanced-learn-0.12.3
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3


## Import Libraries

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import pandas as pd
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
data = pd.read_csv("../data/preprocessed_data.csv")

In [3]:
data.info

<bound method DataFrame.info of         Category  DayOfWeek  PdDistrict         X         Y       Day  \
0             37  -0.503387           4 -0.107902  0.007832 -0.292682   
1             21  -0.503387           4 -0.107902  0.007832 -0.292682   
2             21  -0.503387           4 -0.057541  0.064335 -0.292682   
3             16  -0.503387           4 -0.144262  0.065338 -0.292682   
4             16  -0.503387           5 -0.531112  0.001140 -0.292682   
...          ...        ...         ...       ...       ...       ...   
878044        25  -1.517575           8 -1.199747 -0.124677 -1.089676   
878045        16  -1.517575           2 -0.815292 -0.085518 -1.089676   
878046        16  -1.517575           7  0.633404  0.020235 -1.089676   
878047        35  -1.517575           7  1.057042  0.020983 -1.089676   
878048        12  -1.517575           0  0.912272 -0.071808 -1.089676   

           Month     Year      Hour    Minute     Block  
0      -0.418933  1.73165  1.4638

# KMeans
#### KMeans is an unsupervised algorithm, so we cannot apply GridSearchCV directly 
#### since it requires labeled data (supervised evaluation).

In [5]:
def cluster_data(df, features=None, k_range=range(2, 11), max_samples=10000, random_state=42, verbose=True):
    
    
    #data = df[features].copy()
    
    # Sample Of Data
    sample_size = min(len(df), max_samples)
    samples = df.sample(n=sample_size, random_state=random_state)  
    
    # Search for best K
    scores = []
    labels_dict = {}
    
    # LOOP
    for k in k_range:
    
        kmedoids = KMedoids(n_clusters=k, random_state=random_state, metric='euclidean')
        labels = kmedoids.fit_predict(samples)
        labels_dict[k] = labels
        medoids_dict = {k: kmedoids}
        
        #silhouette score
        score = silhouette_score(samples, labels)
        scores.append(score)
    
    # Find optimal k (skip k=1 if it's in the range)
    best_k = k_range[scores.index(max(scores))]

    df_clustered = df.copy()

    # Add Cluster Column to Data
    df_clustered.loc[samples.index, 'Cluster'] = labels_dict[best_k] 
    
    # Get the best model
    best_kmedoids = KMedoids(n_clusters=best_k, random_state=random_state).fit(samples)
    
    if len(df_clustered) > sample_size:

        # Assign remaining points to nearest medoid center
        remaining_indices = df_clustered.index.difference(samples.index)
        remaining_data = df.loc[remaining_indices]

        # Predict clusters for remaining points
        remaining_labels = best_kmedoids.predict(remaining_data)
        df_clustered.loc[remaining_indices, 'Cluster'] = remaining_labels
    
    return df_clustered, best_k, scores

In [6]:
def analyze_clusters(df, cluster_col='Cluster'):

    features = df.select_dtypes(include='number').columns.drop(cluster_col)

    print(df[cluster_col].value_counts().sort_index())
    return df.groupby(cluster_col)[features].mean()

In [7]:
X = data.drop(columns=['Category'])  # Numerical Data
Y = data['Category']

df_clustered, best_k, scores = cluster_data(X)  


# Analyze cluster characteristics
cluster_means = analyze_clusters(df_clustered)
print(cluster_means)

df_clustered.info()

#0.0    185812
#1.0    188943
#2.0    143248
#3.0    360046


Cluster
0.0    185812
1.0    188943
2.0    143248
3.0    360046
Name: count, dtype: int64
         DayOfWeek  PdDistrict         X         Y       Day     Month  \
Cluster                                                                  
0.0      -0.068696    3.318650 -0.118703 -0.012461  0.106721 -0.456932   
1.0       0.123613    0.677501  0.611560 -0.014515  0.011297 -0.123174   
2.0      -0.041644    3.317819 -0.127199 -0.013012 -0.141354  0.805412   
3.0      -0.012848    7.452589 -0.209064  0.019225 -0.004766 -0.019990   

             Year      Hour    Minute     Block  
Cluster                                          
0.0      0.419003  0.184235  0.432122 -0.083949  
1.0     -0.155452  0.022396 -0.106612  0.021070  
2.0     -0.299146 -0.310279 -0.449927 -0.048416  
3.0     -0.015643  0.016615  0.011947  0.051530  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------ 

### RandomForestClassifier 

In [8]:
df_clustered["Category"]=data["Category"]

In [9]:
df_sampled = df_clustered.sample(50000, random_state=42)

# Separate features and target
X = df_sampled.drop(columns=['Category'])
y = df_sampled['Category']
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42, stratify=y)



In [10]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.2688

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.16      0.16      0.16       219
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.17      0.10      0.13       106
           5       0.50      0.09      0.15        11
           6       0.00      0.00      0.00         6
           7       0.35      0.41      0.37       155
           8       0.00      0.00      0.00        12
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00        30
          13       0.06      0.02      0.03        47
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         7
          16       0.32      0.64      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# SMOTE Techniuqe

In [3]:
from imblearn.over_sampling import SMOTE

In [2]:
def apply_smote(df, target_column='Category', random_state=42):

    # Separate features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]
    
    # Display original class distribution
    print("Original class distribution:")
    print(y.value_counts())
    print(f"Original shape: {X.shape}")
    
    # Apply SMOTE
    smote = SMOTE(random_state=random_state)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    
    # Convert back to DataFrame/Series to maintain column names
    X_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    y_resampled = pd.Series(y_resampled, name=target_column)
    
    # Display new class distribution
    print("\nBalanced class distribution after SMOTE:")
    print(y_resampled.value_counts())
    print(f"New shape: {X_resampled.shape}")
    
    return X_resampled, y_resampled
X_balanced, y_balanced = apply_smote(df_clustered, target_column='Category')

# If you want to get a complete balanced DataFrame
df = pd.concat([X_balanced, y_balanced], axis=1)

NameError: name 'df_clustered' is not defined

In [1]:
df_sampled = df.sample(50000, random_state=42)

# Separate features and target
X = df_sampled.drop(columns=['Category'])
y = df_sampled['Category']
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42, stratify=y)

NameError: name 'df' is not defined

In [21]:
X_train.columns

Index(['DayOfWeek', 'PdDistrict', 'X', 'Y', 'Day', 'Month', 'Year', 'Hour',
       'Minute', 'Block', 'Cluster'],
      dtype='object')

In [22]:
y_train.head

<bound method NDFrame.head of 1893628     6
2270057     8
3815608    19
4667528    25
2755976    11
           ..
6671538    38
5893134    32
866041     35
3245048    14
5747569    31
Name: Category, Length: 760000, dtype: int64>

In [23]:
len(y_train.value_counts())

39

In [None]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


# Sampel Size -> Accuracy
#   800,000   -> Accuracy: 0.707025

Accuracy: 0.707025

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.94      0.87      1024
           1       0.35      0.20      0.26      1037
           2       0.90      1.00      0.94      1029
           3       0.91      1.00      0.95      1027
           4       0.42      0.42      0.42      1025
           5       0.68      0.74      0.71      1029
           6       0.73      0.94      0.82      1023
           7       0.44      0.55      0.49      1025
           8       0.66      0.79      0.72      1028
           9       0.83      0.97      0.89      1032
          10       0.92      1.00      0.96      1024
          11       0.90      0.99      0.94      1024
          12       0.62      0.68      0.65      1016
          13       0.57      0.50      0.54      1023
          14       0.95      1.00      0.97      1034
          15       0.77      0.84      0.80      1025
          16       0.35      0.30    

## Great , Accuracy jumped from 25% to 70% after applying SMOTE! 

In [127]:
pip install fuzzy-c-means

Collecting fuzzy-c-means
  Downloading fuzzy_c_means-1.7.2-py3-none-any.whl.metadata (4.7 kB)
Collecting pydantic<3.0.0,>=2.6.4 (from fuzzy-c-means)
  Downloading pydantic-2.11.4-py3-none-any.whl.metadata (66 kB)
     ---------------------------------------- 0.0/66.6 kB ? eta -:--:--
     ------------------------------------ --- 61.4/66.6 kB 1.1 MB/s eta 0:00:01
     -------------------------------------- 66.6/66.6 kB 894.7 kB/s eta 0:00:00
Collecting tabulate<0.9.0,>=0.8.9 (from fuzzy-c-means)
  Downloading tabulate-0.8.10-py3-none-any.whl.metadata (25 kB)
Collecting typer<0.10.0,>=0.9.0 (from fuzzy-c-means)
  Downloading typer-0.9.4-py3-none-any.whl.metadata (14 kB)
Collecting pydantic-core==2.33.2 (from pydantic<3.0.0,>=2.6.4->fuzzy-c-means)
  Downloading pydantic_core-2.33.2-cp312-cp312-win_amd64.whl.metadata (6.9 kB)
Collecting typing-extensions>=4.12.2 (from pydantic<3.0.0,>=2.6.4->fuzzy-c-means)
  Using cached typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collectin

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.32.0 requires packaging<24,>=16.8, but you have packaging 24.1 which is incompatible.


In [25]:
from fcmeans import FCM


In [26]:
from skfuzzy import cmeans

In [27]:
def cluster_data(df, features=None, k_range=range(2, 11), max_samples=10000, verbose=True):
    
    # Sample of Data
    sample_size = min(len(df), max_samples)
    samples = df.sample(n=sample_size)  
    
    # Search for best K
    scores = []
    labels_dict = {}
    
    # Loop through k_range
    for k in k_range:
        
        # Fuzzy C-Means clustering
        cntr, u, u0, d, jm, p, fpc = cmeans(samples.values.T, k, 2, error=0.005, maxiter=1000, init=None)
        
        # Fuzzy labels (taking the highest membership value for each point)
        labels = np.argmax(u, axis=0)
        labels_dict[k] = labels
        
        # Silhouette score
        score = silhouette_score(samples, labels)
        scores.append(score)
    
    # Find optimal k
    best_k = k_range[scores.index(max(scores))]

    df_clustered = df.copy()

    # Add fuzzy cluster column to data
    df_clustered.loc[samples.index, 'fuzzy_cluster'] = labels_dict[best_k]
    
    # Get the best fuzzy model (for assigning the rest of the data)
    cntr_best, u_best, u0_best, d_best, jm_best, p_best, fpc_best = cmeans(samples.values.T, best_k, 2, error=0.005, maxiter=1000, init=None)
    
    if len(df_clustered) > sample_size:
        # Assign remaining points to nearest fuzzy cluster
        remaining_indices = df_clustered.index.difference(samples.index)
        remaining_data = df.loc[remaining_indices]
        
        # Predict fuzzy clusters for remaining points (based on the highest membership)
        remaining_u = np.argmax(cmeans(remaining_data.values.T, best_k, 2, error=0.005, maxiter=1000, init=None)[1], axis=0)
        df_clustered.loc[remaining_indices, 'fuzzy_cluster'] = remaining_u
    
    return df_clustered, best_k, scores

In [28]:
X = df.drop(columns=['Category'])  # Numerical Data
Y = df['Category']

df_clustered, best_k, scores = cluster_data(X)  

In [30]:
df_clustered.columns

Index(['DayOfWeek', 'PdDistrict', 'X', 'Y', 'Day', 'Month', 'Year', 'Hour',
       'Minute', 'Block', 'Cluster', 'fuzzy_cluster'],
      dtype='object')

In [31]:
Data=df_clustered
Data["Category"]=Y

In [32]:
Data.columns

Index(['DayOfWeek', 'PdDistrict', 'X', 'Y', 'Day', 'Month', 'Year', 'Hour',
       'Minute', 'Block', 'Cluster', 'fuzzy_cluster', 'Category'],
      dtype='object')

In [59]:
df_sampled_2 = Data.sample(500000, random_state=42)
y = df_sampled_2['Category']
X = df_sampled_2.drop(columns=['Category', 'Cluster'])

In [60]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42, stratify=y)

In [61]:
X_train.columns

Index(['DayOfWeek', 'PdDistrict', 'X', 'Y', 'Day', 'Month', 'Year', 'Hour',
       'Minute', 'Block', 'fuzzy_cluster'],
      dtype='object')

In [62]:
y_train.head

<bound method NDFrame.head of 2996139    13
2063646     7
5832339    32
3816104    19
6763797    38
           ..
3424363    15
2675466    11
6067171    33
4088968    21
2781776    11
Name: Category, Length: 475000, dtype: int64>

In [63]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Sampel Size -> Accuracy
#   10,000    -> Accuracy: 0.30
#   40,000    -> Accuracy: 0.39
#   80,000    -> Accuracy: 0.47
#   160,000   -> Accuracy: 0.53
#   300,000   -> Accuracy: 0.60
#   500,000   -> Accuracy: 0.65

Accuracy: 0.6552

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.92      0.83       640
           1       0.27      0.16      0.20       644
           2       0.84      0.99      0.91       639
           3       0.85      0.99      0.92       639
           4       0.37      0.36      0.36       638
           5       0.64      0.64      0.64       643
           6       0.65      0.90      0.75       644
           7       0.38      0.48      0.43       640
           8       0.60      0.70      0.64       646
           9       0.80      0.93      0.86       644
          10       0.87      1.00      0.93       638
          11       0.85      0.98      0.91       635
          12       0.56      0.55      0.55       638
          13       0.48      0.41      0.44       645
          14       0.93      1.00      0.96       646
          15       0.69      0.77      0.73       638
          16       0.30      0.26      