# Preparation of Incidents for Clustering

In [1]:
# Import required libraries
import os

import joblib
import pandas as pd
from dotenv import load_dotenv
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

# Load environment variables from the .env file (python-dotenv's default lookup).
load_dotenv()

# Project root; the data and model paths used in this notebook are joined onto it.
BASE_PATH = os.getenv('BASE_PATH')

# Fail fast with a clear message. BASE_PATH must be set AND be a directory:
# a plain existence check would also accept a regular file, which would only
# surface later as a confusing error when a sub-path is joined onto it.
if not BASE_PATH or not os.path.isdir(BASE_PATH):
    raise ValueError(f"Invalid BASE_PATH: {BASE_PATH}. Check your .env file and directory structure")

In [2]:
# Read the Silver-layer incidents table; `raw` is kept untouched as the reference copy.
silver_incidences_path = os.path.join(BASE_PATH, 'data/2_Silver/incidences.parquet')
raw = pd.read_parquet(silver_incidences_path)

In [3]:
# Work on an independent (deep) copy so later edits never touch `raw`.
df = raw.copy(deep=True)

In [4]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,COD_UPZ,UPZ,LOCALIDAD,FIGHTS_INCIDENTS,MINOR_COLLISIONS_INCIDENTS,ROBBERIES_INCIDENTS,THEFT_INCIDENTS,INTOXICATION_INCIDENTS,VANDALISM_INCIDENTS,SEXUAL_VIOLENCE_INCIDENTS,MAJOR_COLLISION_INCIDENTS
0,UPZ28,EL RINCON,SUBA,182839,43097,24861,11710,22261.0,8431.0,1569.0,24733.0
1,UPZ71,TIBABUYES,SUBA,179782,22284,17812,10626,15719.0,7469.0,1275.0,13205.0
2,UPZ84,BOSA OCCIDENTAL,BOSA,137817,35684,21991,8645,10533.0,6172.0,1106.0,23056.0
3,UPZ85,BOSA CENTRAL,BOSA,127394,30567,17713,7811,12213.0,5285.0,1269.0,16359.0
4,UPZ85,BOSA CENTRAL,BOSA,127394,30567,17713,7811,12213.0,5285.0,1269.0,16359.0


In [5]:
# Set the identifier columns aside (nothing is removed from `df` here) so they can be
# re-attached to the rows once clustering is done.
id_columns = ['COD_UPZ', 'UPZ', 'LOCALIDAD']
df_mapping = df.loc[:, id_columns]

In [6]:
# Keep only the incident-count columns by removing the identifier columns.
# The frame is rebuilt from `raw` instead of being dropped in place, so the cell is
# idempotent: re-running it no longer raises KeyError on already-removed columns,
# and the frame displayed by earlier cells is not mutated behind the reader's back.
df = raw.drop(columns=['COD_UPZ', 'UPZ', 'LOCALIDAD'])

In [7]:
# Preview the first five rows to confirm the identifier columns are gone.
df.head(n=5)

Unnamed: 0,FIGHTS_INCIDENTS,MINOR_COLLISIONS_INCIDENTS,ROBBERIES_INCIDENTS,THEFT_INCIDENTS,INTOXICATION_INCIDENTS,VANDALISM_INCIDENTS,SEXUAL_VIOLENCE_INCIDENTS,MAJOR_COLLISION_INCIDENTS
0,182839,43097,24861,11710,22261.0,8431.0,1569.0,24733.0
1,179782,22284,17812,10626,15719.0,7469.0,1275.0,13205.0
2,137817,35684,21991,8645,10533.0,6172.0,1106.0,23056.0
3,127394,30567,17713,7811,12213.0,5285.0,1269.0,16359.0
4,127394,30567,17713,7811,12213.0,5285.0,1269.0,16359.0


In [8]:
# Min-max scale every column (MinMaxScaler's default target range is [0, 1]) so that
# incident types with large raw counts do not dominate the others.
scaler = MinMaxScaler()

# fit_transform returns a bare NumPy array; rebuild the frame with the original column
# names AND the original index so scaled rows stay aligned with the rows of `df`.
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

In [9]:
# Preview the first five scaled rows.
df_scaled.head(n=5)

Unnamed: 0,FIGHTS_INCIDENTS,MINOR_COLLISIONS_INCIDENTS,ROBBERIES_INCIDENTS,THEFT_INCIDENTS,INTOXICATION_INCIDENTS,VANDALISM_INCIDENTS,SEXUAL_VIOLENCE_INCIDENTS,MAJOR_COLLISION_INCIDENTS
0,1.0,0.76486,1.0,1.0,1.0,1.0,1.0,0.600709
1,0.983275,0.395226,0.716258,0.907374,0.70607,0.885884,0.8125,0.32072
2,0.753679,0.633207,0.884474,0.738101,0.473065,0.732028,0.704719,0.559979
3,0.696654,0.54233,0.712273,0.666838,0.548547,0.626809,0.808673,0.397323
4,0.696654,0.54233,0.712273,0.666838,0.548547,0.626809,0.808673,0.397323


In [10]:
# Persist the whole fitted scaler object (not only its scale_ attribute) so it can be
# reloaded later with joblib.load.
scaler_path = os.path.join(BASE_PATH, 'models/incidences_scaler.pkl')
# The target folder may not exist on a first run; create it rather than failing in dump().
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
# The trailing ';' suppresses the returned file list so the absolute local path is not
# stored in the notebook output.
joblib.dump(scaler, scaler_path);

['/Users/onassisnottage/Desktop/SMART_CITY_AI/Bogota_Colombia/bogota-smart-city-ai-repo/models/incidences_scaler.pkl']

In [14]:
# Write the Gold-layer tables: the min-max scaled features and the matching raw counts.
gold_dir = os.path.join(BASE_PATH, 'data/3_Gold')
os.makedirs(gold_dir, exist_ok=True)  # first run may not have the folder yet

df_scaled.to_parquet(os.path.join(BASE_PATH, 'data/3_Gold/incidences_normalized.parquet'))

# NOTE(review): the recorded execution count shows this cell was last run out of document
# order, so `df` may already carry columns derived further down the notebook. Restricting
# the export to the scaled feature columns makes the file identical to what a top-to-bottom
# run produces, whatever the kernel state. Confirm no consumer expects extra columns here.
df[df_scaled.columns].to_parquet(os.path.join(BASE_PATH, 'data/3_Gold/incidences.parquet'))

In [11]:
# Feature matrix for clustering: the numeric columns of the scaled frame as a NumPy array.
numeric_scaled = df_scaled.select_dtypes(include=["number"])
X = numeric_scaled.to_numpy()

In [12]:
# Display the feature matrix (NumPy abbreviates large arrays with '...').
X

array([[1.00000000e+00, 7.64860497e-01, 1.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 6.00709203e-01],
       [9.83274792e-01, 3.95226171e-01, 7.16258101e-01, ...,
        8.85883749e-01, 8.12500000e-01, 3.20719889e-01],
       [7.53679327e-01, 6.33207239e-01, 8.84474500e-01, ...,
        7.32028470e-01, 7.04719388e-01, 5.59978627e-01],
       ...,
       [3.55622668e-04, 1.69605910e-02, 1.52960593e-03, ...,
        1.06761566e-03, 0.00000000e+00, 1.23867583e-03],
       [6.01822977e-05, 2.06013462e-03, 8.05055750e-04, ...,
        1.18623962e-04, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.13117374e-04, 0.00000000e+00, ...,
        0.00000000e+00, 6.37755102e-04, 0.00000000e+00]])

In [13]:
# Cluster the scaled feature matrix X with k-means.
N_CLUSTERS = 5    # number of groups to fit
RANDOM_STATE = 0  # fixed seed so centroid initialisation is repeatable

# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases; pin it explicitly if labels must be stable across versions.
# NOTE(review): the input preview shows identical rows (index 3 and 4, both UPZ85);
# every duplicate row counts as its own sample in the fit. Confirm this is intended.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE).fit(X)

# labels_ is 0-based; shift to 1-based cluster ids. assign() returns a new frame
# instead of mutating the existing one in place.
df = df.assign(cluster=kmeans.labels_ + 1)

In [14]:
# Preview the first five rows, now including the cluster id.
df.head(n=5)

Unnamed: 0,FIGHTS_INCIDENTS,MINOR_COLLISIONS_INCIDENTS,ROBBERIES_INCIDENTS,THEFT_INCIDENTS,INTOXICATION_INCIDENTS,VANDALISM_INCIDENTS,SEXUAL_VIOLENCE_INCIDENTS,MAJOR_COLLISION_INCIDENTS,cluster
0,182839,43097,24861,11710,22261.0,8431.0,1569.0,24733.0,2
1,179782,22284,17812,10626,15719.0,7469.0,1275.0,13205.0,2
2,137817,35684,21991,8645,10533.0,6172.0,1106.0,23056.0,2
3,127394,30567,17713,7811,12213.0,5285.0,1269.0,16359.0,2
4,127394,30567,17713,7811,12213.0,5285.0,1269.0,16359.0,2


In [15]:
# Row-wise total of the incident-count columns.
# 'cluster' is a label, not a count, so it is always excluded. A previously computed
# 'total_incidents' is excluded too (when present), so re-running this cell does not
# add the old total back into the new one.
incident_counts = df.drop(columns='cluster').drop(columns='total_incidents', errors='ignore')
df['total_incidents'] = incident_counts.sum(axis=1)

In [16]:
# Per-cluster mean of every column, ordered from the highest to the lowest
# mean total_incidents.
cluster_profile = df.groupby('cluster').mean()
cluster_profile.sort_values(by='total_incidents', ascending=False)

Unnamed: 0_level_0,FIGHTS_INCIDENTS,MINOR_COLLISIONS_INCIDENTS,ROBBERIES_INCIDENTS,THEFT_INCIDENTS,INTOXICATION_INCIDENTS,VANDALISM_INCIDENTS,SEXUAL_VIOLENCE_INCIDENTS,MAJOR_COLLISION_INCIDENTS,total_incidents
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,123641.708333,31171.173611,17748.361111,7758.263889,11932.416667,5187.527778,1234.423611,16623.847222,215297.722222
3,29905.624113,14650.255319,7889.074468,1748.27305,2068.989362,1659.219858,360.585106,17072.067376,75354.088652
5,15134.325758,15744.799242,5563.715909,1427.056818,1166.340909,1045.530303,447.477273,13923.905303,54453.151515
1,21441.265781,8198.202658,5919.122924,1507.093023,1926.990033,951.541528,338.415282,4170.813953,44453.445183
4,3151.478788,696.49697,293.151515,247.424242,343.048485,140.472727,32.812121,259.375758,5164.260606


In [17]:
# Relabel clusters with letters ranked by mean total_incidents: 'A' is the cluster with
# the lowest mean, the next letters follow in ascending order.
# The mapping is derived from the data instead of hard-coding cluster ids, because the
# numeric ids k-means assigns are arbitrary and can change between runs or library versions.
# The cell is also safe to re-run: already-lettered clusters rank the same way and map
# onto themselves instead of turning into NaN.
mean_total_by_cluster = df.groupby('cluster')['total_incidents'].mean().sort_values()
cluster_letters = {
    label: chr(ord('A') + rank)
    for rank, label in enumerate(mean_total_by_cluster.index)
}
df = df.assign(cluster=df['cluster'].map(cluster_letters))

In [18]:
# Same per-cluster summary as before, now indexed by the letter labels.
cluster_profile_lettered = df.groupby('cluster').mean()
cluster_profile_lettered.sort_values(by='total_incidents', ascending=False)

Unnamed: 0_level_0,FIGHTS_INCIDENTS,MINOR_COLLISIONS_INCIDENTS,ROBBERIES_INCIDENTS,THEFT_INCIDENTS,INTOXICATION_INCIDENTS,VANDALISM_INCIDENTS,SEXUAL_VIOLENCE_INCIDENTS,MAJOR_COLLISION_INCIDENTS,total_incidents
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
E,123641.708333,31171.173611,17748.361111,7758.263889,11932.416667,5187.527778,1234.423611,16623.847222,215297.722222
D,29905.624113,14650.255319,7889.074468,1748.27305,2068.989362,1659.219858,360.585106,17072.067376,75354.088652
C,15134.325758,15744.799242,5563.715909,1427.056818,1166.340909,1045.530303,447.477273,13923.905303,54453.151515
B,21441.265781,8198.202658,5919.122924,1507.093023,1926.990033,951.541528,338.415282,4170.813953,44453.445183
A,3151.478788,696.49697,293.151515,247.424242,343.048485,140.472727,32.812121,259.375758,5164.260606


In [19]:
# Preview the first five rows with the cluster label and total_incidents columns.
df.head(n=5)

Unnamed: 0,FIGHTS_INCIDENTS,MINOR_COLLISIONS_INCIDENTS,ROBBERIES_INCIDENTS,THEFT_INCIDENTS,INTOXICATION_INCIDENTS,VANDALISM_INCIDENTS,SEXUAL_VIOLENCE_INCIDENTS,MAJOR_COLLISION_INCIDENTS,cluster,total_incidents
0,182839,43097,24861,11710,22261.0,8431.0,1569.0,24733.0,E,319501.0
1,179782,22284,17812,10626,15719.0,7469.0,1275.0,13205.0,E,268172.0
2,137817,35684,21991,8645,10533.0,6172.0,1106.0,23056.0,E,245004.0
3,127394,30567,17713,7811,12213.0,5285.0,1269.0,16359.0,E,218611.0
4,127394,30567,17713,7811,12213.0,5285.0,1269.0,16359.0,E,218611.0


In [20]:
# Re-attach the identifier columns to the clustered data. Rows are matched on the
# index, and the left join keeps every row of df_mapping.
df_final = df_mapping.join(df, how='left')

In [21]:
# Preview the first five rows of the combined table.
df_final.head(n=5)

Unnamed: 0,COD_UPZ,UPZ,LOCALIDAD,FIGHTS_INCIDENTS,MINOR_COLLISIONS_INCIDENTS,ROBBERIES_INCIDENTS,THEFT_INCIDENTS,INTOXICATION_INCIDENTS,VANDALISM_INCIDENTS,SEXUAL_VIOLENCE_INCIDENTS,MAJOR_COLLISION_INCIDENTS,cluster,total_incidents
0,UPZ28,EL RINCON,SUBA,182839,43097,24861,11710,22261.0,8431.0,1569.0,24733.0,E,319501.0
1,UPZ71,TIBABUYES,SUBA,179782,22284,17812,10626,15719.0,7469.0,1275.0,13205.0,E,268172.0
2,UPZ84,BOSA OCCIDENTAL,BOSA,137817,35684,21991,8645,10533.0,6172.0,1106.0,23056.0,E,245004.0
3,UPZ85,BOSA CENTRAL,BOSA,127394,30567,17713,7811,12213.0,5285.0,1269.0,16359.0,E,218611.0
4,UPZ85,BOSA CENTRAL,BOSA,127394,30567,17713,7811,12213.0,5285.0,1269.0,16359.0,E,218611.0
