# Preparation of Incidents for Clustering

In [1]:
# Import required libraries
import pandas as pd
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()  # Loads from .env by default

# Base path of the project (taken from the BASE_PATH environment variable);
# the data paths used below are joined onto it.
BASE_PATH = os.getenv('BASE_PATH')

# Fail fast when BASE_PATH is unset or is not a directory.
# isdir (rather than exists) also rejects a path that names a regular file,
# which would otherwise only surface later as a confusing read/write error.
if not BASE_PATH or not os.path.isdir(BASE_PATH):
    raise ValueError(f"Invalid BASE_PATH: {BASE_PATH}. Check your .env file and directory structure")

In [2]:
# Load the incidences table from the 2_Silver data folder under BASE_PATH
raw = pd.read_parquet(
    os.path.join(BASE_PATH, 'data/2_Silver/incidences.parquet'),
)

In [3]:
# Work on a (deep) copy so the frame loaded from disk stays available as `raw`
df = raw.copy(deep=True)

In [4]:
# Preview the first five rows of the working frame
df.head(n=5)

Unnamed: 0,COD_UPZ,UPZ,LOCALIDAD,FIGHTS_INCIDENTS,MINOR_COLLISIONS_INCIDENTS,ROBBERIES_INCIDENTS,THEFT_INCIDENTS,INTOXICATION_INCIDENTS,VANDALISM_INCIDENTS,SEXUAL_VIOLENCE_INCIDENTS,MAJOR_COLLISION_INCIDENTS
0,UPZ28,EL RINCON,SUBA,182839,43097,24861,11710,22261.0,8431.0,1569.0,24733.0
1,UPZ71,TIBABUYES,SUBA,179782,22284,17812,10626,15719.0,7469.0,1275.0,13205.0
2,UPZ84,BOSA OCCIDENTAL,BOSA,137817,35684,21991,8645,10533.0,6172.0,1106.0,23056.0
3,UPZ85,BOSA CENTRAL,BOSA,127394,30567,17713,7811,12213.0,5285.0,1269.0,16359.0
4,UPZ85,BOSA CENTRAL,BOSA,127394,30567,17713,7811,12213.0,5285.0,1269.0,16359.0


In [7]:
# Drop the identifier columns (COD_UPZ, UPZ, LOCALIDAD) so only the incident
# count columns remain. The result is derived from `raw` with a non-mutating
# drop: this keeps the cell idempotent, whereas an in-place drop on `df`
# raises KeyError on a second run because the columns are already gone.
df = raw.drop(columns=['COD_UPZ', 'UPZ', 'LOCALIDAD'])

In [8]:
# Preview the first five rows after removing the identifier columns
df.head(n=5)

Unnamed: 0,FIGHTS_INCIDENTS,MINOR_COLLISIONS_INCIDENTS,ROBBERIES_INCIDENTS,THEFT_INCIDENTS,INTOXICATION_INCIDENTS,VANDALISM_INCIDENTS,SEXUAL_VIOLENCE_INCIDENTS,MAJOR_COLLISION_INCIDENTS
0,182839,43097,24861,11710,22261.0,8431.0,1569.0,24733.0
1,179782,22284,17812,10626,15719.0,7469.0,1275.0,13205.0
2,137817,35684,21991,8645,10533.0,6172.0,1106.0,23056.0
3,127394,30567,17713,7811,12213.0,5285.0,1269.0,16359.0
4,127394,30567,17713,7811,12213.0,5285.0,1269.0,16359.0


In [14]:
# Normalize every column to the [0, 1] range with min-max scaling
# NOTE(review): consider moving this import to the import cell at the top.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# fit_transform returns a bare NumPy array, so the frame is rebuilt with both
# the original column labels and the original row index. Without `index=` the
# result silently gets a fresh 0..n-1 index and can no longer be aligned with
# `df`/`raw` whenever their index is not the default one.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

In [15]:
# Preview the first five rows of the normalized frame
df_scaled.head(n=5)

Unnamed: 0,FIGHTS_INCIDENTS,MINOR_COLLISIONS_INCIDENTS,ROBBERIES_INCIDENTS,THEFT_INCIDENTS,INTOXICATION_INCIDENTS,VANDALISM_INCIDENTS,SEXUAL_VIOLENCE_INCIDENTS,MAJOR_COLLISION_INCIDENTS
0,1.0,0.76486,1.0,1.0,1.0,1.0,1.0,0.600709
1,0.983275,0.395226,0.716258,0.907374,0.70607,0.885884,0.8125,0.32072
2,0.753679,0.633207,0.884474,0.738101,0.473065,0.732028,0.704719,0.559979
3,0.696654,0.54233,0.712273,0.666838,0.548547,0.626809,0.808673,0.397323
4,0.696654,0.54233,0.712273,0.666838,0.548547,0.626809,0.808673,0.397323


In [16]:
# Save the normalized data. The target folder is created first (no-op when it
# already exists) so the write does not fail on a checkout where the 3_Gold
# directory has not been created yet.
normalized_path = os.path.join(BASE_PATH, 'data/3_Gold/incidences_normalized.parquet')
os.makedirs(os.path.dirname(normalized_path), exist_ok=True)
df_scaled.to_parquet(normalized_path)

In [17]:
# Feature matrix: the numeric columns of the scaled frame as a NumPy array
X = df_scaled.select_dtypes(include=["number"]).to_numpy()

In [18]:
# Inspect the feature matrix (NumPy abbreviates the printed array with "...")
X

array([[1.00000000e+00, 7.64860497e-01, 1.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 6.00709203e-01],
       [9.83274792e-01, 3.95226171e-01, 7.16258101e-01, ...,
        8.85883749e-01, 8.12500000e-01, 3.20719889e-01],
       [7.53679327e-01, 6.33207239e-01, 8.84474500e-01, ...,
        7.32028470e-01, 7.04719388e-01, 5.59978627e-01],
       ...,
       [3.55622668e-04, 1.69605910e-02, 1.52960593e-03, ...,
        1.06761566e-03, 0.00000000e+00, 1.23867583e-03],
       [6.01822977e-05, 2.06013462e-03, 8.05055750e-04, ...,
        1.18623962e-04, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.13117374e-04, 0.00000000e+00, ...,
        0.00000000e+00, 6.37755102e-04, 0.00000000e+00]])