In [1]:
# Importation des bibliothèques
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# Dataset 🔥🔥

In [2]:
# Read the raw 911 call log from the CSV file under src/.
csv_path = "src/911.csv"
data = pd.read_csv(csv_path)
print("Dataset loaded successfully")

Dataset loaded successfully


In [3]:
# Display the (rows, columns) shape of the full dataset.
data.shape

(663522, 9)

In [4]:

# Draw a random sample of 10,000 rows from the full dataset; the fixed
# random_state makes the same rows come out on every run.
data_sample = data.sample(n=10000, random_state=42)
print("Sample loaded successfully.", data_sample.head(), sep="\n")

Sample loaded successfully.
              lat        lng  \
528196  40.169545 -75.249764   
537685  40.069832 -75.316295   
304176  40.154698 -75.139243   
474703  40.382529 -75.473491   
583079  40.091609 -75.138451   

                                                     desc      zip  \
528196  WISSAHICKON AVE & MAPLE AVE;  LOWER GWYNEDD; S...  19422.0   
537685  RAMP I76 EB TO I476  & SCHUYLKILL EXPY EB; WES...      NaN   
304176  WELSH RD & COMPUTER AVE; UPPER MORELAND; 2018-...  19090.0   
474703  BURGUNDY CIR & VINE DR;  UPPER HANOVER; Statio...  18073.0   
583079  E GLENSIDE AVE & CLIFF TER; CHELTENHAM; 2019-1...  19095.0   

                              title            timeStamp                twp  \
528196       EMS: CARDIAC EMERGENCY  2019-07-31 05:13:54      LOWER GWYNEDD   
537685  Traffic: VEHICLE ACCIDENT -  2019-08-23 13:32:45  WEST CONSHOHOCKEN   
304176             Fire: FIRE ALARM  2018-02-02 04:08:55     UPPER MORELAND   
474703             EMS: FALL VICTIM  2019-

In [5]:
# Display the (rows, columns) shape of the sample.
data_sample.shape

(10000, 9)

In [6]:
# First look at the sample with Plotly: one marker per call, colored by title.
fig = px.scatter_mapbox(
    data_frame=data_sample,
    lat="lat",
    lon="lng",
    color="title",
    title="Échantillon des données 911",
    mapbox_style="carto-positron",
)
# Drop the side and bottom margins; keep 30 px on top for the figure title.
fig.update_layout(margin=dict(r=0, t=30, l=0, b=0))
fig.show()

In [7]:
# Keep only the columns needed downstream: the coordinates and the call title.
selected_columns = ["lat", "lng", "title"]
data_sample = data_sample.loc[:, selected_columns]
print("Data subset with selected columns:")
print(data_sample.head())

Data subset with selected columns:
              lat        lng                        title
528196  40.169545 -75.249764       EMS: CARDIAC EMERGENCY
537685  40.069832 -75.316295  Traffic: VEHICLE ACCIDENT -
304176  40.154698 -75.139243             Fire: FIRE ALARM
474703  40.382529 -75.473491             EMS: FALL VICTIM
583079  40.091609 -75.138451  Traffic: DISABLED VEHICLE -


In [8]:
# Column groups for preprocessing: numeric coordinates and the categorical title.
numeric_features, categorical_features = ["lat", "lng"], ["title"]

In [9]:
# Per-column-group transformers: standardize the coordinates and one-hot encode
# the call title (first category dropped, dense output requested).
# NOTE(review): with drop="first" the dropped title is encoded as all zeros, so
# it sits closer to every other title than those titles sit to each other --
# confirm this is intended for a distance-based model.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse_output=False, drop="first")

# Combine both transformers into a single preprocessor.
column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Fit the preprocessor on the sample and build the feature matrix X
# (a NumPy array, sliced and printed below).
print("Preprocessing data sample...")
X = preprocessor.fit_transform(data_sample)
print("Preprocessing completed.")
print("First 5 rows of the processed data:")
print(X[:5])

Preprocessing data sample...
Preprocessing completed.
First 5 rows of the processed data:
[[ 0.0871089   0.35654627  0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   1.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0

In [10]:
# Density-based clustering of the preprocessed matrix X.
# eps is the neighborhood radius, measured here as a Manhattan distance over
# the columns of X; min_samples is the neighborhood size required for a point
# to count as a core point.
print("Clustering with DBSCAN...")
dbscan_params = {"eps": 0.2, "min_samples": 100, "metric": "manhattan"}
db = DBSCAN(**dbscan_params)
db.fit(X)

Clustering with DBSCAN...


In [11]:
# List the distinct labels assigned by DBSCAN (-1 marks noise points).
cluster_labels = db.labels_
unique_labels = np.unique(cluster_labels)
print(f"Unique cluster labels: {unique_labels}")

Unique cluster labels: [-1  0  1]


In [12]:
# Attach each row's DBSCAN label to the sample as a "cluster" column.
data_sample["cluster"] = db.labels_
print("Clusters added to the dataset:", data_sample.head(), sep="\n")

Clusters added to the dataset:
              lat        lng                        title  cluster
528196  40.169545 -75.249764       EMS: CARDIAC EMERGENCY       -1
537685  40.069832 -75.316295  Traffic: VEHICLE ACCIDENT -       -1
304176  40.154698 -75.139243             Fire: FIRE ALARM       -1
474703  40.382529 -75.473491             EMS: FALL VICTIM       -1
583079  40.091609 -75.138451  Traffic: DISABLED VEHICLE -       -1


In [13]:
# Map of the clustered points only (noise points, labeled -1, are excluded).
# The integer labels are cast to strings on a plotting copy so that Plotly
# Express treats them as categories and draws a discrete legend instead of a
# continuous color scale; data_sample itself is left untouched.
clustered_points = data_sample[data_sample["cluster"] != -1]
clustered_points = clustered_points.assign(
    cluster=clustered_points["cluster"].astype(str)
)
fig = px.scatter_mapbox(
    clustered_points,
    lat="lat",
    lon="lng",
    color="cluster",
    # Keep the legend in numeric label order rather than order of appearance.
    category_orders={
        "cluster": sorted(clustered_points["cluster"].unique(), key=int)
    },
    # Show the call title on hover so each cluster can be interpreted.
    hover_data=["title"],
    mapbox_style="carto-positron",
    title="Clusters détectés par DBSCAN",
)
fig.update_layout(margin={"r": 0, "t": 30, "l": 0, "b": 0})
fig.show()

In [14]:

# Alternative view: every sampled point, including the noise points (label -1).
# The integer labels are cast to strings on a plotting copy so that Plotly
# Express treats them as categories and draws a discrete legend instead of a
# continuous color scale; data_sample itself is left untouched.
labeled_points = data_sample.assign(cluster=data_sample["cluster"].astype(str))
fig = px.scatter_mapbox(
    labeled_points,
    lat="lat",
    lon="lng",
    color="cluster",
    # Keep the legend in numeric label order (noise, -1, comes first).
    category_orders={
        "cluster": sorted(labeled_points["cluster"].unique(), key=int)
    },
    # Show the call title on hover so each cluster can be interpreted.
    hover_data=["title"],
    mapbox_style="carto-positron",
    title="Clusters et points bruités détectés par DBSCAN",
)
fig.update_layout(margin={"r": 0, "t": 30, "l": 0, "b": 0})
fig.show()

The map shows the main topics to focus on and the main areas where these events occur. Therefore, these are the areas that policymakers should focus on.