<h2 align="center"> Data Mining and Machine Learning </h2>
<h3 align="center"> Final Project </h3>
<h2 align="center"> <b> <i> CrashSpot </i> </b> </h2>
<h4 align="center"> Lorenzo Ceccanti matr. 564490 </h4>

### <b> Geografical Clustering with KMeans</b>

In [14]:
k_kmeans = 46

In [15]:
import os
import pandas as pd
import numpy as np

df = pd.read_csv(os.path.join('../editedDataset', 'CLEANED_brasilEnglishAggr.csv'))

In [16]:
import plotly.express as px
import pandas as pd

def plot_map(data: pd.DataFrame, text: str, color_col: str, zoom_factor=3):
    if color_col == "labels":
        # Variabile numerica/continua
        cont_scale = [
            [0.0,  "gray"],
            [0.1,  "cyan"],
            [0.2,  "lightblue"],
            [0.3,  "blue"],
            [0.4,  "green"],
            [0.5,  "lightgreen"],
            [0.6,  "yellow"],
            [0.7,  "gold"],
            [0.8,  "orange"],
            [0.9,  "orangered"],
            [1.0,  "red"]
        ]
        fig = px.scatter_map(
            data,
            lat="latitude",
            lon="longitude",
            color=color_col,  # nome colonna
            color_continuous_scale=cont_scale,
            hover_data=["date", color_col]
        )
    else:
        # Variabile categorica (testuale)
        discrete_map = {
            "Without victims": "green",
            "With injured victims": "orange",
            "With dead victims": "red"
        }
        fig = px.scatter_map(
            data,
            lat="latitude",
            lon="longitude",
            color=color_col,     # nome colonna
            color_discrete_map=discrete_map,
            category_orders={color_col: ["Without victims", "With injured victims", "With dead victims"]},
            hover_data=["date", color_col]
        )

    fig.update_layout(
        mapbox_style="carto-positron",
        title=f"Map plot ({text})"
    )
    fig.show()


In [17]:
# The sklearn library for KMeans doesn't allow for change the metric
# used to compute the distance. For this reason, I'll rely on the library
# pyclustering, which allow to use user-defined distance metrics
import random
from pyclustering.utils.metric import distance_metric, type_metric
from haversine import haversine, Unit
from pyclustering.cluster.kmeans import kmeans
from sklearn import metrics

def random_init(coords_list, k, seed=42):
    random.Random(seed).shuffle(coords_list)
    return coords_list[:k]

# Takes the gpsCoords as a dataframe and returns a tuple (lat, lon)
# a must be the first point as a list [lat, lon]
# b must be the second point as a list [lat, lon]
def my_haversine(a, b):
    return haversine(tuple(a), tuple(b), unit=Unit.KILOMETERS)

# Returns an array of arrays of points
# Example
# [
#     [41.9028, 12.4964],   # Roma
#     [48.8566, 2.3522],    # Parigi
#     [51.5074, -0.1278],   # Londra
#     [40.7128, -74.0060],  # New York
#     [34.0522, -118.2437], # Los Angeles
#     [35.6895, 139.6917],  # Tokyo
# ]
def df_to_matrix(gpsCoords):
    all_latitude_list = gpsCoords.loc[:,'latitude'].values.tolist()
    all_longitude_list = gpsCoords.loc[:,'latitude'].values.tolist()
    coords_list = []
    for i in range(gpsCoords.shape[0]):
        temp_list = [all_latitude_list[i], all_longitude_list[i]]
        coords_list.append(temp_list)
    return coords_list

def runKMeans(k, gpsCoords):
    # The haversine library automatically converts the GPS coordinates in radiants
    haversine_metric = distance_metric(type_metric.USER_DEFINED, func=my_haversine)
    coords_list = df_to_matrix(gpsCoords)
   
    # We calculate initial centers using K-Means++ method.
    # The initialier method expects as first parameter the data in the same format we've described before
    # Second parameter is the number of centroids
    
    # doesn't work, dependency conflict
    # initial_centers = kmeans_plusplus_initializer(coords_list, k, metric=haversine_metric).initialize()
    initial_centers = random_init(coords_list[:], k)
   
    kmeans_instance = kmeans(coords_list, initial_centers, metric=haversine_metric)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()
    centers = kmeans_instance.get_centers()

    return clusters, centers

def append_labels_df(gpsCoords, clusters):
    gpsCoords['labels'] = pd.NA
    for index, list in enumerate(clusters):
        for elem in list:
            gpsCoords.loc[gpsCoords["index"] == elem, "labels"] = index
    return gpsCoords

def get_kMeans_metrics(df):
    labels = df['labels'].values.tolist()
    # Here, since we are again relying on sklearn metrics, we need to
    # reason in radiants

    data = df[['latitude', 'longitude']].copy()
    data_rad = np.radians(data).to_numpy()

    # Davies-Bouldin Index
    davies_bouldin_index = metrics.davies_bouldin_score(data_rad, labels)

    # Silouette Coefficent
    sil_coefficent = metrics.silhouette_score(data_rad, labels, metric='haversine')

    # Calinski-Harabasz index
    calinski_index = metrics.calinski_harabasz_score(data_rad, labels)

    return davies_bouldin_index, sil_coefficent, calinski_index


In [18]:
def launch(param, query, cause_to_analyze, k_kmeans):
    """ Launch the clustering proces
    Args:
        param: Could be City or State
        query: The name of the city/state
    """
    if param == "City":
        # We select a specific city
        df_selection = df.copy().query(f"city == '{query}'")
    if param == "State":
        # We select a specific state
        df_selection = df.copy().query(f"state == '{query}'")

    cause_mapping = {
        "Brake slam": ["Abrupt use of the car's brake"],
        "Minor traffic offense": ["Absence of sinalization",
                                "Disobedience to laws of transit by the pedestrian",
                                "car's on sidewalk"],
        "Traffic offense": ["Driver broke the laws of transit", "Irregular access",
                            "Lane change maneuver",
                            "Stopping at a prohibited place",
                            "The driver passed the next car improperly",
                            "Traffic with a motorcycle (or similar) between lanes",
                            "Acessing the road without seeing the presence of other vehicles"],
        "Major traffic offense": ["Disrespecting the intersection", 
                                "Driver changed the lane illegally",
                                "Driver disrespected the red traffic light",
                                "Driver was in the opposite direction",
                                "Driving on the breakdown lane",
                                "Prohibited conversion"],
        "Driver distraction": ["Driver using cellphone",
                            "Driver was sleeping",
                            "Driver's lack of reaction",
                            "Driver's lack of attention to conveyance"],
        "Road defect":  ["Inadequate sinalization of the road",
                        "Curvy road", "No breakdown lanes", "Other flaws/problems in the road",
                        "Poor ilumination (of the road)",
                        "Road's defect",
                        "Roads with holes without cement",
                        "Sinking or ondulation in the pavement",
                        "Slippery track",
                        "Uneven breakdown lane",
                        "Unlevel track",
                        "Urban area without appropriate pedestrian walking"],
        "Road condition": ["Accumulation of water on the road", "Fog",
                        "Natural phenomena",
                        "Obstacle in the road",
                        "Oil accumulation on the road",
                        "Rain",
                        "Road had lots of sand/wreckage",
                        "Road works (in maintenance)",
                        "Static object on the drainage gate",
                        "Visibility restriction"],
        "Alcohol": ["Alcohol and/or drug ingestion by the pedestrian", "Alcohol consumption",
                    "Alcohol ingestion by the driver"],
        "Drugs": ["Driver was using drugs"],
        "Driver behavior": ["External fight"],
        "Animals": ["Animals on the road"],
        "Veichle not human fault": ["Car's brake problem", 
                        "Car's suspension system with problems", 
                        "Deficiency of vehicle's sinalization/ilumination system",
                        "Electrical or mechanical flaws",
                        "Mechanical loss/defect of vehicle"],
        "Veichle human fault": ["Excessive load/cargo", "Excessive use of the car's tire"],
        "Driver health": ["Cardiac attack", "Driver had a cardiac attack"],
        "Safe distance": ["Disrespect of safe distance from the next car",
                        "Driver failed to keep distance from the vehicle in front"],
        "High speed": ["Incompatible velocity"],
        "Pedestrian involved": ["Pedestrian was crossing the road outside of the crosswalk",
                                "Pedestrian was walking in the road",
                                "Pedestrian's lack of attention",
                                "Unexpected pedestrian entry"]
    }

    # Since for Pandas it's more convenient to have the specific causes as key, we reverse the mapping of the dictionary
    reverse_mapping = {specific: general 
                       for general, specifics in cause_mapping.items() 
                       for specific in specifics}
    df_selection["general_cause_of_accident"] = df_selection["cause_of_accident"].map(reverse_mapping)
    
    for cause in cause_to_analyze:
        df_filtered = df_selection.query(f"general_cause_of_accident == '{cause}'").copy()
        gpsCoords = df_filtered[['latitude', 'longitude']].copy()
        gpsCoords["index"] = range(len(gpsCoords))

        clusters_, centers_ = runKMeans(k=k_kmeans, gpsCoords=gpsCoords)
        gpsCoords = append_labels_df(gpsCoords, clusters=clusters_)
        dbi_index, sil_index, cal_index = get_kMeans_metrics(gpsCoords)
        print("Davies Bouldin Index: ", dbi_index)
        print("Silhouette Coefficent: ", sil_index)
        print("Calinski-Harabasz Index: ", cal_index)

        df_joined = pd.merge(df_filtered, gpsCoords.loc[:,'labels'], left_index=True, right_index=True)
        df_filtered = df_joined.copy()
        data_to_plot = df_filtered[df_filtered['labels'] > -1].copy()
        plot_map(data = data_to_plot, text = f'{param}: {query} - KMeans', color_col='labels')  # outliers are excluded

In [19]:
cause_to_analyze = ["High speed"]
launch('City', 'BRASILIA', cause_to_analyze, k_kmeans)

Davies Bouldin Index:  5.773975827561263
Silhouette Coefficent:  0.09501505709625935
Calinski-Harabasz Index:  88.25904324178344


In [20]:
k_kmeans = 50
cause_to_analyze = ["High speed"]
launch('City', 'CURITIBA', cause_to_analyze, k_kmeans)

Davies Bouldin Index:  5.531844257838722
Silhouette Coefficent:  0.03854346533904551
Calinski-Harabasz Index:  21.292656849639272


In [21]:
k_kmeans = 170
cause_to_analyze = ["High speed"]
launch('State', 'SP', cause_to_analyze, k_kmeans)

Davies Bouldin Index:  12.620134698482483
Silhouette Coefficent:  0.08545901676539619
Calinski-Harabasz Index:  306.2936704376928


In [23]:
k_kmeans = 250
cause_to_analyze = ["High speed"]
launch('State', 'RJ', cause_to_analyze, k_kmeans)

Davies Bouldin Index:  26.686033413347328
Silhouette Coefficent:  -0.4322953805009742
Calinski-Harabasz Index:  48.70690687095577
