# MASTER - Notebook 4
### Matteo Grazioso 884055

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
from datetime import datetime
import json
import os
import folium
import warnings
warnings.filterwarnings('ignore')

import myfunctions as mf # Custom functions

In [2]:
# Disply all columns and all rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
path = 'data/raw/2-esportazioneCompleta.txt'
df = pd.read_csv(path, header=0, sep='\t')

# Save the name of the file in a variable for future use extracting the name of the file from the path
file_name = path.split('_')[-1].split('/')[2]

# Display the first 5 rows of the dataframe
df.head()

Unnamed: 0,DATA_VALIDAZIONE,SERIALE,FERMATA,DESCRIZIONE,TITOLO,DESCRIZIONE_TITOLO
0,13/01/2023 00:00,40834866809772548,162,Stazione MES,12101,Bigl.Aut.75'Mestre/Lido-tsc
1,13/01/2023 00:00,42242241686217732,3625,Aeroporto MA,12106,Bigl Aer-Venezia TSC
2,13/01/2023 00:00,42242241686217476,3625,Aeroporto MA,12106,Bigl Aer-Venezia TSC
3,13/01/2023 00:00,-3604990320,5049,"Zattere ""B""",23301,Mens.Studente Rete Unica
4,13/01/2023 00:00,-2824230951,5043,"S. Toma' ""B""",23303,Abb stud. ReteUnica 12 mesi


In [4]:
file_name

'2-esportazioneCompleta.txt'

In [5]:
# Dates and hour of the validation of the ticket are in the same column 'DATA_VALIDAZIONE'
# Split the column 'DATA_VALIDAZIONE' into two columns 'DATA' and 'ORA' and convert them to datetime format
df.insert(0, 'DATA', pd.to_datetime(df['DATA_VALIDAZIONE'].str.split(' ').str[0], format='%d/%m/%Y'))
df.insert(1, 'ORA', pd.to_datetime(df['DATA_VALIDAZIONE'].str.split(' ').str[1], format='%H:%M').dt.time)

# Display the first 5 rows of the dataframe
df.head()

Unnamed: 0,DATA,ORA,DATA_VALIDAZIONE,SERIALE,FERMATA,DESCRIZIONE,TITOLO,DESCRIZIONE_TITOLO
0,2023-01-13,00:00:00,13/01/2023 00:00,40834866809772548,162,Stazione MES,12101,Bigl.Aut.75'Mestre/Lido-tsc
1,2023-01-13,00:00:00,13/01/2023 00:00,42242241686217732,3625,Aeroporto MA,12106,Bigl Aer-Venezia TSC
2,2023-01-13,00:00:00,13/01/2023 00:00,42242241686217476,3625,Aeroporto MA,12106,Bigl Aer-Venezia TSC
3,2023-01-13,00:00:00,13/01/2023 00:00,-3604990320,5049,"Zattere ""B""",23301,Mens.Studente Rete Unica
4,2023-01-13,00:00:00,13/01/2023 00:00,-2824230951,5043,"S. Toma' ""B""",23303,Abb stud. ReteUnica 12 mesi


In [6]:
# Convert the column 'DATA' to datetime format
df['DATA'] = pd.to_datetime(df['DATA'], format='%Y-%m-%d')

In [7]:
df.head()

Unnamed: 0,DATA,ORA,DATA_VALIDAZIONE,SERIALE,FERMATA,DESCRIZIONE,TITOLO,DESCRIZIONE_TITOLO
0,2023-01-13,00:00:00,13/01/2023 00:00,40834866809772548,162,Stazione MES,12101,Bigl.Aut.75'Mestre/Lido-tsc
1,2023-01-13,00:00:00,13/01/2023 00:00,42242241686217732,3625,Aeroporto MA,12106,Bigl Aer-Venezia TSC
2,2023-01-13,00:00:00,13/01/2023 00:00,42242241686217476,3625,Aeroporto MA,12106,Bigl Aer-Venezia TSC
3,2023-01-13,00:00:00,13/01/2023 00:00,-3604990320,5049,"Zattere ""B""",23301,Mens.Studente Rete Unica
4,2023-01-13,00:00:00,13/01/2023 00:00,-2824230951,5043,"S. Toma' ""B""",23303,Abb stud. ReteUnica 12 mesi


In [8]:
df.tail()

Unnamed: 0,DATA,ORA,DATA_VALIDAZIONE,SERIALE,FERMATA,DESCRIZIONE,TITOLO,DESCRIZIONE_TITOLO
5537461,2023-03-14,23:58:00,14/03/2023 23:58,-2864643315,162,Stazione MES,11209,Bigl RETE UNICA 75'
5537462,2023-03-14,23:58:00,14/03/2023 23:58,-2854956628,5026,Tronchetto F,11209,Bigl RETE UNICA 75'
5537463,2023-03-14,23:59:00,14/03/2023 23:59,-2850025054,384,Mestre Centr,23101,Mensile ordinario Rete Unica
5537464,2023-03-14,23:59:00,14/03/2023 23:59,-2824225710,5024,"Tronchetto """,23101,Mensile ordinario Rete Unica
5537465,2023-03-14,23:59:00,14/03/2023 23:59,-3604916033,5039,"Rialto ""C""",23101,Mensile ordinario Rete Unica


---------


In [9]:
# Restrict the dataset to only the specified period given by the user
def restrict_dataset_to_period(df, start_date, end_date):
    '''
    Restrict the dataset to only the specified period given by the user
        :param df: the dataset to be restricted
        :param start_date: the start date of the period
        :param end_date: the end date of the period
        :return: the restricted dataset        
    ''' 

    # Filter the dataset to only the specified period
    df = df[(df['DATA'] >= start_date) & (df['DATA'] <= end_date)]
    
    return df

In [10]:
# If does not exist a file named data/processed/dataset_cleaned_temp02-04_2023-02-21_2-esportazioneCompleta.txt create it
if not os.path.exists('data/processed/dataset_cleaned_temp02-04_2023-02-21_2-esportazioneCompleta.txt'):
    print('df.shape: ', df.shape)
    start_date = '2023-02-04'
    end_date = '2023-02-21'
    df = restrict_dataset_to_period(df, start_date, end_date)

    # Export the data to a txt file
    name_file = 'restricted_' + str(start_date) + '_' + str(end_date) + '_' + file_name
    df.to_csv('data/raw/' + name_file, sep='\t', index=False)

    print('Data exported to ' + name_file)
else :
    print('Data already exported')
    print('Path: data/processed/dataset_cleaned_temp02-04_2023-02-21_2-esportazioneCompleta.txt')
    # Open the file
    df = pd.read_csv('data/processed/dataset_cleaned_temp02-04_2023-02-21_2-esportazioneCompleta.txt', sep='\t')
    # Print information about the dataset
    print('df.shape: ', df.shape)
    print('df.head(2)')
    print(df.head(2))

    # Print the interval of dates (DATA column) of the dataset
    # Min date
    print('Min date: ', df['DATA'].min())
    # Max date
    print('Max date: ', df['DATA'].max())

Data already exported
Path: data/processed/dataset_cleaned_temp02-04_2023-02-21_2-esportazioneCompleta.txt
df.shape:  (1657505, 9)
df.head(2)
         DATA       ORA     DATA_VALIDAZIONE     SERIALE  FERMATA  \
0  2023-02-20  08:45:00  2023-02-20 08:45:00 -4090155752     5072   
1  2023-02-05  07:46:00  2023-02-05 07:46:00 -4090155610     5026   

    DESCRIZIONE  TITOLO TICKET_CODE   DESCRIZIONE_TITOLO  
0      Chioggia   11209           7  BIGL RETE UNICA 75'  
1  Tronchetto F   11209           7  BIGL RETE UNICA 75'  
Min date:  2023-02-04
Max date:  2023-02-21


In [11]:
# This dataset must be cleaned before it can be used. The cleaning process is done in the following steps:
# 1. Execute the notebook 1_b_only_temp_cleaning.ipynb to clean the dataset deleting useless stamps once the algorithm has determined the minimum temporal gap between two consecutive stamps.

# The result of the cleaning process is a new dataset that must be used to obtain the dataset with geographical coordinates. This process is done in the following steps:
# 1. Execute the Notebook 3 AUX.ipynb to obtain the dataset with geographical coordinates.

In [12]:
# Open the dataset that has also the geo coordinates
# Find all txt files in the data folder
csv_file = mf.find_csv_files('data/processed/esportazioneCompleta')

# Choose a dataset from the list of txt files
selected_dataset = mf.choose_dataset(csv_file)

if selected_dataset:
    print(f"You selected the dataset {selected_dataset}")
else:
    print("No dataset selected.")

path  = selected_dataset

The following TXT file was found:
1. data/processed/esportazioneCompleta/df_esportazioneCompleta_GEO.csv
You selected the dataset data/processed/esportazioneCompleta/df_esportazioneCompleta_GEO.csv


In [13]:
df = pd.read_csv(path, header=0, sep=',')

# Save the name of the file in a variable for future use extracting the name of the file from the path
file_name = path.split('_')[1]
subfolder = file_name
print(f"File name: {file_name}")

# Display the first 5 rows of the dataframe
df.head()

# Convert the column 'DATA' to datetime format
df['DATA'] = pd.to_datetime(df['DATA'], format='%Y-%m-%d')

# Take only the first 1%
# df = df.iloc[:int(len(df) * 0.01)]

File name: esportazioneCompleta


In [14]:
df.head()

Unnamed: 0,DATA,ORA,DATA_VALIDAZIONE,SERIALE,FERMATA,DESCRIZIONE,TITOLO,TICKET_CODE,DESCRIZIONE_TITOLO,LATITUDE,LONGITUDE
0,2023-01-13,00:00:00,2023-01-13 00:00:00,40834866809772548,162,STAZIONE MES,12101,7,BIGL.AUT.75'MESTRE/LIDO-TSC,45.482675,12.231809
1,2023-01-13,00:00:00,2023-01-13 00:00:00,-3604990320,5049,ZATTERE,23301,5-STUD,MENS.STUDENTE RETE UNICA,45.42926,12.32628
2,2023-01-13,00:00:00,2023-01-13 00:00:00,-2824230951,5043,"S. TOMA' ""B""",23303,6-STUD,ABB STUD. RETEUNICA 12 MESI,45.435184,12.327917
3,2023-01-13,00:00:00,2023-01-13 00:00:00,40552750134805252,5013,S. MARCO-SAN,11101,7,"75'-TPL 8,64-COMVE0,86",45.491853,12.242548
4,2023-01-13,00:01:00,2023-01-13 00:01:00,-3604964420,6084,VENEZIA,11209,7,BIGL RETE UNICA 75',45.491853,12.242548


In [15]:
# Print the column names of the dataframe
print(df.columns)

Index(['DATA', 'ORA', 'DATA_VALIDAZIONE', 'SERIALE', 'FERMATA', 'DESCRIZIONE',
       'TITOLO', 'TICKET_CODE', 'DESCRIZIONE_TITOLO', 'LATITUDE', 'LONGITUDE'],
      dtype='object')


In [16]:
# For each stop, keep the number of visits for each ticket code
# Each stop is a point identified by the coordinates of the stop

with open('data/dictionaries/dict_ticket_codes.json') as f:
        ticket_codes = json.load(f)

print(ticket_codes.keys())

# Change 5-STUD, 6-STUD to STUD in the dataframe
df['TICKET_CODE'] = df['TICKET_CODE'].replace(['5-STUD', '6-STUD'], 'STUD')
# Change 5-WKRS, 6-WKRS to WKRS in the dataframe
df['TICKET_CODE'] = df['TICKET_CODE'].replace(['5-WKRS', '6-WKRS'], 'WKRS')
# Change 5-RET, 6-RET to RET in the dataframe
df['TICKET_CODE'] = df['TICKET_CODE'].replace(['5-RET', '6-RET'], 'RET')

# Print the unique ticket codes
print('The considered ticket codes are: ', df['TICKET_CODE'].unique())

# For each stop, store the number of visits for each ticket code
# Iterate over the stops dataframe and for each stop, store the number of visits for each ticket code
# Notice that a stop is a pair of coordinates (latitude, longitude)

# The columns of the dataframe are:
# ['DATA', 'ORA', 'DATA_VALIDAZIONE', 'SERIALE', 'FERMATA', 'DESCRIZIONE',
#        'TITOLO', 'TICKET_CODE', 'DESCRIZIONE_TITOLO', 'LATITUDE', 'LONGITUDE']

# For each stop, store the number of visits for each ticket code
# Iterate over the stops dataframe and for each stop, store the number of visits for each ticket code

# Create a dataframe of stops
df_stop = df[['LATITUDE', 'LONGITUDE', 'TICKET_CODE']]

df_stop.head()


dict_keys(['1', '2', '3', '4', '5', '5-STUD', '5-RET', '5-WKRS', '6', '6-STUD', '6-RET', '6-WKRS', '7', '8'])
The considered ticket codes are:  ['7' 'STUD' '5' '6' '1' '2' '3' '4' 'RET' 'WKRS']


Unnamed: 0,LATITUDE,LONGITUDE,TICKET_CODE
0,45.482675,12.231809,7
1,45.42926,12.32628,STUD
2,45.435184,12.327917,STUD
3,45.491853,12.242548,7
4,45.491853,12.242548,7


In [17]:
df_stop_count = df_stop.groupby(['LATITUDE', 'LONGITUDE', 'TICKET_CODE']).size().reset_index(name='COUNT')

df_stop_count.head(20)

Unnamed: 0,LATITUDE,LONGITUDE,TICKET_CODE,COUNT
0,45.22369,12.280678,1,843
1,45.22369,12.280678,2,143
2,45.22369,12.280678,3,138
3,45.22369,12.280678,4,171
4,45.22369,12.280678,5,5971
5,45.22369,12.280678,6,1401
6,45.22369,12.280678,7,8780
7,45.22369,12.280678,RET,283
8,45.22369,12.280678,STUD,921
9,45.22369,12.280678,WKRS,1


In [18]:
# Describe the column COUNT
df_stop_count['COUNT'].describe()

count       388.000000
mean      12751.188144
std       50457.274097
min           1.000000
25%         210.000000
50%         969.000000
75%        6336.000000
max      850218.000000
Name: COUNT, dtype: float64

In [19]:
# Pivot table for the ticket codes
df_stop_count = df_stop_count.pivot_table(index=['LATITUDE', 'LONGITUDE'], columns='TICKET_CODE', values='COUNT', fill_value=0)
df_stop_count.reset_index(inplace=True)

In [20]:
df_stop_count

TICKET_CODE,LATITUDE,LONGITUDE,1,2,3,4,5,6,7,RET,STUD,WKRS
0,45.22369,12.280678,843,143,138,171,5971,1401,8780,283,921,1
1,45.412113,12.359925,59,39,22,19,2,2,252,0,4,0
2,45.417992,12.368725,13264,10248,14902,7714,108504,79298,102281,4129,35630,1238
3,45.418373,12.258713,0,0,0,0,3,4,1,0,0,0
4,45.419617,12.35535,38,62,63,50,13,0,308,0,12,0
5,45.425579,12.332047,950,810,1365,907,628,131,4553,28,223,0
6,45.42672,12.337905,2304,2597,3983,1523,1080,235,6334,82,458,0
7,45.429008,12.315648,376,341,480,262,836,195,7259,170,439,0
8,45.429047,12.355934,2176,2084,3300,2560,8852,4509,18590,368,3634,4
9,45.42926,12.32628,5102,4574,6372,3661,10500,6656,21877,342,6342,14


In [21]:
def distance_between_stops (stop1: tuple, stop2: tuple) -> float:
    import geopy.distance
    # Compute the distance between two stops
    # stop1 and stop2 are the coordinates of the two stops
    # df_stop_count is the dataframe containing the stop names and coordinates
    # Return the distance between the two stops in km

    # Compute the distance between two stops
    distance = geopy.distance.distance(stop1, stop2).km
    
    return distance


def uses_between_stops (stop1: tuple, stop2: tuple, df_stop_selected_a: pd.DataFrame) -> int:
    # Calculate the similarity between two stops based on uses of the same ticket code
    # stop1 and stop2 are the coordinates of the two stops
    # df_stop_count is the dataframe containing the stop coordinates and the number of uses of each ticket code only for the specified stops
    # Return the similarity between the two stops based on uses of the same ticket code

    # The ticket codes are the columns of the dataframe
    ticket_code_list = df_stop_selected_a.columns.tolist()
    # Remove the first 3 columns (stop name, latitude and longitude)
    ticket_code_list = ticket_code_list[3:]
    print(ticket_code_list)

    # Initialize the similarity
    similarity = 0

    # For each ticket code
    # Stop 1 is the first row of the dataframe
    # Stop 2 is the second row of the dataframe
    stop1_row = df_stop_selected_a.iloc[0, 1:3]
    stop2_row = df_stop_selected_a.iloc[1, 1:3]
    # print('a')
    # print(stop1_row, stop2_row)
    # Calculate how much the two stops are similar based on the ticket code
    for ticket_code in ticket_code_list:
        # Get the number of uses of the ticket code for the two stops
        stop1_uses = df_stop_selected_a.loc[0, ticket_code]
        # print(stop1_uses)
        stop2_uses = df_stop_selected_a.loc[1, ticket_code]
        # print(stop2_uses)
        # Calculate the similarity between the two stops based on the ticket code
        similarity += abs(stop1_uses - stop2_uses)
        print ('Similarity: ', similarity)
    return similarity 


In [22]:
# Using df_stop_count, define a similarity measure between two stops
# Two stops are similar if they are close to each other and if they have a similar number uses in the same ticket code
#
def similarity_measure(stop1: tuple, stop2: tuple, df_stop_count: pd.DataFrame) -> float:
    """
        This function returns the similarity measure between two stops.
        :param stop1: the first stop
        :param stop2: the second stop
        :param df_stop_count: the dataframe of stop counts
        :return: the similarity measure between two stops
    """
    # Compute the distance between the two stops
    distance = distance_between_stops(stop1, stop2)

    # Compute the number of uses of the two stops in the same ticket code
    uses = uses_between_stops(stop1, stop2, df_stop_count)

    # Compute the similarity measure
    similarity = 1 / (1 + distance) * uses

    return similarity



In [23]:
point_1 = (45.223690, 12.280678)
point_2 = (45.426720, 12.337905)

# Create a dataframe with only point_1 and point_2
df_stop_selected = df_stop_count[(df_stop_count['LATITUDE'] == point_1[0]) | (df_stop_count['LATITUDE'] == point_2[0])]
df_stop_selected = df_stop_selected[(df_stop_selected['LONGITUDE'] == point_1[1]) | (df_stop_selected['LONGITUDE'] == point_2[1])]
df_stop_selected.reset_index(inplace=True, drop=True)
print(df_stop_selected)

uses_between_stops(point_1, point_2, df_stop_selected)


TICKET_CODE  LATITUDE  LONGITUDE     1     2     3     4     5     6     7  \
0            45.22369  12.280678   843   143   138   171  5971  1401  8780   
1            45.42672  12.337905  2304  2597  3983  1523  1080   235  6334   

TICKET_CODE  RET  STUD  WKRS  
0            283   921     1  
1             82   458     0  
['2', '3', '4', '5', '6', '7', 'RET', 'STUD', 'WKRS']
Similarity:  2454
Similarity:  6299
Similarity:  7651
Similarity:  12542
Similarity:  13708
Similarity:  16154
Similarity:  16355
Similarity:  16818
Similarity:  16819


16819

In [24]:
# Point 1 is the first point of df_stop_count, only LATITUDE and LONGITUDE 
point_1 = df_stop_count.iloc[0][['LATITUDE', 'LONGITUDE']]
# Point 2 is the last point of df_stop_count, only LATITUDE and LONGITUDE
point_2 = df_stop_count.iloc[-1][['LATITUDE', 'LONGITUDE']]
print(point_1)
print(point_2)
print(type(point_1))
print(type(point_2))
# Convert from pandas series to tuple
point_1 = tuple(point_1)
point_2 = tuple(point_2)
print(type(point_1))
print(type(point_2))
print('Distance between point 1 and point 2: ', distance_between_stops(point_1, point_2))

TICKET_CODE
LATITUDE     45.223690
LONGITUDE    12.280678
Name: 0, dtype: float64
TICKET_CODE
LATITUDE     45.504976
LONGITUDE    12.339106
Name: 42, dtype: float64
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'tuple'>
<class 'tuple'>
Distance between point 1 and point 2:  31.59517590771107


In [25]:
# Calculate distance between two points
point_1 = (45.223690, 12.280678)
point_2 = (45.426720, 12.337905)

distance = round (distance_between_stops (point_1, point_2), 4)
print (f"Distance between point 1 and point 2: {distance} km")


# Represent the distance between two points on a map with folium
# Create a map
m = folium.Map(location=[45.223690, 12.280678], zoom_start=10)

# Add a marker for point 1
folium.Marker(
    location=point_1,
    popup='Point 1',
    icon=folium.Icon(color='red', icon='info-sign')
).add_to(m)

# Add a marker for point 2
folium.Marker(
    location=point_2,
    popup='Point 2',
    icon=folium.Icon(color='red', icon='info-sign')
).add_to(m)

# Add a line between the two points
# folium.PolyLine(locations=[point_1, point_2], color='blue').add_to(m)
# Add the distance between the two points as a label to the line
# Do not use marker
folium.PolyLine(locations=[point_1, point_2], color='blue', tooltip=f"{distance} km").add_to(m)


# Display the map
m

Distance between point 1 and point 2: 23.0061 km


In [26]:
# use the similarity measure to cluster the stops
# Use an hierarchical clustering algorithm


def hierarchical_clustering(similarity_matrix: np.ndarray, threshold: float) -> list:
    # Create a list of clusters
    clusters = []
    # Create a list of clusters to be processed
    clusters_to_process = []
    # Create a list of clusters to be processed
    clusters_to_process.append(list(range(similarity_matrix.shape[0])))
    # Create a list of clusters to be processed
    while clusters_to_process:
        # Get the first cluster in the list
        cluster_to_process = clusters_to_process[0]
        # Remove the first cluster from the list
        clusters_to_process = clusters_to_process[1:]
        # If the cluster has only one element, add it to the list of clusters
        if len(cluster_to_process) == 1:
            clusters.append(cluster_to_process)
        else:
            # Compute the similarity between the cluster and the other clusters
            similarities = [similarity_matrix[cluster_to_process[0], cluster] for cluster in cluster_to_process[1:]]
            # Get the index of the cluster with the highest similarity
            index = np.argmax(similarities)
            # Get the similarity with the cluster with the highest similarity
            similarity = similarities[index]
            # If the similarity is higher than the threshold
            if similarity > threshold:
                # Get the index of the cluster with the highest similarity
                index += 1
                # Add the cluster to the list of clusters to be processed
                clusters_to_process.append(cluster_to_process[:index])
                # Add the cluster to the list of clusters to be processed
                clusters_to_process.append(cluster_to_process[index:])
            else:
                # Add the cluster to the list of clusters
                clusters.append(cluster_to_process)

    return clusters




In [27]:
# Create a similarity matrix considering the stops as points in a 2D space (latitude and longitude) and using the personalized similarity measure
similarity_matrix = np.zeros((len(df_stop_count), len(df_stop_count)))
for i in range(len(df_stop_count)):
    for j in range(len(df_stop_count)):
        if i == j:
            similarity_matrix[i, j] = 1
            continue
        point_1 = (df_stop_count.iloc[i]['LATITUDE'], df_stop_count.iloc[i]['LONGITUDE'])
        point_2 = (df_stop_count.iloc[j]['LATITUDE'], df_stop_count.iloc[j]['LONGITUDE'])
        # print(point_1, point_2)
        # print(type(point_1), type(point_2))
        
        df_stop_selected = df_stop_count[(df_stop_count['LATITUDE'] == point_1[0]) | (df_stop_count['LATITUDE'] == point_2[0])]
        df_stop_selected = df_stop_selected[(df_stop_selected['LONGITUDE'] == point_1[1]) | (df_stop_selected['LONGITUDE'] == point_2[1])]
        df_stop_selected.reset_index(inplace=True, drop=True)
        similarity_matrix[i, j] = similarity_measure(point_1, point_2, df_stop_selected)

# Standardize the similarity matrix
similarity_matrix = (similarity_matrix - similarity_matrix.min()) / (similarity_matrix.max() - similarity_matrix.min())

['2', '3', '4', '5', '6', '7', 'RET', 'STUD', 'WKRS']
Similarity:  104
Similarity:  220
Similarity:  372
Similarity:  6341
Similarity:  7740
Similarity:  16268
Similarity:  16551
Similarity:  17468
Similarity:  17469
['2', '3', '4', '5', '6', '7', 'RET', 'STUD', 'WKRS']
Similarity:  10105
Similarity:  24869
Similarity:  32412
Similarity:  134945
Similarity:  212842
Similarity:  306343
Similarity:  310189
Similarity:  344898
Similarity:  346135
['2', '3', '4', '5', '6', '7', 'RET', 'STUD', 'WKRS']
Similarity:  143
Similarity:  281
Similarity:  452
Similarity:  6420
Similarity:  7817
Similarity:  16596
Similarity:  16879
Similarity:  17800
Similarity:  17801
['2', '3', '4', '5', '6', '7', 'RET', 'STUD', 'WKRS']
Similarity:  81
Similarity:  156
Similarity:  277
Similarity:  6235
Similarity:  7636
Similarity:  16108
Similarity:  16391
Similarity:  17300
Similarity:  17301
['2', '3', '4', '5', '6', '7', 'RET', 'STUD', 'WKRS']
Similarity:  667
Similarity:  1894
Similarity:  2630
Similarity: 

In [28]:
# Print the similarity matrix
similarity_matrix

array([[0.00000000e+00, 1.25029053e-03, 2.39371171e-02, ...,
        7.54922686e-02, 8.67790509e-04, 8.05475872e-04],
       [1.25029053e-03, 0.00000000e+00, 3.05346220e-01, ...,
        1.71884046e-01, 1.15893912e-04, 1.52283682e-03],
       [2.39371171e-02, 3.05346220e-01, 0.00000000e+00, ...,
        1.27919674e-01, 5.81337312e-02, 5.28080492e-02],
       ...,
       [7.54922686e-02, 1.71884046e-01, 1.27919674e-01, ...,
        0.00000000e+00, 1.65730797e-01, 2.70285174e-01],
       [8.67790509e-04, 1.15893912e-04, 5.81337312e-02, ...,
        1.65730797e-01, 0.00000000e+00, 2.38208179e-03],
       [8.05475872e-04, 1.52283682e-03, 5.28080492e-02, ...,
        2.70285174e-01, 2.38208179e-03, 0.00000000e+00]])

In [29]:
clusters = hierarchical_clustering(similarity_matrix, 0.009)
print(clusters)
print(len(clusters))


[[42], [23], [40], [41], [0, 1], [31], [32, 33, 34], [39], [2], [3], [28], [35], [24], [25], [26], [27], [29], [30], [36, 37], [38], [4, 5], [6, 7], [21], [22], [8, 9], [10, 11], [12, 13], [14], [15], [16], [17, 18], [19], [20]]
33


In [32]:
# Apply HDBSCAN with a personalied distance matrix
import hdbscan
HDSBCAN = hdbscan.HDBSCAN(metric='precomputed', min_cluster_size=5)
HDSBCAN.fit(similarity_matrix)
HDSBCAN.labels_


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/matteograzioso/Library/Python/3.11/lib/python/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/d8/fzh8cxg9093fp0_k92hwz6yr0000gn/T/ipykernel_4235/919012457.py", line 2, in <module>
    import hdbscan
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/hdbscan/__init__.py", line 1, in <module>
    from .hdbscan_ import HDBSCAN, hdbscan
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/hdbscan/hdbscan_.py", line 40, in <module>
    FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics + ["cosine", "arccos"]
                   ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
TypeError: unsupported operand type(s) for +: 'builtin_function_or_method' and 'builtin_function_or_method'

During handling of the above exception, another exception occurred:

Traceback (most r

In [30]:
# Create a dataframe with the latitude and longitude of the stops with the cluster number
df_clustered_stops = df_stop_count[['LATITUDE', 'LONGITUDE']]
# Assing the cluster number to each stop
for i in range(len(df_clustered_stops)):
    df_clustered_stops.loc[i, 'CLUSTER'] = clusters[i]
df_clustered_stops = df_clustered_stops.reset_index(drop=True)
df_clustered_stops.head()


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/matteograzioso/Library/Python/3.11/lib/python/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/d8/fzh8cxg9093fp0_k92hwz6yr0000gn/T/ipykernel_4235/3919025548.py", line 5, in <module>
    df_clustered_stops.loc[i, 'CLUSTER'] = clusters[i]
    ~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/indexing.py", line 818, in __setitem__
    iloc._setitem_with_indexer(indexer, value, self.name)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/indexing.py", line 1795, in _setitem_with_indexer
    self._setitem_with_indexer_split_path(indexer, value, name)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/indexing.py", line 1850, in _setitem_with_indexer_split_path
  

In [31]:
# Represent the clusters in a 2D plot
# Create a map of the clusters
map = folium.Map(location=[45.464664, 9.188540], zoom_start=12)

# Create a color for each cluster
rainbow = ['red', 'blue', 'green', 'purple', 'orange', 'darkred',
              'lightred', 'beige', 'darkblue', 'darkgreen', 'cadetblue']

# Add the markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_clustered['LATITUDE'], df_clustered['LONGITUDE'], df_clustered['ADDRESS'], df_clustered['CLUSTER']):

SyntaxError: incomplete input (475773154.py, line 11)