# Using Unsupervised Learning to plan Paris Vacation - clustering

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

try:
    import contextily as cx
except ImportError as error:
    !pip install contextily==1.2.0;
    import contextily as cx


!pip install haversine
import haversine as hs
import folium

try:
    import networkx as nx
except ImportError as error:
    !pip install networkx;
    import networkx as nx
import networkx.algorithms.approximation as nx_app

sns.set_theme()




## Modélisation

### Choix utilisateurs


In [4]:
import pandas as pd
from io import StringIO

u_start_point = (48.8672391, 2.3210898) # Hotel Crillon place de la Concorde
u_nb_jour = 7
u_moyen_mobilite = "Metro" # Marche/ Velo / Voiture
u_categorie = None
u_nb_pts_max = 8 # temps min de visite 60 minutes sur 8h soit ... 8

csv_remarkable = """
id, name, latitude, longitude
678296,Tour Eiffel,48.85836,2.294543
682672,Arc de triomphe,48.873757,2.295909
4679111,Cathédrale Notre-Dame de Paris,48.85267,2.349292
705679,Centre Pompidou,48.860713,2.352254
705742,Galeries nationales du Grand Palais,48.8659,2.313395
700946,Musée du Louvre,48.861347,2.335457
680785,Observatoire panoramique de la Tour Montparnasse,48.842162,2.322114
696931,Palais Garnier,48.871663,2.331864
776370,Parc des Buttes-Chaumont,48.876913,2.381105
697984,Petit Palais - Musée des Beaux Arts de la Ville de Paris,48.865895,2.313805
697946,Philharmonie de Paris - Cité de la musique,48.889306,2.393807
"""
poi_remarkable = pd.read_csv(StringIO(csv_remarkable), sep=",", dtype=str)


In [5]:
poi_remarkable.head(15)

Unnamed: 0,id,name,latitude,longitude
0,678296,Tour Eiffel,48.85836,2.294543
1,682672,Arc de triomphe,48.873757,2.295909
2,4679111,Cathédrale Notre-Dame de Paris,48.85267,2.349292
3,705679,Centre Pompidou,48.860713,2.352254
4,705742,Galeries nationales du Grand Palais,48.8659,2.313395
5,700946,Musée du Louvre,48.861347,2.335457
6,680785,Observatoire panoramique de la Tour Montparnasse,48.842162,2.322114
7,696931,Palais Garnier,48.871663,2.331864
8,776370,Parc des Buttes-Chaumont,48.876913,2.381105
9,697984,Petit Palais - Musée des Beaux Arts de la Vill...,48.865895,2.313805


In [3]:

from source.databases import create_graph, connect_neo4j
from dotenv import load_dotenv
load_dotenv()


coordinates_list = []

# Iterate over the DataFrame rows and extract latitude and longitude values
for index in range(len(poi_remarkable)):
    if index >= u_nb_jour:
        break
    latitude = float(poi_remarkable.iloc[index, 2])
    longitude = float(poi_remarkable.iloc[index, 3])

    coordinates_list.append([latitude, longitude])


query = f"""
CALL gds.beta.kmeans.stream('mustseenodes', {{
  nodeProperty: 'coord',
  k: {u_nb_jour},
  seedCentroids: {coordinates_list}
}})
YIELD nodeId, communityId
RETURN gds.util.asNode(nodeId).id AS poi_id, gds.util.asNode(nodeId).name AS name, communityId as day
ORDER BY communityId, name ASC
"""
print(query)
#create_graph(query)



CALL gds.beta.kmeans.stream('mustseenodes', {
  nodeProperty: 'coord',
  k: 7,
  seedCentroids: [[48.85836, 2.294543], [48.873757, 2.295909], [48.85267, 2.349292], [48.860713, 2.352254], [48.8659, 2.313395], [48.861347, 2.335457], [48.842162, 2.322114]]
})
YIELD nodeId, communityId
RETURN gds.util.asNode(nodeId).id AS poi_id, gds.util.asNode(nodeId).name AS name, communityId as day
ORDER BY communityId, name ASC




### Constantes

In [10]:
#Definition des paramètres d'itinéraires
MAX_TIME_PER_DAY = 480 #8h * 60 min

MAX_KM_BY_TRANSPORT = {
    "Marche": 5, #10 km Aller / Retour
    "Velo": 10, #20 km Aller / Retour
    "Metro": 15, #30 km Aller / Retour
    "Voiture": 60 #120 km Aller / Retour
}

# Gestion de la distance max
nb_km_max = MAX_KM_BY_TRANSPORT[u_moyen_mobilite]

# Couleurs utilisées par folium pour les markers
FOLIUM_COLORS = [
    'red','blue', 'green', 'purple', 'orange', 'darkred',
    'lightred', 'beige', 'darkblue', 'darkgreen', 'cadetblue',
    'darkpurple', 'white', 'pink', 'lightblue', 'lightgreen', 'gray', 'black', 'lightgray'
]
# Zoom a appliquer sur la carte
ZOOM_LVL = {
    'Metro': 12,
    'Velo': 15,
    'Marche': 20,
    'Voiture': 10
}

## Fonctions


In [11]:
# Declaration des fonction
def plot_markers(dataframe, map_to_plot, color_palette = FOLIUM_COLORS, icon='star'):
    """
        Plot markers on a foilum map using coord column on a dataframe and a specified color palette
    """
    for _idx, row in dataframe.iterrows():
        color_idx = row["Jour"] - 1
        max_colors_idx = len(color_palette) - 1
        if color_idx > max_colors_idx:
            color_idx = color_idx - max_colors_idx
        color = color_palette[color_idx]
        icon_color = 'dimgray' if color == 'white' else 'white'
        folium.Marker(
            location= list(row.Coord),
            popup= f"<h5>Jour {row['Jour']}</h5><p>{row['name']}</p>",
            icon= folium.Icon(color= color, icon_color= icon_color, icon=icon)
        ).add_to(map_to_plot)

def plot_day_path(dataframe, map_to_plot, color):
    """
        Compute shortest path by creating a graph from coordonates and then plot it on a folium map in. the selected color
    """
    G = nx.Graph()
    nodes = np.arange(0, len(dataframe))
    # nodes -> array([0, 1, 2, 3, 4])
    G.add_nodes_from(nodes)

    for i in nodes:
        for j in nodes:
            if i!=j:
                G.add_edge(i, j)

    #Create a dictionary of node and coordinate of each state for positions
    positions = {node:coordinate for node, coordinate in zip(nodes, dataframe.Coord)}

    #Create a dictionary of node and capital for labels
    labels = {node:name for node, name in zip(nodes, dataframe['name'])}

    cycle = nx_app.christofides(G, weight="weight")

    folium_coordinates = []
    for lat,lon in dataframe.Coord:
        folium_coordinates.append([lat,lon])

    route = [] # u_start_point
    for stop in cycle:
        route.append(folium_coordinates[stop])
    folium.PolyLine(route, color = color, weigth = 0.5).add_to(map_to_plot)

def result_df(result):
    return pd.DataFrame(result['Jour'].value_counts().sort_index()).rename(columns= {'Jour': 'nombre POI'})


### Application des paramètres utilisateurs à l'ensemble de points


### Carte points par jour


In [19]:
import source.databases as gdb
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.


gds = gdb.connect_gds()

gds.run_cypher("""
match (:POI)-[r:DISTANCE]->(:POI)
with collect(r) as routes, max(r.distance) as maxDistance
foreach(route in routes | set route.weight = (maxDistance + 1) - route.distance)
""")

try:
    gds.graph.drop('mustseen')
except:
    pass

G_routes, result = gds.graph.project('mustseen',
                                         "MustSeen",

                                         {"DISTANCE":
                                              {"orientation": "UNDIRECTED",
                                               "aggregation": "MAX"}
                                          }, relationshipProperties="weight")




ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `gds.graph.project`: Caused by: java.lang.IllegalArgumentException: A graph with name 'mustseen' already exists.}

In [17]:
knn = gds.beta.kmeans.stream('mustseenodes', {
    "nodeProperty": 'weight',
    "k": 7,
    "randomSeed": "42"
})

knn

TypeError: StreamModeRunner.__call__() takes 2 positional arguments but 3 were given

In [7]:
gds.wcc.mutate(G_routes, mutateProperty = 'componentId')

gds.graph.writeNodeProperties(G_routes, ['componentId'])



writeMillis                     49
graphName                 mustseen
nodeProperties       [componentId]
propertiesWritten               81
Name: 0, dtype: object

In [8]:
gds.run_cypher("MATCH (p:POI) RETURN p.componentId as componentId, count(*) as nodeCount ORDER BY count(*) DESC limit 1")

Unnamed: 0,componentId,nodeCount
0,,1999


In [9]:
G_connected_poi, result = gds.beta.graph.project.subgraph("connected-pois", G_routes, "n.componentId = 0", "*")

In [13]:

X = kmean_places_to_visit[['latitude','longitude']].values
predictions = kmeans_1.fit_predict(X)
clustered = pd.concat([kmean_places_to_visit.reset_index(),
                       pd.DataFrame({'Cluster':predictions})],
                      axis=1)

clustered["Jour"] = clustered["Cluster"] + 1

kmeans_map = folium.Map(location = u_start_point , tiles = "OpenStreetMap", zoom_start = ZOOM_LVL[u_moyen_mobilite])

plot_markers(clustered, map_to_plot=kmeans_map)

kmeans_map

  super()._check_params_vs_input(X, default_n_init=10)
