# Use Sedona and OSRM to calculate route distance and duration

In previous tutorials, we calculated the straight-line ("as the crow flies") distance between two points. If we need the real route distance and travel duration, the built-in Sedona functions cannot provide them.
 
In this tutorial, we will use Sedona and [osrm-backend](https://github.com/Project-OSRM/osrm-backend) to find the best route to a destination, then calculate its distance and duration.

> A guide on how to deploy OSRM: https://github.com/pengfei99/OSRM-deployement

As in the previous tutorials, the dataset is the French commune dataset released by INSEE.

- Step 1: Calculate the centroid of each French commune.
- Step 2: Convert each centroid (a geometry point) to GPS coordinates (longitude and latitude as doubles), because OSRM-backend exposes a REST API that takes coordinates as input.
- Step 3: Build a matrix of start point / end point pairs.
- Step 4: Create a Spark UDF that calls the OSRM API.
- Step 5: Use the UDF to calculate the distance and duration.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split


from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

import requests

In [2]:
spark = SparkSession.builder.master("local[2]").appName("DemoG4Health").getOrCreate()

24/06/04 11:26:46 WARN Utils: Your hostname, pengfei-Virtual-Machine resolves to a loopback address: 127.0.1.1; using 10.50.2.80 instead (on interface eth0)
24/06/04 11:26:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/06/04 11:26:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Step 1: Build the source and destination commune matrix

In [3]:
converted_centroid_path = "/home/pengfei/data_set/kaggle/geospatial/converted_centroid_of_french_commune"
converted_centroid_df=spark.read.parquet(converted_centroid_path)

                                                                                

In [4]:
insee_code_list = ["75056"]
source_df = converted_centroid_df.filter(col("insee").isin(insee_code_list))
source_df.show()

                                                                                

+-----+-----+------------------+-----------------+
|  nom|insee|         longitude|         latitude|
+-----+-----+------------------+-----------------+
|Paris|75056|2.3428764301940275|48.85662219553845|
+-----+-----+------------------+-----------------+


In [22]:
# Pair every source commune ("add1") with every commune that has a different
# INSEE code ("add2"), keep the coordinates/code/name of both sides, and keep
# at most 1000 pairs for the demo.
commune_matrix_df = (
    source_df.alias("add1")
    .join(
        converted_centroid_df.alias("add2"),
        col("add1.insee") != col("add2.insee"),
        "inner",
    )
    .select(
        col("add1.longitude").alias("source_long"),
        col("add1.latitude").alias("source_lat"),
        col("add1.insee").alias("source_insee"),
        col("add1.nom").alias("source_nom"),
        col("add2.longitude").alias("dest_long"),
        col("add2.latitude").alias("dest_lat"),
        col("add2.insee").alias("dest_insee"),
        col("add2.nom").alias("dest_nom"),
    )
    .limit(1000)
)
commune_matrix_df.show()

+------------------+-----------------+------------+----------+------------------+------------------+----------+-----------------+
|       source_long|       source_lat|source_insee|source_nom|         dest_long|          dest_lat|dest_insee|         dest_nom|
+------------------+-----------------+------------+----------+------------------+------------------+----------+-----------------+
|2.3428764301940275|48.85662219553845|       75056|     Paris| 9.338150861836196|42.374292014354154|     2B222|     Pie-d'Orezza|
|2.3428764301940275|48.85662219553845|       75056|     Paris| 9.235357777014519| 42.37887024991088|     2B137|             Lano|
|2.3428764301940275|48.85662219553845|       75056|     Paris| 9.302107656444328| 42.36875223806091|     2B051|           Cambia|
|2.3428764301940275|48.85662219553845|       75056|     Paris|  9.26661425039706|42.375563316535825|     2B106|            Érone|
|2.3428764301940275|48.85662219553845|       75056|     Paris|  9.33384508224219|42.641774

In [23]:
# Cast the four coordinate columns to string (same column names), and carry
# the INSEE code / name columns over unchanged.
commune_matrix_df = commune_matrix_df.select(
    *[
        col(coord_name).cast('string').alias(coord_name)
        for coord_name in ("source_long", "source_lat", "dest_long", "dest_lat")
    ],
    "source_insee", "source_nom", "dest_insee", "dest_nom",
)
commune_matrix_df.printSchema()

root
 |-- source_long: string (nullable = true)
 |-- source_lat: string (nullable = true)
 |-- dest_long: string (nullable = true)
 |-- dest_lat: string (nullable = true)
 |-- source_insee: string (nullable = true)
 |-- source_nom: string (nullable = true)
 |-- dest_insee: string (nullable = true)
 |-- dest_nom: string (nullable = true)


In [7]:
## calculate route and duration

In [24]:
def get_route(lat_start: str, long_start: str, lat_end: str, long_end: str,
              show_steps: str = "false", host: str = "maps-api.casd.local",
              timeout: float = 10) -> "dict | None":
    """Ask the OSRM ``route`` service for a driving route between two points.

    Args:
        lat_start: Latitude of the start point.
        long_start: Longitude of the start point.
        lat_end: Latitude of the end point.
        long_end: Longitude of the end point.
        show_steps: Value sent as the ``steps`` query parameter
            (``"true"`` or ``"false"``).
        host: Host name of the OSRM server to query.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (the status code is printed in that case).

    Note:
        Exceptions raised by ``requests.get`` itself (connection errors,
        timeouts) are not caught here and propagate to the caller.
    """
    # The URL carries each point as "longitude,latitude"; the two points are
    # separated by ";".
    start_point = f"{long_start},{lat_start}"
    end_point = f"{long_end},{lat_end}"
    url = f"http://{host}/route/v1/driving/{start_point};{end_point}?steps={show_steps}"

    # NOTE(review): verify=False only matters for https URLs; it is kept so
    # the request is issued exactly as before.
    response = requests.get(url, verify=False, timeout=timeout)
    if response.status_code != 200:
        print("Error:", response.status_code)
        return None
    return response.json()


def parse_route_json(input_route: dict) -> tuple:
    """Extract distance and duration of the first route from a route response.

    Args:
        input_route: Decoded JSON of a route response (a dict with a
            ``"routes"`` list), or ``None`` when no response is available
            (``get_route`` returns ``None`` for non-200 answers).

    Returns:
        A ``(distance, duration)`` tuple. ``distance`` is the raw value of the
        first route (meters); ``duration`` is the raw value (seconds)
        converted to minutes and rounded to 2 decimals. ``(0, 0)`` is
        returned when there is no usable first route: ``input_route`` is not
        a dict, ``"routes"`` is missing or empty, or the first route is empty.
    """
    # Guard before indexing: the original indexed ['routes'][0] first, so a
    # missing response or an empty route list raised instead of yielding 0, 0.
    routes = input_route.get("routes") if isinstance(input_route, dict) else None
    f_route = routes[0] if routes else None
    if not f_route:
        return 0, 0

    # the raw distance is in meter
    distance = f_route["distance"]
    # Debug output; when this runs inside the Spark UDF the values show up in
    # the job output for every row.
    print(distance)
    # the raw duration is in second, the returned duration is in minutes
    duration = round((f_route["duration"] / 60), 2)
    print(duration)
    return distance, duration


def calculate_distance_duration(lat_start: str, long_start: str, lat_end: str, long_end: str) -> (float, float):
    """Fetch the route between two points and reduce it to (distance, duration).

    The coordinates are handed to ``get_route`` and its response is passed
    straight to ``parse_route_json``, whose result is returned unchanged.
    """
    return parse_route_json(
        get_route(lat_start, long_start, lat_end, long_end)
    )


def calculate_distance_duration_str(lat_start: str, long_start: str, lat_end: str, long_end: str) -> str:
    """Return the route distance and duration as one ``"distance;duration"`` string.

    Both values come from ``calculate_distance_duration``; they are joined
    with a ``;`` so the pair fits into a single string value.
    """
    meters, minutes = calculate_distance_duration(lat_start, long_start, lat_end, long_end)
    return "{};{}".format(meters, minutes)


@udf(returnType=StringType())
def get_distance_duration(lat_start: str, long_start: str, lat_end: str, long_end: str):
    """Spark UDF: return ``"distance;duration"`` for one start/end coordinate pair.

    Thin wrapper that forwards the four values to
    ``calculate_distance_duration_str`` and returns its string result.
    """
    return calculate_distance_duration_str(
        lat_start=lat_start,
        long_start=long_start,
        lat_end=lat_end,
        long_end=long_end,
    )

In [25]:
start_long = "2.309167"
start_lat = "48.819552"
end_long = "2.467290"
end_lat = "48.758568"

In [26]:
route_json = get_route(start_lat,start_long,end_lat,end_long)
print(type(route_json))
print(route_json)


<class 'dict'>
{'code': 'Ok', 'routes': [{'geometry': 'g`~hHc_bM^eDiRk\\zVinBrIqSdBy_AcIge@vHi\\bAyc@o_@}_B_W_i@{Ik`@tWua@dIaS`U{rAnBemAs@}oAbEan@lGeK`RgGfh@p@nUfFbi@b_@dY~Iz_@fBxj@oDrl@ee@vNgSbN__@lQk{@lLsRpL_I', 'legs': [{'steps': [], 'summary': '', 'weight': 1115.1, 'duration': 1115.1, 'distance': 18677.9}], 'weight_name': 'routability', 'weight': 1115.1, 'duration': 1115.1, 'distance': 18677.9}], 'waypoints': [{'hint': 'zrIqgdCyKoEIAAAARwAAANsAAAAAAAAAD1xjQMnb60GQULZCAAAAAAgAAABHAAAA2wAAAAAAAABtIgAAEDwjAMfs6AIvPCMAYO3oAgkAHwbkNR0k', 'distance': 17.166158355, 'name': '', 'location': [2.309136, 48.819399]}, {'hint': 'V5UEgP___38OAAAAMQAAAJQBAACcAgAAzx6vQSYxWULcY0pElQ6ZRA4AAAAxAAAAlAEAAJwCAABtIgAAja8lAD8B6ALapSUAKP_nAhIAHwrkNR0k', 'distance': 191.955244684, 'name': '', 'location': [2.469773, 48.759103]}]}




In [27]:
dis1,dur1= parse_route_json(route_json)

18677.9
18.58


In [28]:
res1 = calculate_distance_duration_str(start_lat,start_long,end_lat,end_long)

18677.9
18.58




In [29]:
# 1. call the UDF on every (source, destination) pair -> "distance;duration"
# 2. keep the name/code columns plus the raw UDF result
# 3. split the raw result on ";" into its two parts
distance_duration_df = (
    commune_matrix_df
    .withColumn(
        "distance_duration",
        get_distance_duration(
            col("source_lat"), col("source_long"), col("dest_lat"), col("dest_long")
        ),
    )
    .select("source_nom", "source_insee", "dest_nom", "dest_insee", "distance_duration")
    .withColumn("distance(meter)", split(col("distance_duration"), ";").getItem(0))
    .withColumn("duration(minutes)", split(col("distance_duration"), ";").getItem(1))
)


In [30]:
distance_duration_df.cache()
distance_duration_df.show(5)


1189597.6:>                                                         (0 + 1) / 1]
919.4
1176819.3
904.53
1182215.2
911.62
1177287
905.13
1168612.9
903.37
1195574.3
933.4
1171248.2
902.4
1182214.5
909.74
1193160
921.69
1211285.7
947.56
1211476.6
947.28
1208214
942.75
1210936.9
935.97
1190499.2
918.79
891087.5
570.91
854347.5
572.28
883699.8
552.93
866812.3
585.27
852926.9
570.39
856485.2
574.51
872858.7
539.38
864727.8
582.57
918302.9
600.31
846535.2
570.45
1219889.4
950.85
1264849
993.44
1279582.5
1010.4
1158452.9
888.52
1179705.5
905.68
1174473.1
911.97
891584.7
569.18
877337.7
548.93
878283.8
547.57
905238.1
585.2
904018
585.21
889850
568.17
899129
585.05
897467.8
576.21
906612.3
585.55
910247.2
589.2
919236.5
601.99
909856.7
587.61
928096.4
627.13
869944.6
591.84
867857.6
587.64
853374.5
571.0
848093.8
566.3
857508.7
578.05
864287.3
583.79
880348.2
553.06
860974.1
592.59
857418.7
578.89
837755.4
554.32
864151.7
587.74
856285.9
580.87
844614.5
562.11
853191
571.9
854388.2
572.39
87933

+----------+------------+------------+----------+-----------------+---------------+-----------------+
|source_nom|source_insee|    dest_nom|dest_insee|distance_duration|distance(meter)|duration(minutes)|
+----------+------------+------------+----------+-----------------+---------------+-----------------+
|     Paris|       75056|Pie-d'Orezza|     2B222|  1189597.6;919.4|      1189597.6|            919.4|
|     Paris|       75056|        Lano|     2B137| 1176819.3;904.53|      1176819.3|           904.53|
|     Paris|       75056|      Cambia|     2B051| 1182215.2;911.62|      1182215.2|           911.62|
|     Paris|       75056|       Érone|     2B106|   1177287;905.13|        1177287|           905.13|
|     Paris|       75056|      Oletta|     2B185| 1168612.9;903.37|      1168612.9|           903.37|
+----------+------------+------------+----------+-----------------+---------------+-----------------+



402.34
610617.5
388.38
652999
426.94
635363.5
410.27
                                                                                


# Read from the existing file

In [17]:

prod_result_file_path = "/home/pengfei/data_set/fr_commune_distance/duration_prod_final"

df = spark.read.parquet(prod_result_file_path)
df.show()

                                                                                

24/05/17 13:41:05 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
+----------+--------------------+----------+---------------+-----------------+------------+
|source_nom|            dest_nom|dest_insee|distance(meter)|duration(minutes)|source_insee|
+----------+--------------------+----------+---------------+-----------------+------------+
|  Docelles|  Marignac-Lasclares|     31317|       932124.3|           596.25|       88135|
|  Docelles|Guigneville-sur-E...|     91293|       391917.3|           267.23|       88135|
|  Docelles|       Esmery-Hallon|     80284|       419241.2|           296.73|       88135|
|  Docelles|          Foulangues|     60249|       472930.4|           324.05|       88135|
|  Docelles|               Sassy|     14669|       657827.7|           434.03|       88135|
|  Docelles|        Saint-Cast

In [33]:
total_row = df.count()
print(f"total row: {total_row}")



total row: 382326852


                                                                                

In [32]:
source_commune = "92049"
paris = "75056"
dest_commune = "91293"
df.filter((col("source_insee")==source_commune) & (col("dest_insee")==dest_commune)).show(1, truncate=False)

+----------+-----------------------+----------+---------------+-----------------+------------+
|source_nom|dest_nom               |dest_insee|distance(meter)|duration(minutes)|source_insee|
+----------+-----------------------+----------+---------------+-----------------+------------+
|Montrouge |Guigneville-sur-Essonne|91293     |53893.8        |49.4             |92049       |
+----------+-----------------------+----------+---------------+-----------------+------------+


In [19]:
code_insee_df = df.select("source_insee").distinct()

In [20]:
commune_count = code_insee_df.count()

                                                                                

In [21]:
print(commune_count)

10934


# Do a diff 
If you want to compare the communes whose distance calculation has finished against the full list of commune codes, you can use the code below.

In [5]:
# step1: get the full code list
full_code_df=converted_centroid_df.select("insee").distinct()

In [6]:
full_code_df.count()

                                                                                

34955

In [7]:
full_code_df.show(5)

+-----+
|insee|
+-----+
|2B226|
|97128|
|48063|
|07200|
|62646|
+-----+


In [8]:
full_code_list = full_code_df.rdd.flatMap(lambda x: x).collect()

                                                                                

In [9]:
print(full_code_list)

['2B226', '97128', '48063', '07200', '62646', '05107', '38252', '07218', '38271', '38433', '07342', '38223', '07243', '43085', '71047', '59467', '51063', '02053', '77303', '02070', '55456', '70531', '39103', '70078', '88140', '88138', '70170', '54008', '88311', '25032', '25555', '57496', '88447', '57483', '57051', '90022', '57215', '57112', '57085', '57569', '67525', '68285', '21331', '39457', '55371', '67074', '88277', '68171', '68325', '67196', '21249', '55498', '55321', '57464', '10436', '39581', '39458', '39350', '55445', '70097', '21452', '52206', '58185', '69182', '21248', '51550', '51244', '21259', '69295', '10096', '58197', '10351', '02527', '89274', '02291', '58163', '59652', '59370', '59169', '59072', '26082', '04138', '07283', '07038', '05176', '05163', '30002', '26005', '55089', '83101', '26112', '48147', '24504', '86180', '32275', '61304', '2A279', '72194', '16320', '24114', '17427', '77371', '80258', '09231', '31432', '47140', '76686', '16250', '09276', '09120', '27248', 

In [17]:
# step2: get the finished code list
def read_list_from_file(filename):
    """Return the lines of a text file as a list, without trailing newlines.

    Only the newline character is removed from the end of each line; any
    other whitespace is kept.
    """
    with open(filename, 'r') as handle:
        return [raw_line.rstrip('\n') for raw_line in handle]


def get_code_list(parent_dir="/home/pengfei/data_set/fr_commune_distance/code_split_test",
                  start=1, end=127):
    """Collect the codes stored in the ``part_<i>.txt`` files of a directory.

    Args:
        parent_dir: Directory that contains the ``part_<i>.txt`` files.
        start: Index of the first part file to read (inclusive).
        end: Index of the last part file to read (inclusive).

    Returns:
        A list with one entry per line of every file, in file order, with
        surrounding whitespace stripped from each entry.
    """
    code_list = []
    for i in range(start, end + 1):
        filename = f"{parent_dir}/part_{i}.txt"
        # strip() also removes stray spaces or carriage returns around a code
        code_list.extend(line.strip() for line in read_list_from_file(filename))
    return code_list


part_list = get_code_list()

In [11]:
# step3: get the diff list
diff_list = list(set(full_code_list) - set(part_list))

In [18]:
print(len(diff_list))

274


In [13]:
# step4: write the diff code list to files
def split_list_into_parts(list_to_split, num_parts):
    """Split a list into ``num_parts`` lists of (almost) equal length.

    Each part first receives a contiguous slice of ``len // num_parts``
    items. Items left over at the end of the input are then appended one by
    one to the first parts, so part sizes differ by at most one. The input
    list itself is not modified.
    """
    chunk_len, leftover_count = divmod(len(list_to_split), num_parts)

    # Equal-sized contiguous slices
    chunks = []
    for idx in range(num_parts):
        begin = idx * chunk_len
        chunks.append(list_to_split[begin:begin + chunk_len])

    # Hand the trailing items out, one per part, starting with the first part
    if leftover_count != 0:
        tail = list_to_split[num_parts * chunk_len:]
        for chunk, extra in zip(chunks, tail):
            chunk.append(extra)

    return chunks

In [14]:
def write_list_to_file(filename, lines):
    """Write every entry of ``lines`` to ``filename``, one entry per line.

    The file is opened in 'w' mode, so existing content is replaced; a
    newline character is appended to each entry.
    """
    with open(filename, 'w') as out:
        out.writelines(entry + '\n' for entry in lines)

In [16]:
import os

parts = split_list_into_parts(diff_list, 2)
parent_dir = "/tmp/code_split_test"

# Make sure the output directory exists; opening a file for writing inside a
# missing directory raises FileNotFoundError.
os.makedirs(parent_dir, exist_ok=True)

# Write each part to a separate file
for i, part in enumerate(parts):
    filename = f"{parent_dir}/part_{i}.txt"
    write_list_to_file(filename, part)
    print(f"Part {i} has been written to {filename}.")

Part 0 has been written to /tmp/code_split_test/part_0.txt.
Part 1 has been written to /tmp/code_split_test/part_1.txt.
