In [1]:
import os
from math import sin, cos, radians, sqrt, asin
from pyspark.sql.functions import broadcast
from pyspark.sql.types import *
from pyspark.sql.functions import udf, col
from pyspark.sql.window import Window as w
from pyspark.sql.functions import dense_rank
from pyspark.sql import SparkSession

In [2]:
def distance(lat1, lon1, lat2, lon2):
    '''
    Returns the great-circle distance in kilometres between two points,
    based on the Haversine formula.

    lat1, lon1 -- latitude/longitude of the first point, in decimal degrees
    lat2, lon2 -- latitude/longitude of the second point, in decimal degrees

    Returns a float (km), or None when any coordinate is None, so that a
    NULL column value produces a NULL distance instead of raising inside
    the Spark UDF and failing the whole job.

    Note that this formula has up to 0.5% error
    due to the Earth radius variance between Pole and Equator
    '''
    # Propagate missing coordinates as a missing distance (SQL NULL semantics).
    if lat1 is None or lon1 is None or lat2 is None or lon2 is None:
        return None

    # The Earth Radius in Km
    r = 6371
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    h = sin((lat2-lat1)/2)**2 + cos(lat1)*cos(lat2)*sin((lon2 - lon1)/2)**2
    # Rounding can push h marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make sqrt/asin raise a math domain error.
    h = min(1.0, max(0.0, h))

    return 2*r*asin(sqrt(h))

# Expose the Haversine helper to Spark as a column function producing doubles.
udf_distance = udf(f=distance, returnType=DoubleType())

In [3]:
def col_trim(col: list):
    '''
    Returns a new list in which every string of `col` has its leading and
    trailing whitespace removed. The input list is not modified.
    '''
    return [str.strip(name) for name in col]

In [4]:
# Local-mode Spark session; getOrCreate() returns the already-running session
# if there is one, so re-running this cell is harmless.
spark = (
    SparkSession.builder
    .master('local')
    .getOrCreate()
)

In [5]:
# Load the request sample: first line is the header, column types are inferred.
df = (
    spark.read
    .option('header', 'True')
    .option('inferSchema', 'True')
    .option('delimiter', ',')
    .csv(os.path.expanduser('DataSample.csv'))
)

In [6]:
# Preview the first 10 request rows with full (untruncated) cell values.
df.show(10, truncate=False)

+-------+-----------------------+-------+--------+---------+--------+---------+
|_ID    | TimeSt                |Country|Province|City     |Latitude|Longitude|
+-------+-----------------------+-------+--------+---------+--------+---------+
|4516516|2017-06-21 00:00:00.143|CA     |ON      |Waterloo |43.49347|-80.49123|
|4516547|2017-06-21 18:00:00.193|CA     |ON      |London   |42.9399 |-81.2709 |
|4516550|2017-06-21 15:00:00.287|CA     |ON      |Guelph   |43.5776 |-80.2201 |
|4516600|2017-06-21 15:00:00.307|CA     |ON      |Stratford|43.3716 |-80.9773 |
|4516613|2017-06-21 15:00:00.497|CA     |ON      |Stratford|43.3716 |-80.9773 |
|4516693|2017-06-21 14:00:00.597|CA     |ON      |Kitchener|43.4381 |-80.5099 |
|4516771|2017-06-21 10:00:00.873|CA     |ON      |Sarnia   |42.961  |-82.373  |
|4516831|2017-06-21 12:00:00.950|CA     |ON      |London   |43.0091 |-81.1765 |
|4516915|2017-06-21 15:00:01.310|CA     |ON      |London   |43.0091 |-81.1765 |
|4516953|2017-06-21 16:00:01.700|CA     

In [7]:
# Check the column names and types chosen by inferSchema.
df.printSchema()

root
 |-- _ID: integer (nullable = true)
 |--  TimeSt: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Province: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)



In [8]:
# Strip stray whitespace from the header names; the raw CSV header yields
# ' TimeSt' (leading space), which would otherwise have to be typed verbatim.
df = df.toDF(*col_trim(df.columns))

In [9]:
# Collapse requests that share the same timestamp and coordinates:
# dropDuplicates keeps one row per (TimeSt, Latitude, Longitude) combination.
# NOTE(review): if such records should be discarded entirely rather than
# reduced to a single row, this needs a different filter -- confirm intent.
df = df.dropDuplicates(['TimeSt','Latitude','Longitude'])

In [10]:
# Load the POI list: first line is the header, column types are inferred.
df_poi = (
    spark.read
    .option('header', 'True')
    .option('inferSchema', 'True')
    .option('delimiter', ',')
    .csv(os.path.expanduser('POIList.csv'))
)

In [11]:
# Preview up to 10 POI rows with full (untruncated) cell values.
df_poi.show(10,truncate=False)

+-----+---------+-----------+
|POIID| Latitude|Longitude  |
+-----+---------+-----------+
|POI1 |53.546167|-113.485734|
|POI2 |53.546167|-113.485734|
|POI3 |45.521629|-73.566024 |
|POI4 |45.22483 |-63.232729 |
+-----+---------+-----------+



In [12]:
# Check the POI column names and types chosen by inferSchema.
df_poi.printSchema()

root
 |-- POIID: string (nullable = true)
 |--  Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)



In [13]:
# Strip stray whitespace from the POI header names; the raw CSV header yields
# ' Latitude' (leading space).
df_poi = df_poi.toDF(*col_trim(df_poi.columns))

In [14]:
# Build the cross-join table (every request paired with every POI) and
# compute the distance for every possible pair. The POI coordinates are
# renamed so they do not clash with the request's Latitude/Longitude, and the
# POI side is wrapped in broadcast() as a join hint.
df_crossjoin = (
    df.crossJoin(
        broadcast(
            df_poi.select(
                col('POIID'),
                col('Latitude').alias('POI_Lat'),
                col('Longitude').alias('POI_Lon'),
            )
        )
    )
    .withColumn(
        'Distance',
        udf_distance(
            col('Latitude'),
            col('Longitude'),
            col('POI_Lat'),
            col('POI_Lon'),
        ),
    )
)

In [15]:
# Rank the candidate POIs of each request by distance and keep only the
# closest one.
# POIID is a secondary sort key so that equidistant POIs receive distinct
# ranks: with Distance alone, every tied POI would get rank 1 and the request
# would appear several times in `result` (the POI list contains entries with
# identical coordinates). Assumes POIID is unique per POI -- confirm.
result = (
    df_crossjoin
    .withColumn(
        'distance_rank',
        dense_rank().over(
            w.partitionBy('_ID')
            .orderBy(col('Distance'), col('POIID'))
        ),
    )
    .filter(col('distance_rank') == 1)
    .drop('distance_rank')
)

In [16]:
# Show the first 10 requests with their assigned POI and the distance (km).
result.show(10)

+-------+--------------------+-------+--------+---------+--------+---------+-----+---------+----------+-----------------+
|    _ID|              TimeSt|Country|Province|     City|Latitude|Longitude|POIID|  POI_Lat|   POI_Lon|         Distance|
+-------+--------------------+-------+--------+---------+--------+---------+-----+---------+----------+-----------------+
|4516516|2017-06-21 00:00:...|     CA|      ON| Waterloo|43.49347|-80.49123| POI3|45.521629|-73.566024|593.4134412324522|
|4516547|2017-06-21 18:00:...|     CA|      ON|   London| 42.9399| -81.2709| POI3|45.521629|-73.566024|677.3096549572074|
|4516550|2017-06-21 15:00:...|     CA|      ON|   Guelph| 43.5776| -80.2201| POI3|45.521629|-73.566024|569.6477373989126|
|4516600|2017-06-21 15:00:...|     CA|      ON|Stratford| 43.3716| -80.9773| POI3|45.521629|-73.566024|634.7339960242028|
|4516613|2017-06-21 15:00:...|     CA|      ON|Stratford| 43.3716| -80.9773| POI3|45.521629|-73.566024|634.7339960242028|
|4516693|2017-06-21 14:0