In [None]:
import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
import numpy as np
from pyspark.sql.functions import col, struct, spark_partition_id
import math
import pandas as pd
from pyspark.sql.functions import *

spark = SparkSession.builder.getOrCreate()

#read the csv file (it has no header row) and name the four columns
df = spark.read.csv("C:/Users/nikos/Desktop/partitiondata.csv", header=False).toDF("Id", "Text", "Latitude", "Longitude")

#strip the quote / parenthesis characters left in the raw coordinate columns, then cast them to float
dataset = (df
    .withColumn("Latitude", regexp_replace('Latitude', '["(]', '').cast("float"))
    .withColumn("Longitude", regexp_replace('Longitude', '[)"]', '').cast("float")))

#find the minimum and maximum values of longitude and latitude
#(one aggregation = one pass over the data instead of four separate Spark jobs;
# max/min here are the pyspark.sql.functions versions brought in by the star import)
maxLat, minLat, maxLon, minLon = dataset.agg(
    max(dataset.Latitude),
    min(dataset.Latitude),
    max(dataset.Longitude),
    min(dataset.Longitude),
).head()

#the id of each cell of the grid
#NOTE(review): `ids` is not referenced anywhere in the visible code and dataset.count()
#triggers a full Spark job - drop this line if nothing else uses it
ids = np.zeros(dataset.count())

#number of cells along each axis of the grid
GRID_SIZE = 10

stepLon = (maxLon - minLon) / GRID_SIZE
stepLat = (maxLat - minLat) / GRID_SIZE


def _cell_index(coordinate, lower, step):
    """Return the 0-based cell index column of `coordinate` along one axis of the grid."""
    idx = ((coordinate - lower) / step).cast("Int")
    #a point lying exactly on the upper bound yields index GRID_SIZE; without the clamp
    #lonCell * GRID_SIZE + latCell would collide with an unrelated cell
    #(e.g. lon=3, lat=10 -> 40 == lon=4, lat=0). Null coordinates stay null.
    return when(idx >= GRID_SIZE, GRID_SIZE - 1).otherwise(idx)


#gridID = longitude cell * GRID_SIZE + latitude cell
df2 = dataset.withColumn(
    "gridID",
    _cell_index(dataset.Longitude, minLon, stepLon) * GRID_SIZE
    + _cell_index(dataset.Latitude, minLat, stepLat),
)

#map every distinct cell id that actually contains points to a dense partition number 0..n-1
mapping = {k: i for i, k in enumerate(
    df2.select("gridID").distinct().rdd.flatMap(lambda x: x).collect()
)}

#partition by the distinct cell id of the grid:
#build (gridID, whole row) pairs, send each key to its own partition, then drop the key again
result = (df2
    .select("gridID", struct([c for c in df2.columns]))
    .rdd.partitionBy(len(mapping), lambda k: mapping[k])
    .values()
    .toDF(df2.schema))

print("Number of partitions: {}".format(result.rdd.getNumPartitions()))

#glom() turns every partition into a list so that its size can be reported
partitions = result.rdd.glom().collect()
for i, part in enumerate(partitions):
    print ("partition #{} length: {}".format(i, len(part)))

#result.write.partitionBy('gridID').format("csv").save('C:/Users/nikos/Desktop/gridPartitions')

#function which computes the jaccard similarity
def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated tokens of two texts.

    The similarity is |A & B| / |A | B|, where A and B are the sets of tokens
    obtained by splitting each text on whitespace.

    Args:
        str1: first text; None is treated as an empty text.
        str2: second text; None is treated as an empty text.

    Returns:
        A float in [0.0, 1.0]. When neither text contains a token there is
        nothing to compare, so 0.0 is returned instead of dividing by zero.
    """
    a = set((str1 or "").split())
    b = set((str2 or "").split())
    shared = len(a & b)
    # |A | B| by inclusion-exclusion, as in the original formula
    total = len(a) + len(b) - shared
    if total == 0:
        return 0.0
    return float(shared) / total

#function which computes the euclidean distance
def euclidean_distance(object1, object2):
    """Return the straight-line distance between two indexable records.

    Position 3 of each record is used as the x coordinate and position 2 as
    the y coordinate.
    """
    delta_x = object2[3] - object1[3]
    delta_y = object2[2] - object1[2]
    return math.sqrt(delta_x**2 + delta_y**2)

#---------------------------------------------- First function ----------------------------------------------

#first function: brute force, compares each point of a partition against all the other points
#of the same partition and keeps those that are closer than theta and textually more similar than e
def func1(iterator, theta=0.01, e=0.7):
    """Find, for every point of a partition, the ids of its near and similar points.

    Args:
        iterator: iterable of records indexable as (id, text, latitude,
            longitude, ...), as handed over by ``mapPartitions``.
        theta: a candidate is kept only if its Euclidean distance to the
            point is strictly less than this value.
        e: a candidate is kept only if its Jaccard text similarity to the
            point is strictly greater than this value.

    Returns:
        A list with one entry per input record, in input order; each entry is
        the list of ids (position 0) of the matching records. Records that
        compare equal to the point itself are never reported.
    """
    # mapPartitions passes a one-shot iterator. Looping over it in two nested
    # loops lets the inner loop exhaust it during the first outer iteration,
    # so only the first point would ever be processed. Materialise it first.
    rows = list(iterator)

    return [
        [
            other[0]
            for other in rows
            if other != obj
            and euclidean_distance(obj, other) < theta
            and jaccard_similarity(obj[1], other[1]) > e
        ]
        for obj in rows
    ]

#run the first function once per partition and bring all the neighbour lists back to the driver
per_partition_1 = result.rdd.mapPartitions(func1)
nearest_points_1 = per_partition_1.collect()

print("Nearest points for each partition from first function: {}".format(nearest_points_1))


#---------------------------------------------- Second function ----------------------------------------------

#second function: sorts the points on the x axis (longitude) and compares each point only with the points
#that follow it, stopping as soon as the gap on the x axis alone reaches theta
def func2(iterator, theta=0.001, e=0.7):
    """Find near and similar points of a partition with a sweep over longitude.

    Args:
        iterator: iterable of records indexable as (id, text, latitude,
            longitude, ...), as handed over by ``mapPartitions``.
        theta: two points match only if their Euclidean distance is strictly
            less than this value.
            NOTE(review): func1 uses 0.01 for the same threshold - confirm
            which of the two values is intended.
        e: two points match only if their Jaccard text similarity is strictly
            greater than this value.

    Returns:
        A list with one entry per input record, in input order; each entry is
        the list of ids (position 0) of the matching records. An empty
        partition yields an empty list. Records with a missing latitude or
        longitude cannot be placed on the sweep and keep an empty entry.
    """
    rows = list(iterator)

    #one list of nearest points per record, indexed by the record's original position
    points = [[] for _ in rows]

    #original positions of the records, ordered by longitude
    order = sorted(
        (i for i, row in enumerate(rows) if row[2] is not None and row[3] is not None),
        key=lambda i: rows[i][3],
    )

    for pos, i in enumerate(order):
        current = rows[i]
        for k in range(pos + 1, len(order)):
            j = order[k]
            candidate = rows[j]

            #records are sorted by longitude, so once the longitude gap alone reaches theta
            #no later record can be within theta. The Euclidean distance itself is not
            #monotonic along the sweep and must not be used as the stop condition.
            if candidate[3] - current[3] >= theta:
                break

            if (euclidean_distance(current, candidate) < theta
                    and jaccard_similarity(current[1], candidate[1]) > e):
                #the relation is symmetric: record the match on both sides
                points[i].append(candidate[0])
                points[j].append(current[0])

    return points

#run the second function once per partition and bring all the neighbour lists back to the driver
per_partition_2 = result.rdd.mapPartitions(func2)
nearest_points_2 = per_partition_2.collect()

print("Nearest points for each partition from second function: {}".format(nearest_points_2))