In [1]:
from pyspark.sql import SparkSession
import csv
from math import radians, cos, sin, asin, sqrt
import pandas as pd
import math


In [2]:
def calculate_distance(lat1, lon1, lat2, lon2):
    R = 6371.0

    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)
    
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    a = math.sin(dlat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    distance = R * c * 1000 #m
    return distance

In [3]:
def display_results(data):
    data_to_display = pd.DataFrame(data.take(10), columns=['Place', 'Distance'])
    display(data_to_display)

In [4]:
spark = SparkSession.builder.appName("CafeAnalyzer").getOrCreate()

In [5]:
columns = ['ID', 'Name', 'global_id', 'IsNetObject', 'OperatingCompany', 'TypeObject', 'AdmArea', 'District', 'Address', 'PublicPhone', 'SeatsCount', 'SocialPrivileges', 'Longitude_WGS84', 'Latitude_WGS84', 'geoData']

# Перед импортом файлов, они были предварительно залиты в hdfs с помощью команд:
./hdfs dfs -copyFromLocal /home/ubuntu/Desktop/places.csv /data

In [6]:
df = spark.read.csv("/data", header=False, inferSchema=True).toDF(*columns)

In [7]:
df.printSchema()


root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- global_id: integer (nullable = true)
 |-- IsNetObject: string (nullable = true)
 |-- OperatingCompany: string (nullable = true)
 |-- TypeObject: string (nullable = true)
 |-- AdmArea: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- PublicPhone: string (nullable = true)
 |-- SeatsCount: integer (nullable = true)
 |-- SocialPrivileges: string (nullable = true)
 |-- Longitude_WGS84: double (nullable = true)
 |-- Latitude_WGS84: double (nullable = true)
 |-- geoData: string (nullable = true)



In [8]:
rdd = df.rdd

In [9]:
lat, lng = 55.751244, 37.618423


In [10]:
distances = rdd.map(lambda x: (x['Name'], calculate_distance(lng, lat, x['Longitude_WGS84'], x['Latitude_WGS84'])))

In [11]:
display_results(distances)

Unnamed: 0,Place,Distance
0,Шоколадница,6969.517992
1,МУ-МУ,6965.617
2,КОМБИНАТ ПИТАНИЯ МГТУ ИМ.Н.Э.БАУМАНА,6917.673365
3,Дом 12,4893.184433
4,Чито-Ра,5032.89803
5,Бар- буфет «Николай»,4446.534553
6,Флорентини,4536.377751
7,Beer Gik,2017.362604
8,Погребок,2017.362604
9,Пробка Гриль,2154.4644


In [12]:
pairs = rdd.cartesian(rdd)
distances_between = pairs.map(lambda x: ((x[0]['Name'], x[1]['Name']), calculate_distance(x[0]['Longitude_WGS84'], x[0]['Latitude_WGS84'], x[1]['Longitude_WGS84'], x[1]['Latitude_WGS84'])))
filtered_distances = distances_between.filter(lambda x: x[0][0] != x[0][1]).distinct()

In [13]:
display_results(filtered_distances)

Unnamed: 0,Place,Distance
0,"(Шоколадница, МУ-МУ)",130.666047
1,"(Шоколадница, КОМБИНАТ ПИТАНИЯ МГТУ ИМ.Н.Э.БАУ...",674.525953
2,"(Шоколадница, Дом 12)",2119.366066
3,"(Шоколадница, Чито-Ра)",2002.204035
4,"(Шоколадница, Бар- буфет «Николай»)",2524.038187
5,"(Шоколадница, Флорентини)",2434.066442
6,"(Шоколадница, Beer Gik)",5016.245832
7,"(Шоколадница, Погребок)",5016.245832
8,"(Шоколадница, Пробка Гриль)",4876.10231
9,"(Шоколадница, TEMPO DI PASTA)",4880.967076


In [14]:
# Нахождение 10 наиболее близких и наиболее отдаленных заведений
closest = distances.takeOrdered(10, key=lambda x: x[1])
furthest = distances.takeOrdered(10, key=lambda x: -x[1])

In [15]:
distances_df = pd.DataFrame(closest, columns=['Pair', 'Distance'])
display(distances_df)

Unnamed: 0,Pair,Distance
0,Calabash Club,1425.016374
1,Залечь на дно,1425.016374
2,Политех,1425.016374
3,Антикафе Checkpoint,1425.016374
4,Шоколадница,1562.708553
5,БИБЛИОТЕКА Shisha Lounge,1588.875924
6,Му-Му,1588.875924
7,Мареа,1594.76978
8,Стейк Хаус «Бизон»,1602.388421
9,Софра,1613.146536


In [16]:
distances_df = pd.DataFrame(furthest, columns=['Pair', 'Distance'])
display(distances_df)

Unnamed: 0,Pair,Distance
0,МНИТИ,10624.894092
1,Школа 435,10057.758226
2,Школа 414,10051.011713
3,Византия,9871.888928
4,Мята Lounge,9547.175252
5,Bodrum,9420.643413
6,Ешь хлеб,9154.010995
7,Ля Фантази,9130.707662
8,ШефДонер,9130.707662
9,Take & Wake,9130.707662
