# PySpark Exercises 1.a.ii 
---
Özgün Yargı

## Libraries

### Install Dependencies

In [None]:
!pip install pyspark haversine

Collecting haversine
  Downloading haversine-2.5.1-py2.py3-none-any.whl (6.1 kB)
Installing collected packages: haversine
Successfully installed haversine-2.5.1


### Import Libraries

In [None]:
from pyspark import SparkContext
import math
from haversine import haversine # Distance calculator

## Get the Data

In [None]:
# Location of the input file, kept in one place so it is easy to change
CAPITALS_PATH = "Capitals.txt"

# Reuse the running SparkContext if there is one, otherwise start a new one
sc = SparkContext.getOrCreate()

# RDD of the raw text lines of the capitals file
rdd_capitals = sc.textFile(CAPITALS_PATH)

## Main

### Reorganize the File

In [None]:
# Bring every line to the same format and split it into its tab-separated fields.
# NOTE: this cell rebinds `rdd_capitals`; re-run the loading cell before re-running it.

def split_capital_line(line):
  """Split one raw line of the capitals file into its tab-separated fields.

  A single trailing tab, when present, is removed first so that it does not
  produce an extra empty field at the end of the list.
  """
  if line.endswith("\t"):
    line = line[:-1]
  return line.split("\t")


# Blank lines carry no record and cannot be indexed by the later cells, so skip them
rdd_capitals = rdd_capitals.filter(lambda line: line.strip()).map(split_capital_line)

In [None]:
# Preview a few rows only; collect() would pull the whole RDD to the driver
rdd_capitals.take(5)

[['Afghanistan Flag Icon ', 'Afghanistan ', 'Kabul ', '34,53 ', '69,17'],
 ['Albania Flag Icon ', 'Albania ', 'Tirana ', '41,33 ', '19,82'],
 ['Algeria Flag Icon ', 'Algeria ', 'Algiers ', '36,75 ', '3,04'],
 ['American Samoa Flag Icon ',
  'American Samoa ',
  'Pago Pago ',
  '-14,28 ',
  '-170,70'],
 ['Andorra Flag Icon ', 'Andorra ', 'Andorra la Vella ', '42,51 ', '1,52'],
 ['Angola Flag Icon ', 'Angola ', 'Luanda ', '-8,84 ', '13,23'],
 ['Anguilla Flag Icon ', 'Anguilla ', 'The Valley ', '18,22 ', '-63,06'],
 ['Antigua and Barbuda Flag Icon ',
  'Antigua and Barbuda ',
  "St. John's ",
  '17,12 ',
  '-61,85'],
 ['Argentina Flag Icon ', 'Argentina ', 'Buenos Aires ', '-34,61 ', '-58,38'],
 ['Armenia Flag Icon ', 'Armenia ', 'Yerevan ', '40,18 ', '44,51'],
 ['Aruba Flag Icon ', 'Aruba ', 'Oranjestad ', '12,52 ', '-70,03'],
 ['Australia Flag Icon ', 'Australia ', 'Canberra ', '-35,28 ', '149,13'],
 ['Austria Flag Icon ', 'Austria ', 'Vienna ', '48,21 ', '16,37'],
 ['Azerbaijan Flag Ic

In [None]:
# Keep only the information needed for the distance computation

def parse_coordinate(text):
  """Convert a decimal-comma coordinate string such as '34,53 ' into a float."""
  return float(text.strip().replace(",", "."))


def to_capital_record(fields):
  """Build a {"City", "Latitude", "Longitude"} dictionary from one split line.

  The capital name is read from fields[2], the latitude from fields[3] and
  the longitude from fields[4].
  """
  return {
    "City": fields[2].strip(),
    "Latitude": parse_coordinate(fields[3]),
    "Longitude": parse_coordinate(fields[4]),
  }


rdd_capitals_ = rdd_capitals.map(to_capital_record)

# Preview a few records only instead of collecting the whole RDD
rdd_capitals_.take(5)

[{'City': 'Kabul', 'Latitude': 34.53, 'Longitude': 69.17},
 {'City': 'Tirana', 'Latitude': 41.33, 'Longitude': 19.82},
 {'City': 'Algiers', 'Latitude': 36.75, 'Longitude': 3.04},
 {'City': 'Pago Pago', 'Latitude': -14.28, 'Longitude': -170.7},
 {'City': 'Andorra la Vella', 'Latitude': 42.51, 'Longitude': 1.52},
 {'City': 'Luanda', 'Latitude': -8.84, 'Longitude': 13.23},
 {'City': 'The Valley', 'Latitude': 18.22, 'Longitude': -63.06},
 {'City': "St. John's", 'Latitude': 17.12, 'Longitude': -61.85},
 {'City': 'Buenos Aires', 'Latitude': -34.61, 'Longitude': -58.38},
 {'City': 'Yerevan', 'Latitude': 40.18, 'Longitude': 44.51},
 {'City': 'Oranjestad', 'Latitude': 12.52, 'Longitude': -70.03},
 {'City': 'Canberra', 'Latitude': -35.28, 'Longitude': 149.13},
 {'City': 'Vienna', 'Latitude': 48.21, 'Longitude': 16.37},
 {'City': 'Baku', 'Latitude': 40.38, 'Longitude': 49.89},
 {'City': 'Nassau', 'Latitude': 25.06, 'Longitude': -77.34},
 {'City': 'Manama', 'Latitude': 26.22, 'Longitude': 50.58},


### Find the Distance

In [None]:
# Pair every capital with every capital: the Cartesian product of the RDD with
# itself yields (capital_a, capital_b) tuples, self-pairs and both orderings included.
rdd_cartesian = rdd_capitals_.cartesian(
  rdd_capitals_
)

In [None]:
# Calculate the distances

def calculate_distance(mytuple):
  """Return the labelled haversine distance between a pair of capitals.

  Args:
    mytuple: a pair of dictionaries, each with the keys "City",
      "Latitude" and "Longitude".

  Returns:
    A tuple ("<city1>_<city2>", distance), where distance is the value
    returned by haversine() for the two (latitude, longitude) points
    (presumably kilometres, the library default - confirm in the haversine docs).
  """
  first, second = mytuple[0], mytuple[1]

  point_1 = (first["Latitude"], first["Longitude"])
  point_2 = (second["Latitude"], second["Longitude"])

  return (f"{first['City']}_{second['City']}", haversine(point_1, point_2))

# Drop the pairs that match a city with itself *before* computing distances.
# The city names are compared directly: re-splitting the "city1_city2" label on "_"
# gives wrong results as soon as a city name itself contains an underscore.
rdd_distance = (
  rdd_cartesian
  .filter(lambda pair: pair[0]["City"] != pair[1]["City"])
  .map(calculate_distance)
)

### Print the Result

In [None]:
# Order the (label, distance) pairs from the largest distance to the smallest
rdd_sorted = rdd_distance.sortBy(
  lambda pair: pair[1],
  ascending=False,
)

In [None]:
# Fetch the top rows once: every collect() call would re-run the whole job and
# bring all pairs to the driver.
top_pairs = rdd_sorted.take(3)

# The Cartesian product contains both orderings of each pair (A_B and B_A), so
# index 1 is presumably the mirror of index 0 and the second distinct pair sits
# at index 2 (assuming no other pair ties on distance).
furthest_label, furthest_distance = top_pairs[0]
second_label, second_distance = top_pairs[2]

print(f"Furthest two capitals are: {furthest_label} with a distance {furthest_distance}")
print(f"Second most furthest two capitals are: {second_label} with a distance {second_distance}")

Furthest two capitals are: Asunción_Taipei with a distance 19927.084804593487
Second most furthest two capitals are: Manama_Adamstown with a distance 19870.207778600252
