# PySpark Exercises 1.b 
---
Özgün Yargı

## Libraries

### Install Dependencies

In [1]:
!pip install pyspark haversine

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 36 kB/s 
[?25hCollecting haversine
  Downloading haversine-2.5.1-py2.py3-none-any.whl (6.1 kB)
Collecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 58.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=7f6bb33600c6db01a0a187465fcce867bdd061ff7ab2cc03279f61bd8cabab1c
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark, haversine
Successfully installed haversine-2.5.1 py4j-0.10.9.3 pyspark-3.2.1


### Import Libraries

In [2]:
from pyspark import SparkContext
from haversine import haversine
import datetime

## Get the Data

In [37]:
# Attach to the already running SparkContext, or start one if none exists yet.
sc = SparkContext.getOrCreate()
# Every element of this RDD is one raw text line of the dataset file.
rdd_earthquake = sc.textFile(name="EarthquakeDataset-Latest.txt")

## Main

In [38]:
# Peek at the first two raw lines; take() fetches only what is needed instead
# of collecting the whole dataset to the driver.
rdd_earthquake.take(2)

['No    \tEvent ID\tDate\tOrigin Time\tLatitude\tLongitude\tDepth(km)\txM\tMD\tML\tMw\tMs\tMb\tType\tLocation',
 '000001\t20220228094421\t2022.02.28\t09:44:21.35\t37.8750\t26.9258\t012.2\t3.6\t0.0\t3.6\t3.6\t0.0\t0.0\tKe\tKUSADASI KORFEZI (EGE DENIZI)']

In [39]:
# Columns needed (0-based): Date (2), Origin Time (3), Latitude (4), Longitude (5), xM (7), Location (14).
# first() returns only the first line instead of collecting the whole dataset.
rdd_earthquake.first().split("\t")

['No    ',
 'Event ID',
 'Date',
 'Origin Time',
 'Latitude',
 'Longitude',
 'Depth(km)',
 'xM',
 'MD',
 'ML',
 'Mw',
 'Ms',
 'Mb',
 'Type',
 'Location']

In [40]:
# Get only the necessary information from the data

def filter_out_data(myline):
  """Parse one tab-separated dataset line into the fields used by the analysis.

  Columns read (0-based): Date (2), Origin Time (3), Latitude (4),
  Longitude (5), xM (7) and Location (14).

  The data contains origin times with out-of-range components (for example a
  seconds value of 73). The time components are therefore added to the parsed
  date as a ``timedelta``, which carries any overflow into the next minute,
  hour or day. This also avoids formatting a float back into text, which can
  produce more fractional digits than ``%f`` accepts.

  Args:
    myline: One data line of the dataset (not the header line).

  Returns:
    Tuple ``(origin datetime, latitude, longitude, xM, location)``.

  Raises:
    ValueError: If the date or a numeric field cannot be parsed.
    IndexError: If the line has fewer than 15 tab-separated columns or the
      origin time has fewer than three ``:``-separated parts.
  """
  sl = myline.split("\t")

  time_parts = sl[3].split(":")
  time_of_day = datetime.timedelta(
    hours=int(time_parts[0]),
    minutes=float(time_parts[1]),
    seconds=float(time_parts[2]),
  )

  # Date at midnight plus the time offset; overflowing seconds/minutes/hours roll over.
  origin_time = datetime.datetime.strptime(sl[2], '%Y.%m.%d') + time_of_day

  return (origin_time, float(sl[4]), float(sl[5]), float(sl[7]), sl[14])

# Drop the header row (the line containing "Event ID"), then parse every
# remaining line into a tuple with filter_out_data.
rdd_earthquake = rdd_earthquake.filter(lambda line: "Event ID" not in line)
rdd_earthquake_ = rdd_earthquake.map(lambda line: filter_out_data(line))

In [41]:
# Show one parsed record; first() avoids collecting the whole RDD to the driver.
rdd_earthquake_.first()

(datetime.datetime(2022, 2, 28, 9, 44, 21, 350000),
 37.875,
 26.9258,
 3.6,
 'KUSADASI KORFEZI (EGE DENIZI)')

In [42]:
# Order the parsed earthquakes from the largest xM magnitude (tuple index 3) to the smallest

sorted_earthqs = rdd_earthquake_.sortBy(keyfunc=lambda quake: quake[3], ascending=False)

In [43]:
# Take the 10 strongest earthquakes and, for each one, list the other events that
# occurred within 20 km of it and within 24 hours before or after it.

rdd_top10 = sorted_earthqs.zipWithIndex().filter(lambda ranked: ranked[1] <10)

one_day = datetime.timedelta(days=1)
header_template = "For earthquake {} occured on {}-{}-{} with magnitude {}; Followings are the foreshocks and aftershocks:"

for mainshock, _rank in rdd_top10.collect():
  main_time, main_lat, main_lon, main_magnitude, main_location = mainshock
  window_start = main_time - one_day
  window_end = main_time + one_day

  def is_related(quake):
    # Same checks, in the same order, as three chained filters:
    # distance first, then "not the main shock itself", then the time window.
    return (
      haversine((main_lat, main_lon), (quake[1], quake[2])) <= 20
      and quake[0] != main_time
      and window_start <= quake[0] <= window_end
    )

  related_rdd = rdd_earthquake_.filter(is_related)

  header = header_template.format(main_location, main_time.year, main_time.month, main_time.day, main_magnitude)
  print(header)
  print("-"*len(header))
  for quake in related_rdd.collect():
    print("  * Earthquake ", quake[4], " occured on ", quake[0].year,"-",quake[0].month,"-",quake[0].day, " with magnitude ", quake[3], sep="")
  print("\n")

For earthquake KURUTILEK- (ERZINCAN) [North East  3.0 km] occured on 1939-12-26 with magnitude 7.9; Followings are the foreshocks and aftershocks:
--------------------------------------------------------------------------------------------------------------------------------------------------


For earthquake ONIKI ADALAR (AKDENIZ) occured on 1926-6-26 with magnitude 7.7; Followings are the foreshocks and aftershocks:
-----------------------------------------------------------------------------------------------------------------------------
  * Earthquake AKDENIZ occured on 1926-6-26 with magnitude 5.5


For earthquake TÜRKIYE-IRAN SINIR BÖLGESI occured on 1930-5-6 with magnitude 7.6; Followings are the foreshocks and aftershocks:
--------------------------------------------------------------------------------------------------------------------------------
  * Earthquake KIZILCA-BASKALE (VAN) [South East  20.7 km] occured on 1930-5-7 with magnitude 5.2
  * Earthquake KIZILCA-BASKALE 