# 0. Càrrega de les llibreries

In [1]:
import pandas as pd
import requests
import math
import time
import plotly.express as px
from bs4 import BeautifulSoup

# 1. Web scraping

### Funció getEarthquakes per a obtenir els terratrèmols

In [2]:
# function earthquakes

# Number of earthquakes the results table lists per page.
_RESULTS_PER_PAGE = 20

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 30

# Column order of the generated CSV (must match the row layout built below).
_CSV_COLUMNS = [
    'Date',
    'Time UTC',
    'Latitude',
    'Latitude Cardinal',
    'Signed Latitude',
    'Longitude',
    'Longitude Cardinal',
    'Signed Longitude',
    'Depth',
    'Magnitude Type',
    'Magnitude',
    'Region',
    'Last Update',
    'Earthquake ID',
]


def _fetch_soup(url, headers):
    """Download ``url`` and return ``(soup, response_delay_in_seconds)``."""
    t0 = time.time()
    page = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    response_delay = time.time() - t0

    # Fail early with an HTTP error instead of parsing an error page.
    page.raise_for_status()

    # NOTE(review): no parser is named, exactly as before. The row index used
    # to read the result count depends on how the HTML is parsed, so the
    # parser selection is deliberately left untouched.
    return BeautifulSoup(page.content), response_delay


def _parse_earthquake_row(earthquake):
    """Extract the CSV fields (all except the id) from one results-table row.

    Returns a list ordered like ``_CSV_COLUMNS`` without its last entry.
    Raises ``IndexError`` when the row holds fewer than five
    ``tabev1``/``tabev2`` cells (latitude, N/S, longitude, E/W, magnitude).
    """
    # Reset every field per row so a missing cell never inherits the value
    # parsed from the previous row.
    date = time_utc = depth = magnitude_type = row_region = last_update = ''

    # Collects latitude, N/S, longitude, E/W and magnitude in document order.
    lat_lon = []

    for child in earthquake.findChildren('td'):
        css_classes = child.get('class') or ['']
        cell_class = css_classes[0]

        if cell_class == 'tabev6':
            # Non-breaking spaces separate the date from the time.
            dateTime = child.a.text.replace(u'\xa0', u' ').partition('   ')
            date = dateTime[0]
            time_utc = dateTime[2]
        elif cell_class in ('tabev1', 'tabev2'):
            lat_lon.append(child.text)
        elif cell_class == 'tabev3':
            depth = child.text
        elif cell_class == 'tabev5':
            magnitude_type = child.text
        elif cell_class == 'tb_region':
            row_region = child.text.strip()
        elif cell_class == 'comment':
            last_update = child.text

    lat_lon = [w.replace(u'\xa0', u' ').strip() for w in lat_lon]

    latitude = lat_lon[0]
    latitude_card = lat_lon[1]
    longitude = lat_lon[2]
    longitude_card = lat_lon[3]
    magnitude = lat_lon[4]

    # Southern latitudes and western longitudes are stored as negatives.
    signed_lat = -1 * float(latitude) if latitude_card == 'S' else float(latitude)
    signed_lon = -1 * float(longitude) if longitude_card == 'W' else float(longitude)

    return [
        date,
        time_utc,
        latitude,
        latitude_card,
        signed_lat,
        longitude,
        longitude_card,
        signed_lon,
        depth,
        magnitude_type,
        magnitude,
        row_region,
        last_update,
    ]


def getEarthquakes(start_date, end_date, min_depth, max_depth, min_mag, max_mag, region, min_intens, max_intens, file_name):
    """Scrape the EMSC earthquake search results and save them as a CSV file.

    Every filter argument is a string that is concatenated verbatim into the
    query URL; an empty string leaves that filter blank.

    Parameters
    ----------
    start_date, end_date : str
        Values for the ``start_date`` / ``end_date`` query parameters.
    min_depth, max_depth : str
        Values for the depth filter.
    min_mag, max_mag : str
        Values for the magnitude filter.
    region : str
        Value for the ``region`` query parameter, already formatted for the URL.
    min_intens, max_intens : str
        Values for the intensity filter.
    file_name : str
        Path of the semicolon-separated CSV file to write.

    Returns
    -------
    None
        Prints progress information. When the page reports
        'No results found', nothing is written.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # create the URL
    str_web = ('https://www.emsc-csem.org/Earthquake/?filter=yes' +
               '&start_date=' + start_date +
               '&end_date=' + end_date +
               '&min_depth=' + min_depth +
               '&max_depth=' + max_depth +
               '&min_mag=' + min_mag +
               '&max_mag=' + max_mag +
               '&region=' + region +
               '&min_intens=' + min_intens +
               '&max_intens=' + max_intens)

    # Browser-like headers, sent with EVERY request.
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "es-MX,es;q=0.8,en-US;q=0.5,en;q=0.3",
        "Cache-Control": "max-age=0",
        "DNT": "1",
        "Pragma": "no-cache",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0"
    }

    # The first page provides both the result count and the first rows.
    soup, response_delay = _fetch_soup(str_web + '&view=1', headers)

    # check for results
    no_results_found = soup.body.findAll(text='No results found')
    if len(no_results_found) == 1:
        print('No results found')
        return

    # look for the number of results for the selected input
    results_found = soup.body.find(id='content')
    results_found = results_found.findAll('tr')[18].td.text.split(' ')[3].strip()
    print('results found: ' + results_found, '\n')

    i = math.ceil(int(results_found) / _RESULTS_PER_PAGE)
    print('number of loops: ' + str(i), '\n')

    earthquake_list = []
    seen_ids = set()

    # loop through pages
    for j in range(i):

        print('loop number: ' + str(j + 1))

        if j > 0:
            # Pause once per request, proportionally to the last response
            # time, to avoid saturating the server.
            time.sleep(2 * response_delay)
            soup, response_delay = _fetch_soup(str_web + '&view=' + str(j + 1), headers)

        # look for earthquakes list in html
        results_list = soup.find(id='tbody')
        earthquakes = results_list.find_all('tr', class_=lambda x: x != 'autour' and x != 'black')

        # Pages after the first repeat the last row of the previous page.
        if j == 0:
            print('found: ' + str(len(earthquakes)), '\n')
        else:
            print('found: ' + str(len(earthquakes) - 1), '\n')

        # loop through earthquakes in every page
        for earthquake in earthquakes:
            earthquake_id = earthquake['id']

            # Skip rows already collected (rows repeated across pages).
            if earthquake_id in seen_ids:
                continue
            seen_ids.add(earthquake_id)

            earthquake_list.append(_parse_earthquake_row(earthquake) + [earthquake_id])

    # generate CSV; naming the columns up front also works for an empty list
    df = pd.DataFrame(earthquake_list, columns=_CSV_COLUMNS)
    df.to_csv(file_name, sep=';', index=False)


### Exemple de crida a la funció getEarthquakes

Per a obtenir les dades d'una regió en concret, cal anar a la web per a entendre el format. Per exemple, si es vol obtenir els terratrèmols de l'Espanya peninsular, s'ha d'escriure:

#### SPAIN. 

Si es vol obtenir els terratrèmols de les Illes Balears, cal escriure:

#### BALEARIC+ISLANDS,+SPAIN

I per als de les illes Canàries:

#### CANARY+ISLANDS,+SPAIN+REGION

Com es pot comprovar, el format és diferent. 

Si, en canvi, es vol obtenir els terratrèmols tant de l'Espanya peninsular com de la insular, cal escriure:

#### BALEARIC+ISLANDS,+SPAIN|CANARY+ISLANDS,+SPAIN+REGION|SPAIN 

(s'han de separar les regions amb el símbol "|"). 

Per a d'altres regions, cal anar a la següent pàgina i buscar la regió desitjada per a conèixer el format requerit:

https://www.emsc-csem.org/Earthquake/?filter=yes

In [3]:
# Date window of the query
start_date = '2020-08-01'
end_date = '2020-10-30'

# Empty strings leave the depth, magnitude and intensity filters blank
min_depth = max_depth = ''
min_mag = max_mag = ''
min_intens = max_intens = ''

# Peninsular Spain plus both archipelagos, separated by "|"
region = 'BALEARIC+ISLANDS,+SPAIN|CANARY+ISLANDS,+SPAIN+REGION|SPAIN'

# Destination CSV file
file_name = 'earthquakes-spain-aug-oct-2020.csv'

getEarthquakes(
    start_date=start_date,
    end_date=end_date,
    min_depth=min_depth,
    max_depth=max_depth,
    min_mag=min_mag,
    max_mag=max_mag,
    region=region,
    min_intens=min_intens,
    max_intens=max_intens,
    file_name=file_name,
)

results found: 174 

number of loops: 9 

loop number: 1
found: 20 

loop number: 2
found: 20 

loop number: 3
found: 20 

loop number: 4
found: 20 

loop number: 5
found: 20 

loop number: 6
found: 20 

loop number: 7
found: 20 

loop number: 8
found: 20 

loop number: 9
found: 14 



# 2. Visualització

In [5]:
# Read the scraped catalogue back from the semicolon-separated CSV
file_name = 'earthquakes-spain-aug-oct-2020.csv'
df = pd.read_csv(file_name, sep=";", engine="python")

# Marker size is scaled as 2.7 ** magnitude
marker_size = 2.7 ** df['Magnitude']

# Columns shown in the hover tooltip
hover_columns = [
    df[column]
    for column in ('Region', 'Signed Longitude', 'Signed Latitude', 'Magnitude')
]

# Plot every earthquake on an orthographic globe
fig = px.scatter_geo(
    df,
    lon=df['Signed Longitude'],
    lat=df['Signed Latitude'],
    hover_data=hover_columns,
    size=marker_size,
    projection='orthographic',
)
fig.show()