**Initializing Dask-SLURM cluster configuration**

In [None]:
from dask_jobqueue import SLURMCluster

In [2]:
cluster = SLURMCluster(
    cores=60,
    memory="120GB",
    name="quotebank_earthquake",
    processes=60
)

In [3]:
from dask.distributed import Client
client = Client(cluster)

In [4]:
cluster.scale_up(4)

## Earthquake quotes extraction notebook

In [5]:
import dask.bag as db
import dask.dataframe as dd
import dask.config

In [6]:
import numpy as np
import pandas as pd
import plotly.express as px
import re
import json

import pycountry

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import matplotlib
import matplotlib.pyplot as plt

import tqdm

In [7]:
disasters = pd.read_csv("data/dataset/emdat/emdat_public_2021_11_06.csv", sep="\t")

In [8]:
# sort disasters by total deaths

disasters = disasters.sort_values(by=["Total Deaths"], ascending=False)

In [9]:
# select only disasters that happened after 2008

disasters = disasters[disasters["Year"] >= 2008]
disasters = disasters[disasters["Disaster Type"] == "Earthquake"]

In [10]:
disasters.head()

Unnamed: 0,Dis No,Year,Seq,Glide,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Country,ISO,Region,Continent,Location,Origin,Associated Dis,Associated Dis2,OFDA Response,Appeal,Declaration,Aid Contribution,Dis Mag Value,Dis Mag Scale,Latitude,Longitude,Local Time,River Basin,Start Year,Start Month,Start Day,End Year,End Month,End Day,Total Deaths,No Injured,No Affected,No Homeless,Total Affected,Reconstruction Costs ('000 US$),Insured Damages ('000 US$),Total Damages ('000 US$),CPI,Adm Level,Admin1 Code,Admin2 Code,Geo Locations
0,2010-0017-HTI,2010,17,EQ-210-000009,Natural,Geophysical,Earthquake,Ground movement,,,Haiti,HTI,Caribbean,Americas,"Port-au-prince, Kenscoff municipalities (Port-...",,,,Yes,,,,7.0,Richter,18.443,-72.571,16:53,,2010,1.0,12.0,2010,1.0,12.0,222570.0,300000.0,3400000.0,,3700000.0,11500000.0,200000.0,8000000.0,84.252733,2,,17168;17177;17194;17197;17198;17201;17203,"Cayes, Croix-Des-Bouquets, Gonaives, Jacmel, J..."
3,2008-0192-CHN,2008,192,EQ-2008-000062,Natural,Geophysical,Earthquake,Ground movement,,,China,CHN,Eastern Asia,Asia,"Wenchuan Xian, Aba Xian areas (Ngawa Tibetan a...",,"Slide (land, mud, snow, rock)",,Yes,,,303274.0,8.0,Richter,31.002,103.322,14:28,,2008,5.0,12.0,2008,5.0,12.0,87476.0,366596.0,45610000.0,,45976596.0,,300000.0,85000000.0,83.189023,1;2,900;902;905;909;911;912;920;923;929,13255;13259;13260;13270,"Chongqing Shi, Gansu Sheng, Guizhou Sheng, Hen..."
11,2011-0082-JPN,2011,82,,Natural,Geophysical,Earthquake,Tsunami,,,Japan,JPN,Eastern Asia,Asia,"Hokkaidoo, Akita, Aomori, Yamagata, Miyagi, Iw...",,Fire,Industrial accidents,Yes,,,,9.0,Richter,38.297,142.373,14:46,,2011,3.0,11.0,2011,3.0,11.0,19846.0,5933.0,362887.0,,368820.0,,37500000.0,210000000.0,86.912465,1,1652;1653;1657;1659;1661;1663;1665;1668;1673;1...,,"Akita, Aomori, Gunma, Hokkaidoo, Hukusima, Iba..."
16,2015-0144-NPL,2015,144,EQ-2015-000048,Natural,Geophysical,Earthquake,Ground movement,,,Nepal,NPL,Southern Asia,Asia,"Gorkha area (Gandaki district, Western provinc...",,"Slide (land, mud, snow, rock)",,Yes,,,,8.0,Richter,28.23,84.731,11:56,,2015,4.0,25.0,2015,4.0,25.0,8831.0,17932.0,5621790.0,,5639722.0,,100000.0,5174000.0,91.579117,2,,22351;22352;22363,"Bagmati, Gandaki, Janakpur (Adm2)."
25,2018-0352-IDN,2018,352,EQ-2018-000122,Natural,Geophysical,Earthquake,Tsunami,,,Indonesia,IDN,South-Eastern Asia,Asia,"Dongalla, Sigi (dongalla), Parigi Moutong, Pal...",,Tsunami/Tidal wave,Liquefaction,Yes,Yes,Yes,,8.0,Richter,-178.0,119.84,,,2018,9.0,28.0,2018,9.0,28.0,4140.0,,2000000.0,,2000000.0,1600000.0,,1450000.0,97.023129,2,,73671;73741;73785;73821;73830,"Donggala, Kota Palu, Mamuju Utara, Parigimouto..."


In [11]:
# alternative english names for countries
with open('data/dataset/alternative_country_names.json') as json_file:
    alternative_country_names = json.load(json_file)

In [12]:
# similar words for naturals disasters based on https://relatedwords.org/
with open('data/dataset/similar_disaster_type_words.json') as json_file:
    similar_disaster_type_words = json.load(json_file)

In [13]:
def parse_geo_location(*texts):
    """ parses geo locations from disasters to to list """
    ret_set = set()
    for text in texts:
        if( pd.isna(text) ):
            continue
        
        text = text.replace("(Adm1). ", "").replace("(Adm2). ", "")

        text_without_parenthesis = re.sub(r"\([^()]*\)", "", text)

        text_splitted = text_without_parenthesis.split(",")

        stripped_text = [s.strip() for s in text_splitted]
        
        stripped_text = filter(lambda x: x != "", stripped_text)
        
        ret_set.update(stripped_text)
    return ret_set

In [14]:
def isascii(s):
    """ check if the characters in string s are in ASCII, U+0-U+7F."""
    return len(s) == len(s.encode())

In [15]:
def build_regex_from_groups(groups):
    """ converts "groups" to regex """
    
    regex = ""
    for group in groups:
        ascii_group = list(filter(isascii, group))
        
        if len(ascii_group) == 0:
            continue
        regex += "(?=.*" + "|.*".join(list(ascii_group)) + ")"
    return regex.lower()  

In [16]:
def create_regex_for_disaster(disaster):
    """ creates regex based on location and year and disaster type """
    
    country = next(item for item in alternative_country_names if item["alpha3code"] == disaster["ISO"])
    location_group = parse_geo_location(disaster["Geo Locations"], disaster["Country"], country["name"], country["otherNames"])
    
    year_group = str(disaster["Year"])

    disaster_type_group = [disaster["Disaster Type"]]
    if disaster["Disaster Type"] in similar_disaster_type_words:
        disaster_type_group += similar_disaster_type_words[disaster["Disaster Type"]]
    
    return build_regex_from_groups([[year_group], location_group]).lower()


disasters["regex"] = disasters.apply(lambda disaster: create_regex_for_disaster(disaster), axis=1)

In [17]:
disasters_regex = disasters[["Dis No", "regex"]].set_index("Dis No").to_dict('index')

In [18]:
for disaster_regex_key in list(disasters_regex.keys())[:5]:
    print(disasters_regex[disaster_regex_key]["regex"], end="\n\n")


(?=.*2010)(?=.*gonaives|.*haiti|.*croix-des-bouquets|.*port-au-prince|.*leogane|.*jeremie|.*jacmel|.*cayes)

(?=.*2008)(?=.*ch'in empire|.*china|.*henan sheng|.*chongqing shi|.*mainland china|.*deyang|.*gansu sheng|.*new china|.*hunan sheng|.*guizhou sheng|.*red china|.*prc|.*yunnan sheng chengdu|.*cathay|.*ngawa tibetan and qiang|.*shenzhou|.*shaanxi sheng|.*hubei sheng|.*mianyang|.*communist china|.*zhongguo|.*shanxi sheng|.*people's republic of china)

(?=.*2011)(?=.*gipangu|.*hinomoto|.*yamagata|.*aomori|.*akita|.*tookyoo|.*miyagi|.*totigi|.*hokkaidoo|.*nihon|.*nagano|.*tiba|.*saitama|.*yamato|.*kanagawa|.*hukusima|.*nippon|.*japan|.*gunma|.*zipangu|.*cipangu|.*ibaraki|.*iwate)

(?=.*2015)(?=.*janakpur|.*gandaki|.*bagmati|.*federal democratic republic of nepal|.*nepal)

(?=.*2018)(?=.*indunesia|.*republik indonesia|.*dutch east indies|.*mamuju utara|.*indonesia|.*netherlands east-indies|.*poso|.*insulinde|.*parigimoutong|.*hindia belanda|.*republic of indonesia|.*kota palu|.*dongga

In [None]:
DATASET_FAST_PATH = "data/dataset/quotebank"

quotebank_df_array = [dd.read_parquet(f"{DATASET_FAST_PATH}/quotes-{year}.parquet") for year in range(2008, 2021)]

dataset = dd.concat(quotebank_df_array, interleave_partitions=True)

In [None]:
# outputs the quotes that contain any of the given words

earthquake_dataset = dataset[dataset.quotation.str.lower().str.contains("(?=.*earthquake|.*earthquake|.*quake|.*aftershock|.*tremor|.*temblor|.*epicenter|.*seismic|.*magnitude|.*crust|.*seism)", regex=True)]

earthquake_dataset.to_parquet(f"data/dataset/earthquake_dataset.parquet", schema="infer")

In [19]:
earthquake_db = dd.read_parquet("data/dataset/earthquake_dataset.parquet")

earthquake_db = earthquake_db.repartition(npartitions = 200)

earthquake_db = earthquake_db.persist()

In [None]:
# manual filtering of earthquakes

regex = build_regex_from_groups([["mae lao"]])
print(regex)

filtered_dataset = earthquake_db[earthquake_db.quotation.str.lower().str.contains(regex, regex=True)]

filtered_dataset.to_csv(f"data/dataset/earthquake_dataset_labels/2014-0159-THA.csv", single_file=True)

In [22]:
cluster.close()