In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
import regex as re
import plotly.express as px
import requests
from bs4 import BeautifulSoup as BS
from io import StringIO
%matplotlib inline

## Read in data from Wikipedia page

The tables are read with `pd.read_html`; for a tutorial see https://www.marsja.se/how-to-use-pandas-read_html-to-scrape-data-from-html-tables/

In [2]:
# Wikipedia page whose HTML tables are parsed with pd.read_html in the next cell.
url = 'https://en.wikipedia.org/wiki/List_of_deadly_earthquakes_since_1900'

In [3]:
# Download the page and parse its HTML tables; the result is a list of DataFrames.
wiki_page = pd.read_html(url)

In [4]:
# Check what kind of container pd.read_html returned.
type(wiki_page)

list

In [5]:
# Count how many tables were parsed out of the page's HTML.
len(wiki_page)

5

In [6]:
# Pick the table at index 4 and keep it as the working DataFrame.
# NOTE(review): the index is hard-coded; it presumably needs updating if the
# page gains or loses tables - confirm against the live page.
wiki=wiki_page[4]

In [7]:
# Verify that the selected table is a pandas DataFrame.
type(wiki)

pandas.core.frame.DataFrame

In [8]:
# Summarise column names, non-null counts and dtypes of the raw table.
wiki.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 12 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Origin (UTC)                                       1340 non-null   object 
 1   Present-day country and link to Wikipedia article  1340 non-null   object 
 2   Lat                                                1325 non-null   object 
 3   Long                                               1325 non-null   object 
 4   Depth (km)                                         1250 non-null   object 
 5   Magnitude                                          1339 non-null   object 
 6   Secondary Effects                                  373 non-null    object 
 7   PDE Shaking Deaths                                 738 non-null    float64
 8   PDE Total Deaths                                   749 non-null    float64
 9   Utsu Tot

## Cleaning the data

#### Rename columns

In [9]:
# Map the verbose Wikipedia table headers onto short snake_case column names.
column_names = {
    "Origin (UTC)": "origin",
    "Present-day country and link to Wikipedia article": "country",
    "Lat": "lat",
    "Long": "long",
    "Depth (km)": "depth_km",
    "Magnitude": "magnitude",
    "Secondary Effects": "secondary_effects",
    "PDE Shaking Deaths": "pde_shaking_deaths",
    "PDE Total Deaths": "pde_total_deaths",
    "Utsu Total Deaths": "utsu_total_deaths",
    "EM-DAT Total Deaths": "emdat_total_deaths",
    "Other Source Deaths": "other_source_deaths",
}
wiki = wiki.rename(columns=column_names)

#### Locate and remove footnote notations, e.g. '8.8 Mw [18]'

In [10]:
# Strip footnote markers such as " [18]" (a space followed by a 1-2 digit
# number in square brackets) from the three text columns that carry them.
# regex=True is passed explicitly: pandas >= 2.0 treats str.replace patterns
# as literal text by default, which would silently leave the markers in place.
# The replacement runs column by column, so each column is handled with one
# vectorised string operation instead of one call per row.
footnote_cols = ['magnitude', 'emdat_total_deaths', 'other_source_deaths']
wiki[footnote_cols] = wiki[footnote_cols].apply(
    lambda col: col.str.replace(r'( \[\d{1,2}\])', '', regex=True)
)

#### Locate and remove "(see....)" in country column & clean up country names

In [11]:
# Normalise the country column.
# The first two patterns are regular expressions, so regex=True is passed
# explicitly (pandas >= 2.0 defaults str.replace to literal matching, which
# would make them match nothing). The remaining replacements are plain text.
wiki['country'] = (
    wiki['country']
    # Drop everything from the first " (" onwards, e.g. " (see ...)".
    .str.replace(r'( \(.*\)?)', '', regex=True)
    # Cells that consist only of the parenthetical have no leading space and
    # survive the step above; map the known case to its country.
    .str.replace(r'(\(see 1997 Cariaco earthquake\))', 'Venezuela', regex=True)
    .str.replace('Iran, 2005 Qeshm earthquake', 'Iran', regex=False)
    # Replace historical / territorial names with the names used in this analysis.
    .str.replace('US Territory of Alaska', 'United States', regex=False)
    .str.replace('Mandatory Palestine and Transjordan', 'Palestine', regex=False)
    .str.replace('Burma', 'Myanmar', regex=False)
    .str.replace('Tajik Soviet Socialist Republic', 'Tajikistan', regex=False)
    .str.replace('Kingdom of Yugoslavia', 'Yugoslavia', regex=False)
)

#### Deaths - calculate the greatest number of deaths reported for each event.

In [13]:
#Alvin's version
# delete other sources column
#del wiki['other_source_deaths']

In [17]:
# Reduce the death-count strings to one bare number per cell.
# regex=True is passed explicitly on every call: pandas >= 2.0 treats
# str.replace patterns as literal text by default, which would leave the
# markers, footnotes and commas in place and break the numeric conversion later.
#
# other_source_deaths:
#   1. drop '*' and '+' markers,
#   2. cut everything from the first footnote bracket onwards,
#   3. keep only the text before the first space (the first value when a cell
#      lists more than one),
#   4. remove commas.
wiki['other_source_deaths'] = (
    wiki['other_source_deaths']
    .str.replace(r'(\*|\+)', '', regex=True)
    .str.replace(r'(\[\d{1,2}\]?.*)', '', regex=True)
    .str.replace(r'(( .*))', '', regex=True)
    .str.replace(r'(,)', '', regex=True)
)

# emdat_total_deaths: cut everything from the first footnote bracket onwards.
wiki['emdat_total_deaths'] = wiki['emdat_total_deaths'].str.replace(
    r'(\[\d{1,2}\]?.*)', '', regex=True
)

In [22]:
# Convert the cleaned death-count text columns to numeric dtype.
death_text_cols = ['other_source_deaths', 'emdat_total_deaths']
wiki[death_text_cols] = wiki[death_text_cols].apply(pd.to_numeric)

In [37]:
#wiki[['other_source_deaths','emdat_total_deaths']]=wiki[['other_source_deaths','emdat_total_deaths']].astype(float)

In [24]:
# 'deaths' = the largest death toll reported by any of the sources for each event.
death_sources = [
    'pde_shaking_deaths',
    'pde_total_deaths',
    'utsu_total_deaths',
    'emdat_total_deaths',
    'other_source_deaths',
]
wiki['deaths'] = wiki[death_sources].max(axis=1)

#### Split magnitude column on delimiter to create two columns for magnitude value and magnitude scale

In [28]:
# Split e.g. "8.8 Mw" at the first space: the left part stays in 'magnitude',
# the remainder becomes 'magnitude_scale'.
magnitude_parts = wiki['magnitude'].str.split(' ', n=1, expand=True)
wiki[['magnitude', 'magnitude_scale']] = magnitude_parts

In [18]:
#Alvin's version run in lieu of removing footnote notations
#split on space to create new columns (there is third column because second blank space occurred for footnotes...deleted in code below)
#wiki[['magnitude','measure','del']] = wiki['magnitude'].str.split(' ',expand=True)

#created three columns deleted non essential
#del wiki['del']

#### Convert 'origin' column to datetime

In [61]:
# Parse the origin timestamps into datetime64 values.
wiki['origin'] = pd.to_datetime(wiki['origin'])

# The preview of the cleaned frame further down shows a 'year' column that no
# cell in this notebook creates (it only existed as leftover kernel state).
# Derive it here so a fresh Restart & Run All reproduces that frame.
wiki['year'] = wiki['origin'].dt.year

#### Change depth_km to numeric

In [48]:
# Turn any depth entry containing '?' into NaN, then cast the column to float.
# Series.replace (not .str.replace) is used with regex=True because the
# replacement value is NaN rather than a string.
# np.nan is used instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
wiki['depth_km'] = wiki['depth_km'].replace(r'\?', np.nan, regex=True).astype(float)

#### Change magnitude to numeric

In [54]:
# Cast the magnitude column to float.
# NOTE(review): assumes the scale suffix has already been split off into
# 'magnitude_scale' by the cell above - confirm execution order.
wiki['magnitude']=wiki['magnitude'].astype(float)

In [60]:
# Preview the first rows of the cleaned table.
wiki.head()

Unnamed: 0,origin,country,lat,long,depth_km,magnitude,secondary_effects,pde_shaking_deaths,pde_total_deaths,utsu_total_deaths,emdat_total_deaths,other_source_deaths,deaths,magnitude_scale,year
0,1900-05-11 17:23,Japan,38.7,141.1,5.0,7.0,,,,,,,,MJMA,1900
1,1900-07-12 06:25,Turkey,40.3,43.1,,5.9,,,,140.0,,,140.0,Muk,1900
2,1900-10-29 09:11,Venezuela,11.0,-66.0,0.0,7.7,,,,,,,,Mw,1900
3,1901-02-15 00:00,China,26.0,100.1,0.0,6.5,,,,,,,,Ms,1901
4,1901-03-31 07:11,Bulgaria,43.4,28.7,,6.4,,,,4.0,,,4.0,Muk,1901


## What Factors Make Earthquakes More Likely?

In [75]:
# Histogram of event counts binned by origin time.
fig = px.histogram(
    data_frame=wiki,
    x="origin",
    histfunc='count',
    title='Frequency of Major Earthquakes Over Time',
)
fig.show()

In [95]:
# Marker size is driven by 'deaths', so keep only events with a known death toll.
death_size = wiki.dropna(subset=['deaths'])

# Magnitude over time; marker size = death toll, colour = depth.
# hover_data takes column names, so pass the columns of the frame actually
# being plotted rather than a whole DataFrame (which only worked because
# iterating a DataFrame yields its column labels).
fig = px.scatter(
    death_size,
    x='origin',
    y='magnitude',
    size='deaths',
    color='depth_km',
    hover_data=list(death_size.columns),
    title='Earthquakes by Magnitude & Death Toll Over Time',
)
fig.show()

In [80]:
# Histogram of event counts binned by magnitude.
fig = px.histogram(
    data_frame=wiki,
    x="magnitude",
    histfunc='count',
    title='Distribution of Earthquakes by Magnitude',
)
fig.show()

In [97]:
# Death toll against magnitude, coloured by depth.
# hover_data takes column names, so pass them explicitly instead of the
# DataFrame itself (which only worked because iterating a DataFrame yields
# its column labels).
fig = px.scatter(
    wiki,
    x='magnitude',
    y='deaths',
    color='depth_km',
    hover_data=list(wiki.columns),
    title='Magnitude & Deaths',
)
fig.show()

In [98]:
# Histogram of event counts per country.
fig = px.histogram(
    data_frame=wiki,
    x="country",
    histfunc='count',
    title='Distribution of Earthquakes by Country',
)
fig.show()