In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import random
import os 
import sys
import warnings
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import plotly.express as px
from datetime import datetime
from owid import catalog


## Data Acquisition

In [2]:
# Use pandas `read_html` method to Web Scrape data from https://ourworldindata.org/the-worlds-deadliest-earthquakes
url = 'https://ourworldindata.org/the-worlds-deadliest-earthquakes'
df_list = pd.read_html(url)
df = df_list[0]
df

Unnamed: 0,Ranking,Location,Year,Estimated death toll,Earthquake magnitude,Additional information
0,1,"Shaanxi, China",1556,830000,8,More than 97 counties in China were affected. ...
1,2,"Port-au-Prince, Haiti",2010,316000,7,Death toll is still disputed. Here we present ...
2,3,"Antakya, Turkey",115,260000,7.5,Antioch (ancient ruins which lie near the mode...
3,4,"Antakya, Turkey",525,250000,7,Severe damage to the area of the Byzantine Emp...
4,5,"Tangshan, China",1976,242769,7.5,Reported that the earthquake risk had been gre...
5,6,"Gyzndzha, Azerbaijan",1139,230000,Unknown,Often termed the Ganja earthquake. Much less i...
6,7,"Sumatra, Indonesia",2004,227899,9.1,Earthquake in Indian Ocean off the coast of Su...
7,8,"Damghan, Iran",856,200000,7.9,Estimated that extent of the damage area was 2...
8,8,"Gansu, China",1920,200000,8.3,Damage occurred across 7 provinces and regions...
9,9,"Dvin, Armenia",893,150000,Unknown,"City of Dvin was destroyed, with the collapse ..."


In [3]:
# Acquire the natural disaster dataset using owid-catalog API
data = catalog.find('natural_disasters')
data = data.iloc[3].load()
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_dead,injured,affected,homeless,total_affected,reconstruction_costs,insured_damages,total_damages,n_events,population,gdp,reconstruction_costs_per_gdp,insured_damages_per_gdp,total_damages_per_gdp,total_dead_per_100k_people,injured_per_100k_people,affected_per_100k_people,homeless_per_100k_people,total_affected_per_100k_people,n_events_per_100k_people
country,year,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Afghanistan,1954,all_disasters,2000,0,0,0,0,0,0,0,1,7864289,,,,,25.431416,0.000000,0.000000,0.0,0.000000,0.012716
Afghanistan,1954,earthquake,2000,0,0,0,0,0,0,0,1,7864289,,,,,25.431416,0.000000,0.000000,0.0,0.000000,0.012716
Afghanistan,1956,all_disasters,151,2000,0,0,2000,0,0,25000000,2,8087730,,,,,1.867026,24.728817,0.000000,0.0,24.728817,0.024729
Afghanistan,1956,earthquake,100,2000,0,0,2000,0,0,25000000,1,8087730,,,,,1.236441,24.728817,0.000000,0.0,24.728817,0.012364
Afghanistan,1956,flood,51,0,0,0,0,0,0,0,1,8087730,,,,,0.630585,0.000000,0.000000,0.0,0.000000,0.012364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe,2022,all_disasters,0,0,3000,0,3000,0,0,0,2,16320539,,,,,0.000000,0.000000,18.381746,0.0,18.381746,0.012254
Zimbabwe,2022,extreme_weather,0,0,3000,0,3000,0,0,0,1,16320539,,,,,0.000000,0.000000,18.381746,0.0,18.381746,0.006127
Zimbabwe,2022,flood,0,0,0,0,0,0,0,0,1,16320539,,,,,0.000000,0.000000,0.000000,0.0,0.000000,0.006127
Zimbabwe,2023,all_disasters,2,0,0,0,0,0,0,0,1,16665407,,,,,0.012001,0.000000,0.000000,0.0,0.000000,0.006000


In [4]:
data.reset_index(inplace=True)
data

Unnamed: 0,country,year,type,total_dead,injured,affected,homeless,total_affected,reconstruction_costs,insured_damages,...,gdp,reconstruction_costs_per_gdp,insured_damages_per_gdp,total_damages_per_gdp,total_dead_per_100k_people,injured_per_100k_people,affected_per_100k_people,homeless_per_100k_people,total_affected_per_100k_people,n_events_per_100k_people
0,Afghanistan,1954,all_disasters,2000,0,0,0,0,0,0,...,,,,,25.431416,0.000000,0.000000,0.0,0.000000,0.012716
1,Afghanistan,1954,earthquake,2000,0,0,0,0,0,0,...,,,,,25.431416,0.000000,0.000000,0.0,0.000000,0.012716
2,Afghanistan,1956,all_disasters,151,2000,0,0,2000,0,0,...,,,,,1.867026,24.728817,0.000000,0.0,24.728817,0.024729
3,Afghanistan,1956,earthquake,100,2000,0,0,2000,0,0,...,,,,,1.236441,24.728817,0.000000,0.0,24.728817,0.012364
4,Afghanistan,1956,flood,51,0,0,0,0,0,0,...,,,,,0.630585,0.000000,0.000000,0.0,0.000000,0.012364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21700,Zimbabwe,2022,all_disasters,0,0,3000,0,3000,0,0,...,,,,,0.000000,0.000000,18.381746,0.0,18.381746,0.012254
21701,Zimbabwe,2022,extreme_weather,0,0,3000,0,3000,0,0,...,,,,,0.000000,0.000000,18.381746,0.0,18.381746,0.006127
21702,Zimbabwe,2022,flood,0,0,0,0,0,0,0,...,,,,,0.000000,0.000000,0.000000,0.0,0.000000,0.006127
21703,Zimbabwe,2023,all_disasters,2,0,0,0,0,0,0,...,,,,,0.012001,0.000000,0.000000,0.0,0.000000,0.006000


## Data Preparation and Cleaning

In [5]:
# remove all rows whose value under "year" column is less than 2000
data = data[data['year'] >= 2000]
data
# extract rows whose type is "earthquake" and store it in a new dataframe
earthquake_data = data[data['type'] == 'earthquake']
earthquake_data


Unnamed: 0,country,year,type,total_dead,injured,affected,homeless,total_affected,reconstruction_costs,insured_damages,...,gdp,reconstruction_costs_per_gdp,insured_damages_per_gdp,total_damages_per_gdp,total_dead_per_100k_people,injured_per_100k_people,affected_per_100k_people,homeless_per_100k_people,total_affected_per_100k_people,n_events_per_100k_people
67,Afghanistan,2001,earthquake,4,20,0,250,270,0,0,...,,,,,0.020316,0.101581,0.000000,1.269768,1.371350,0.010158
70,Afghanistan,2002,earthquake,1200,1391,89500,10000,100891,0,0,...,3.854235e+09,0.000000,0.000000,0.000000,5.714215,6.623728,426.185242,47.618462,480.427429,0.014286
74,Afghanistan,2003,earthquake,1,1,0,1000,1001,0,0,...,4.539501e+09,0.000000,0.000000,0.000000,0.004416,0.004416,0.000000,4.415959,4.420375,0.004416
77,Afghanistan,2004,earthquake,2,40,0,1000,1040,0,0,...,5.220824e+09,0.000000,0.000000,0.000000,0.008491,0.169826,0.000000,4.245644,4.415470,0.004246
80,Afghanistan,2005,earthquake,6,1,500,0,501,0,0,...,6.226199e+09,0.000000,0.000000,0.000803,0.024579,0.004096,2.048240,0.000000,2.052337,0.008193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21472,World,2019,earthquake,261,4901,1628825,8318,1642044,0,50000000,...,8.764526e+13,0.000000,0.000057,0.002749,0.003361,0.063117,20.976629,0.107122,21.146868,0.000399
21481,World,2020,earthquake,196,2964,353878,28590,385432,9450000000,400000000,...,8.510561e+13,0.011104,0.000470,0.017458,0.002500,0.037802,4.513202,0.364624,4.915627,0.000204
21490,World,2021,earthquake,2742,15295,1304450,9763,1329508,1978063000,2300000000,...,9.651307e+13,0.002050,0.002383,0.011714,0.034668,0.193380,16.492620,0.123437,16.809437,0.000354
21500,World,2022,earthquake,1626,12673,3593454,9984,3616111,0,2800000000,...,,,,,0.020388,0.158907,45.058392,0.125190,45.342487,0.000389


In [6]:
# Remove data rows to "World", "European Union (27)" and Continents under "country" column
cleaned_data = earthquake_data[~earthquake_data['country'].isin(['World', 'European Union (27)', 'Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America'])]
cleaned_data


Unnamed: 0,country,year,type,total_dead,injured,affected,homeless,total_affected,reconstruction_costs,insured_damages,...,gdp,reconstruction_costs_per_gdp,insured_damages_per_gdp,total_damages_per_gdp,total_dead_per_100k_people,injured_per_100k_people,affected_per_100k_people,homeless_per_100k_people,total_affected_per_100k_people,n_events_per_100k_people
67,Afghanistan,2001,earthquake,4,20,0,250,270,0,0,...,,,,,0.020316,0.101581,0.000000,1.269768,1.371350,0.010158
70,Afghanistan,2002,earthquake,1200,1391,89500,10000,100891,0,0,...,3.854235e+09,0.0,0.0,0.000000,5.714215,6.623728,426.185242,47.618462,480.427429,0.014286
74,Afghanistan,2003,earthquake,1,1,0,1000,1001,0,0,...,4.539501e+09,0.0,0.0,0.000000,0.004416,0.004416,0.000000,4.415959,4.420375,0.004416
77,Afghanistan,2004,earthquake,2,40,0,1000,1040,0,0,...,5.220824e+09,0.0,0.0,0.000000,0.008491,0.169826,0.000000,4.245644,4.415470,0.004246
80,Afghanistan,2005,earthquake,6,1,500,0,501,0,0,...,6.226199e+09,0.0,0.0,0.000803,0.024579,0.004096,2.048240,0.000000,2.052337,0.008193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20207,Upper-middle-income countries,2021,earthquake,12,866,130674,2437,133977,0,0,...,2.710409e+13,0.0,0.0,0.004287,0.000471,0.033978,5.127048,0.095617,5.256642,0.000392
20215,Upper-middle-income countries,2022,earthquake,134,599,274188,2424,277211,0,0,...,,,,,0.005245,0.023444,10.731194,0.094871,10.849508,0.000509
20223,Upper-middle-income countries,2023,earthquake,50118,108134,9235462,468,9344064,100000000000,0,...,,,,,1.956782,4.221930,360.584747,0.018272,364.824951,0.000273
20294,Uzbekistan,2011,earthquake,13,0,0,0,0,0,0,...,6.017891e+10,0.0,0.0,0.000000,0.044739,0.000000,0.000000,0.000000,0.000000,0.003441


In [7]:
# Additionally  remove the names from the following list: ['Upper-middle-income countries', 'Lower-middle-income countries', 'High-income countries', 'Low-income countries'], which are present in the "country" column
cleaned_data = cleaned_data[~cleaned_data['country'].isin(['Upper-middle-income countries', 'Lower-middle-income countries', 'High-income countries', 'Low-income countries'])]
cleaned_data

Unnamed: 0,country,year,type,total_dead,injured,affected,homeless,total_affected,reconstruction_costs,insured_damages,...,gdp,reconstruction_costs_per_gdp,insured_damages_per_gdp,total_damages_per_gdp,total_dead_per_100k_people,injured_per_100k_people,affected_per_100k_people,homeless_per_100k_people,total_affected_per_100k_people,n_events_per_100k_people
67,Afghanistan,2001,earthquake,4,20,0,250,270,0,0,...,,,,,0.020316,0.101581,0.000000,1.269768,1.371350,0.010158
70,Afghanistan,2002,earthquake,1200,1391,89500,10000,100891,0,0,...,3.854235e+09,0.0,0.000000,0.000000,5.714215,6.623728,426.185242,47.618462,480.427429,0.014286
74,Afghanistan,2003,earthquake,1,1,0,1000,1001,0,0,...,4.539501e+09,0.0,0.000000,0.000000,0.004416,0.004416,0.000000,4.415959,4.420375,0.004416
77,Afghanistan,2004,earthquake,2,40,0,1000,1040,0,0,...,5.220824e+09,0.0,0.000000,0.000000,0.008491,0.169826,0.000000,4.245644,4.415470,0.004246
80,Afghanistan,2005,earthquake,6,1,500,0,501,0,0,...,6.226199e+09,0.0,0.000000,0.000803,0.024579,0.004096,2.048240,0.000000,2.052337,0.008193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19529,United States,2016,earthquake,0,0,120,0,120,0,0,...,1.869511e+13,0.0,0.000000,0.000107,0.000000,0.000000,0.036674,0.000000,0.036674,0.000306
19546,United States,2019,earthquake,0,0,150,0,150,0,50000000,...,2.138098e+13,0.0,0.000234,0.000935,0.000000,0.000000,0.044867,0.000000,0.044867,0.000299
19563,United States,2022,earthquake,2,12,0,0,12,0,0,...,,,,,0.000591,0.003547,0.000000,0.000000,0.003547,0.000296
20294,Uzbekistan,2011,earthquake,13,0,0,0,0,0,0,...,6.017891e+10,0.0,0.000000,0.000000,0.044739,0.000000,0.000000,0.000000,0.000000,0.003441


In [8]:
# convert "total_dead" column to numeric
cleaned_data['total_dead'] = pd.to_numeric(cleaned_data['total_dead'])
# now sort the dataframe in descending order of "total_dead" column
cleaned_data = cleaned_data.sort_values(by='total_dead', ascending=False)
cleaned_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Unnamed: 0,country,year,type,total_dead,injured,affected,homeless,total_affected,reconstruction_costs,insured_damages,...,gdp,reconstruction_costs_per_gdp,insured_damages_per_gdp,total_damages_per_gdp,total_dead_per_100k_people,injured_per_100k_people,affected_per_100k_people,homeless_per_100k_people,total_affected_per_100k_people,n_events_per_100k_people
7518,Haiti,2010,earthquake,222570,300000,3400000,0,3700000,11500000000,200000000,...,1.185931e+10,96.970192,1.686438,67.457527,2261.227783,3047.887451,34542.726562,0.000000,37590.613281,0.010160
9117,Indonesia,2004,earthquake,165816,1253,139580,532898,673731,0,225041000,...,2.568369e+11,0.000000,0.087620,1.759716,73.389854,0.554575,61.777847,235.859665,298.192078,0.002656
4046,China,2008,earthquake,87564,368412,46587000,414385,47369797,10000000000,300000000,...,4.594307e+12,0.217661,0.006530,1.860825,6.582932,27.696669,3502.341797,31.152851,3561.191162,0.000526
14727,Pakistan,2005,earthquake,73338,128309,0,5000000,5128309,5200000000,0,...,1.200553e+11,4.331337,0.000000,4.331337,42.058334,73.583450,0.000000,2867.431152,2941.014648,0.000573
18845,Turkey,2023,earthquake,50103,107608,9100000,90,9207698,100000000000,0,...,,,,,58.384087,125.393585,10604.059570,0.104875,10729.557617,0.003496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5274,Egypt,2002,earthquake,0,0,250,0,250,0,0,...,8.514607e+10,0.000000,0.000000,0.000000,0.000000,0.000000,0.336050,0.000000,0.336050,0.001344
5346,El Salvador,2006,earthquake,0,0,16470,0,16470,0,0,...,1.599989e+10,0.000000,0.000000,0.000000,0.000000,0.000000,272.933411,0.000000,272.933411,0.016572
5371,El Salvador,2018,earthquake,0,0,2499,0,2499,0,0,...,2.602085e+10,0.000000,0.000000,0.000000,0.000000,0.000000,39.816200,0.000000,39.816200,0.015933
6695,France,2019,earthquake,0,4,750,0,754,0,0,...,2.728870e+12,0.000000,0.000000,0.000000,0.000000,0.006211,1.164601,0.000000,1.170812,0.001553


In [9]:
# pick 5 rows from cleaned_data
cleaned_data = cleaned_data.head(5)
cleaned_data

Unnamed: 0,country,year,type,total_dead,injured,affected,homeless,total_affected,reconstruction_costs,insured_damages,...,gdp,reconstruction_costs_per_gdp,insured_damages_per_gdp,total_damages_per_gdp,total_dead_per_100k_people,injured_per_100k_people,affected_per_100k_people,homeless_per_100k_people,total_affected_per_100k_people,n_events_per_100k_people
7518,Haiti,2010,earthquake,222570,300000,3400000,0,3700000,11500000000,200000000,...,11859310000.0,96.970192,1.686438,67.457527,2261.227783,3047.887451,34542.726562,0.0,37590.613281,0.01016
9117,Indonesia,2004,earthquake,165816,1253,139580,532898,673731,0,225041000,...,256836900000.0,0.0,0.08762,1.759716,73.389854,0.554575,61.777847,235.859665,298.192078,0.002656
4046,China,2008,earthquake,87564,368412,46587000,414385,47369797,10000000000,300000000,...,4594307000000.0,0.217661,0.00653,1.860825,6.582932,27.696669,3502.341797,31.152851,3561.191162,0.000526
14727,Pakistan,2005,earthquake,73338,128309,0,5000000,5128309,5200000000,0,...,120055300000.0,4.331337,0.0,4.331337,42.058334,73.58345,0.0,2867.431152,2941.014648,0.000573
18845,Turkey,2023,earthquake,50103,107608,9100000,90,9207698,100000000000,0,...,,,,,58.384087,125.393585,10604.05957,0.104875,10729.557617,0.003496


In [10]:
# before concatenating, make sure both have same column names
df = df.drop(columns=['Ranking', 'Earthquake magnitude', 'Additional information'])
df = df.rename(columns={'Year': 'year', 'Estimated death toll': 'total_dead', 'Location':'country'})
df

Unnamed: 0,country,year,total_dead
0,"Shaanxi, China",1556,830000
1,"Port-au-Prince, Haiti",2010,316000
2,"Antakya, Turkey",115,260000
3,"Antakya, Turkey",525,250000
4,"Tangshan, China",1976,242769
5,"Gyzndzha, Azerbaijan",1139,230000
6,"Sumatra, Indonesia",2004,227899
7,"Damghan, Iran",856,200000
8,"Gansu, China",1920,200000
9,"Dvin, Armenia",893,150000


In [11]:
# access country column and remove part of the string before comma to store only country name
df['country'] = df['country'].str.split(',').str[1] # str[0] removes after comma, use str[1] to remove before comma
# convert "total_dead" column to numeric
df['total_dead'] = pd.to_numeric(df['total_dead'])
df

Unnamed: 0,country,year,total_dead
0,China,1556,830000
1,Haiti,2010,316000
2,Turkey,115,260000
3,Turkey,525,250000
4,China,1976,242769
5,Azerbaijan,1139,230000
6,Indonesia,2004,227899
7,Iran,856,200000
8,China,1920,200000
9,Armenia,893,150000


In [12]:

column_list = cleaned_data.columns.tolist()
column_list.remove('year')
column_list.remove('total_dead')
column_list.remove('country')
cleaned_data = cleaned_data.drop(columns=column_list)
cleaned_data



Unnamed: 0,country,year,total_dead
7518,Haiti,2010,222570
9117,Indonesia,2004,165816
4046,China,2008,87564
14727,Pakistan,2005,73338
18845,Turkey,2023,50103


In [13]:
# remove trailing spaces from country column of both dataframes
df['country'] = df['country'].str.strip()
cleaned_data['country'] = cleaned_data['country'].str.strip()

In [24]:
# Now concatenate both dataframes, ignore index and
final_data = pd.concat([df, cleaned_data], ignore_index=True)
# add a label column, first 10 rows have label as 0 and last 5 rows have label as 1
final_data['label'] = [0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1]
# make label column to string type
final_data['label'] = final_data['label'].astype(str)
final_data


Unnamed: 0,country,year,total_dead,label
0,China,1556.0,830000,0
1,Haiti,2010.0,316000,0
2,Turkey,115.0,260000,0
3,Turkey,525.0,250000,0
4,China,1976.0,242769,0
5,Azerbaijan,1139.0,230000,0
6,Indonesia,2004.0,227899,0
7,Iran,856.0,200000,0
8,China,1920.0,200000,0
9,Armenia,893.0,150000,0


## Data Visualization:

In [30]:
# import plotly.express as px

fig = px.bar(final_data, 
             x="total_dead", 
             y=final_data["country"] + ' ' + final_data["year"].astype(str), 
             labels={"country": "Country", "total_dead": "Deaths"}, 
             title="Killer Quakes", 
             color="label", 
             color_discrete_map={'0': 'blue', '1': 'red'},
             category_orders={"label": ["0", "1"]},
             )

# Update legend labels
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')), selector=dict(mode='markers+text'))
fig.update_layout(legend_title_text='', legend=dict(title=''), showlegend=True)
fig.for_each_trace(lambda t: t.update(name="10 Deadliest Quakes in History" if t.name == "0" else "Deadly Since 2000"))

fig.show()
