In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the stage-05 cleaned earthquake table that this notebook reshapes.
world_earthquakes = pd.read_csv(
    filepath_or_buffer='../data/world_earthquakes_05_clean.csv',
)

In [3]:
# Preview the first five rows of the freshly loaded frame.
world_earthquakes.head(n=5)

Unnamed: 0,date,country,latitude,longitude,depth,magnitude,secondary_effects,pde_shaking_deaths,pde_total_deaths,utsu_total_deaths,em_dat_total_deaths,others_source_deaths,year,month,day,time
0,1900-05-11 17:23:00,Japan,38.7,141.1,5.0,7.0,,0,0,0,0,0,1900,5,11,17:23:00
1,1900-07-12 06:25:00,Turkey,40.3,43.1,,5.9,,0,0,140,0,0,1900,7,12,06:25:00
2,1900-10-29 09:11:00,Venezuela,11.0,-66.0,0.0,7.7,,0,0,0,0,0,1900,10,29,09:11:00
3,1901-02-15 00:00:00,China,26.0,100.1,0.0,6.5,,0,0,0,0,0,1901,2,15,00:00:00
4,1901-03-31 07:11:00,Bulgaria,43.4,28.7,,6.4,,0,0,4,0,0,1901,3,31,07:11:00


## 1/ Checking that the latest cleaning steps were applied (country, magnitude and secondary effects):

In [4]:
# Spot-check a single row (position 1033) for the cleaned country, magnitude
# and secondary_effects values. The list inside iloc keeps the result a
# one-row DataFrame instead of a Series.
world_earthquakes.iloc[[1033]]

Unnamed: 0,date,country,latitude,longitude,depth,magnitude,secondary_effects,pde_shaking_deaths,pde_total_deaths,utsu_total_deaths,em_dat_total_deaths,others_source_deaths,year,month,day,time
1033,1997-07-09 19:24:00,Venezuela,10.45,-63.532,10.0,6.9,landslide,81,81,81,80,0,1997,7,9,19:24:00


In [5]:
# Column dtypes and non-null counts, to see which columns still contain
# missing values.
world_earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 16 columns):
date                    1340 non-null object
country                 1340 non-null object
latitude                1325 non-null float64
longitude               1325 non-null float64
depth                   1250 non-null float64
magnitude               1339 non-null float64
secondary_effects       373 non-null object
pde_shaking_deaths      1340 non-null int64
pde_total_deaths        1340 non-null int64
utsu_total_deaths       1340 non-null int64
em_dat_total_deaths     1340 non-null int64
others_source_deaths    1340 non-null int64
year                    1340 non-null int64
month                   1340 non-null int64
day                     1340 non-null int64
time                    1340 non-null object
dtypes: float64(4), int64(8), object(4)
memory usage: 167.6+ KB


In [6]:
# Distinct secondary-effect labels (NaN included) present in the data.
# NOTE(review): the labels mix ', ' and ',' as separators - worth normalising
# if this column is ever split on a delimiter.
world_earthquakes['secondary_effects'].unique()

array([nan, 'tsunami', 'fire', 'landslide', 'tsunami, fire',
       'tsunami, landslide', 'landslide, liquification',
       'tsunami,liquification', 'tsunami, landslide, liquification',
       'liquification', 'tsunami,landslide,fire',
       'landslide,fire,liquification', 'landslide,fire'], dtype=object)

## 2/ Creating a new column holding the highest death count across all the death columns:

In [7]:
# Collapse the five death-count columns into a single figure per earthquake:
# the row-wise maximum, i.e. the highest count found in any of them.
world_earthquakes['deaths'] = world_earthquakes[
    [
        "pde_shaking_deaths",
        "pde_total_deaths",
        "utsu_total_deaths",
        "em_dat_total_deaths",
        "others_source_deaths",
    ]
].max(axis="columns")

In [8]:
# Check that the new `deaths` column appears at the end of the frame.
world_earthquakes.head(n=5)

Unnamed: 0,date,country,latitude,longitude,depth,magnitude,secondary_effects,pde_shaking_deaths,pde_total_deaths,utsu_total_deaths,em_dat_total_deaths,others_source_deaths,year,month,day,time,deaths
0,1900-05-11 17:23:00,Japan,38.7,141.1,5.0,7.0,,0,0,0,0,0,1900,5,11,17:23:00,0
1,1900-07-12 06:25:00,Turkey,40.3,43.1,,5.9,,0,0,140,0,0,1900,7,12,06:25:00,140
2,1900-10-29 09:11:00,Venezuela,11.0,-66.0,0.0,7.7,,0,0,0,0,0,1900,10,29,09:11:00,0
3,1901-02-15 00:00:00,China,26.0,100.1,0.0,6.5,,0,0,0,0,0,1901,2,15,00:00:00,0
4,1901-03-31 07:11:00,Bulgaria,43.4,28.7,,6.4,,0,0,4,0,0,1901,3,31,07:11:00,4


## 3/ Remove the 5 death columns (since we now have a single `deaths` column holding their maximum, we don't need them anymore):

In [9]:
# Drop the five source death-count columns now that `deaths` summarises them.
# `columns=` already targets the column axis, so the extra `axis=1` that used
# to be passed alongside it was redundant and has been removed.
# NOTE: re-running this cell on the already-trimmed frame raises KeyError,
# because the columns no longer exist; reload the data first.
world_earthquakes = world_earthquakes.drop(
    columns=[
        "pde_shaking_deaths",
        "pde_total_deaths",
        "utsu_total_deaths",
        "em_dat_total_deaths",
        "others_source_deaths",
    ]
)
world_earthquakes.head()

Unnamed: 0,date,country,latitude,longitude,depth,magnitude,secondary_effects,year,month,day,time,deaths
0,1900-05-11 17:23:00,Japan,38.7,141.1,5.0,7.0,,1900,5,11,17:23:00,0
1,1900-07-12 06:25:00,Turkey,40.3,43.1,,5.9,,1900,7,12,06:25:00,140
2,1900-10-29 09:11:00,Venezuela,11.0,-66.0,0.0,7.7,,1900,10,29,09:11:00,0
3,1901-02-15 00:00:00,China,26.0,100.1,0.0,6.5,,1901,2,15,00:00:00,0
4,1901-03-31 07:11:00,Bulgaria,43.4,28.7,,6.4,,1901,3,31,07:11:00,4


## 4/ Put year, month, day and time columns next to date column:

In [10]:
# Reorder the columns so the date parts (year, month, day, time) sit right
# after `date`, followed by location, physical measurements and outcome.
world_earthquakes = world_earthquakes[
    [
        "date", "year", "month", "day", "time",
        "country", "latitude", "longitude",
        "depth", "magnitude", "secondary_effects", "deaths",
    ]
]
world_earthquakes.head(n=5)

Unnamed: 0,date,year,month,day,time,country,latitude,longitude,depth,magnitude,secondary_effects,deaths
0,1900-05-11 17:23:00,1900,5,11,17:23:00,Japan,38.7,141.1,5.0,7.0,,0
1,1900-07-12 06:25:00,1900,7,12,06:25:00,Turkey,40.3,43.1,,5.9,,140
2,1900-10-29 09:11:00,1900,10,29,09:11:00,Venezuela,11.0,-66.0,0.0,7.7,,0
3,1901-02-15 00:00:00,1901,2,15,00:00:00,China,26.0,100.1,0.0,6.5,,0
4,1901-03-31 07:11:00,1901,3,31,07:11:00,Bulgaria,43.4,28.7,,6.4,,4


## 5/ Exporting:

In [11]:
# Write the reshaped frame to the stage-06 CSV, leaving out the row index.
world_earthquakes.to_csv(
    path_or_buf='../data/world_earthquakes_06_clean.csv',
    index=False,
)