In [1]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
from bs4 import BeautifulSoup as BS
import re

In [2]:
# Show every column and every row when a DataFrame is displayed
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## Using API to grab data from USGS

Data Dictionary: https://earthquake.usgs.gov/data/comcat/data-eventterms.php

In [3]:
# Send GET request to the USGS event query endpoint.
# The query is a latitude/longitude bounding box around Tennessee (events since
# 1900), so it also returns events located in neighbouring states.
url = 'https://earthquake.usgs.gov/fdsnws/event/1/query'
params = {'format': 'csv',
          'starttime': '1900-01-01',
          'minlatitude': '34.730',    # Huntsville, AL
          'maxlatitude': '37.002',    # Bowling Green, KY
          'minlongitude': '-90.710',  # Jonesboro, AR
          'maxlongitude': '-80.843',  # Charlotte, NC
          # A single value: a list here would be encoded as two separate
          # `limit` query parameters.
          'limit': 20000}
r = requests.get(url=url, params=params, timeout=60)
# Stop here on an HTTP error instead of parsing an error body as CSV
r.raise_for_status()

# Convert to Pandas DF
earthquake_tn = pd.read_csv(StringIO(r.text))
earthquake_tn.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,id,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,2020-10-23T04:09:18.260Z,36.332833,-89.484667,4.93,1.65,md,24.0,47.0,0.01857,0.03,nm,nm60312902,2020-10-23T16:52:09.410Z,"5 km SSW of Tiptonville, Tennessee",earthquake,0.33,0.27,0.068,14.0,reviewed,nm,nm
1,2020-10-21T21:49:45.720Z,36.0705,-89.802333,6.84,2.14,md,46.0,40.0,0.0643,0.1,nm,nm60312762,2020-10-22T12:49:53.160Z,"2 km NNE of Cooter, Missouri",earthquake,0.17,0.49,0.092,31.0,reviewed,nm,nm
2,2020-10-21T20:28:42.930Z,36.525,-89.569167,6.85,1.69,md,19.0,50.0,0.02596,0.03,nm,nm60312752,2020-10-22T14:45:56.660Z,"3 km E of Marston, Missouri",earthquake,0.34,0.63,0.167,12.0,reviewed,nm,nm
3,2020-10-21T15:38:34.380Z,36.867167,-83.242833,9.37,2.25,md,16.0,98.0,0.375,0.32,se,se60312717,2020-10-21T18:59:51.529Z,"4 km W of Evarts, Kentucky",earthquake,0.59,1.91,0.049,11.0,reviewed,se,se
4,2020-10-21T09:44:44.720Z,36.4865,-81.110667,0.74,-0.67,md,4.0,183.0,0.01516,0.01,se,se60312707,2020-10-21T12:31:57.890Z,"2 km SSE of Sparta, North Carolina",earthquake,0.49,1.78,0.007,4.0,reviewed,se,se


In [4]:
# Number of rows and columns in the downloaded data
earthquake_tn.shape

(10064, 22)

In [5]:
# Column dtypes and non-null counts, to see which fields have missing values
earthquake_tn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10064 entries, 0 to 10063
Data columns (total 22 columns):
time               10064 non-null object
latitude           10064 non-null float64
longitude          10064 non-null float64
depth              10046 non-null float64
mag                10060 non-null float64
magType            10050 non-null object
nst                6791 non-null float64
gap                6787 non-null float64
dmin               6566 non-null float64
rms                6809 non-null float64
net                10064 non-null object
id                 10064 non-null object
updated            10064 non-null object
place              10064 non-null object
type               10064 non-null object
horizontalError    6850 non-null float64
depthError         6849 non-null float64
magError           6758 non-null float64
magNst             6789 non-null float64
status             10064 non-null object
locationSource     10064 non-null object
magSource          10064 n

In [6]:
# Export: keep a copy of the raw download on disk, in case a column that
# gets dropped later turns out to be needed
raw_csv_path = '../data/earthquake_tn_raw.csv'
earthquake_tn.to_csv(raw_csv_path, index=False)

## Rename/Drop columns based on Data Dictionary

In [7]:
# time: parse the ISO-8601 strings (UTC, trailing "Z") into datetimes, then
# rename the column to "date".
# pd.to_datetime is used instead of astype("datetime64") because newer pandas
# releases reject the unit-less "datetime64" dtype string. The timezone is
# removed after parsing so the column stays timezone-naive, with values in UTC.
earthquake_tn["time"] = pd.to_datetime(earthquake_tn["time"], utc=True).dt.tz_localize(None)
earthquake_tn = earthquake_tn.rename(columns = {"time": "date"})
earthquake_tn.head()

Unnamed: 0,date,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,id,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,2020-10-23 04:09:18.260,36.332833,-89.484667,4.93,1.65,md,24.0,47.0,0.01857,0.03,nm,nm60312902,2020-10-23T16:52:09.410Z,"5 km SSW of Tiptonville, Tennessee",earthquake,0.33,0.27,0.068,14.0,reviewed,nm,nm
1,2020-10-21 21:49:45.720,36.0705,-89.802333,6.84,2.14,md,46.0,40.0,0.0643,0.1,nm,nm60312762,2020-10-22T12:49:53.160Z,"2 km NNE of Cooter, Missouri",earthquake,0.17,0.49,0.092,31.0,reviewed,nm,nm
2,2020-10-21 20:28:42.930,36.525,-89.569167,6.85,1.69,md,19.0,50.0,0.02596,0.03,nm,nm60312752,2020-10-22T14:45:56.660Z,"3 km E of Marston, Missouri",earthquake,0.34,0.63,0.167,12.0,reviewed,nm,nm
3,2020-10-21 15:38:34.380,36.867167,-83.242833,9.37,2.25,md,16.0,98.0,0.375,0.32,se,se60312717,2020-10-21T18:59:51.529Z,"4 km W of Evarts, Kentucky",earthquake,0.59,1.91,0.049,11.0,reviewed,se,se
4,2020-10-21 09:44:44.720,36.4865,-81.110667,0.74,-0.67,md,4.0,183.0,0.01516,0.01,se,se60312707,2020-10-21T12:31:57.890Z,"2 km SSE of Sparta, North Carolina",earthquake,0.49,1.78,0.007,4.0,reviewed,se,se


In [8]:
# Remove the columns that are not needed for this analysis
columns_to_drop = ["magType", "nst", "gap", "dmin", "rms", "id",
                   "magNst", "locationSource", "magSource"]
earthquake_tn = earthquake_tn.drop(columns_to_drop, axis="columns")
earthquake_tn.head()

Unnamed: 0,date,latitude,longitude,depth,mag,net,updated,place,type,horizontalError,depthError,magError,status
0,2020-10-23 04:09:18.260,36.332833,-89.484667,4.93,1.65,nm,2020-10-23T16:52:09.410Z,"5 km SSW of Tiptonville, Tennessee",earthquake,0.33,0.27,0.068,reviewed
1,2020-10-21 21:49:45.720,36.0705,-89.802333,6.84,2.14,nm,2020-10-22T12:49:53.160Z,"2 km NNE of Cooter, Missouri",earthquake,0.17,0.49,0.092,reviewed
2,2020-10-21 20:28:42.930,36.525,-89.569167,6.85,1.69,nm,2020-10-22T14:45:56.660Z,"3 km E of Marston, Missouri",earthquake,0.34,0.63,0.167,reviewed
3,2020-10-21 15:38:34.380,36.867167,-83.242833,9.37,2.25,se,2020-10-21T18:59:51.529Z,"4 km W of Evarts, Kentucky",earthquake,0.59,1.91,0.049,reviewed
4,2020-10-21 09:44:44.720,36.4865,-81.110667,0.74,-0.67,se,2020-10-21T12:31:57.890Z,"2 km SSE of Sparta, North Carolina",earthquake,0.49,1.78,0.007,reviewed


In [9]:
# Check the shape again after dropping columns
earthquake_tn.shape

(10064, 13)

In [10]:
# Rename columns: map each remaining USGS field name to its descriptive name
# (columns not listed here keep their current label)
descriptive_names = {
    "mag": "magnitude",
    "net": "news_source",
    "updated": "latest_updated",
    "type": "seismic_type",
    "horizontalError": "location_error",
    "depthError": "depth_error",
    "magError": "magnitude_error",
    "status": "review_status",
}
earthquake_tn = earthquake_tn.rename(columns=descriptive_names)
earthquake_tn.head()

Unnamed: 0,date,latitude,longitude,depth,magnitude,news_source,latest_updated,place,seismic_type,location_error,depth_error,magnitude_error,review_status
0,2020-10-23 04:09:18.260,36.332833,-89.484667,4.93,1.65,nm,2020-10-23T16:52:09.410Z,"5 km SSW of Tiptonville, Tennessee",earthquake,0.33,0.27,0.068,reviewed
1,2020-10-21 21:49:45.720,36.0705,-89.802333,6.84,2.14,nm,2020-10-22T12:49:53.160Z,"2 km NNE of Cooter, Missouri",earthquake,0.17,0.49,0.092,reviewed
2,2020-10-21 20:28:42.930,36.525,-89.569167,6.85,1.69,nm,2020-10-22T14:45:56.660Z,"3 km E of Marston, Missouri",earthquake,0.34,0.63,0.167,reviewed
3,2020-10-21 15:38:34.380,36.867167,-83.242833,9.37,2.25,se,2020-10-21T18:59:51.529Z,"4 km W of Evarts, Kentucky",earthquake,0.59,1.91,0.049,reviewed
4,2020-10-21 09:44:44.720,36.4865,-81.110667,0.74,-0.67,se,2020-10-21T12:31:57.890Z,"2 km SSE of Sparta, North Carolina",earthquake,0.49,1.78,0.007,reviewed


- time => date --> Break down to year, month, day, time
- latitude
- longitude
- depth
- mag => magnitude
- magType => drop
- nst => drop
- gap => drop
- dmin => drop
- rms => drop
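- net => news_source (per the USGS data dictionary, `net` is the ID of the contributing seismic network, not a news outlet)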
- id => drop
- updated => latest_updated
- place => Cleanup to closest_place and state
- type => seismic_type
- horizontalError => location_error
- depthError => depth_error
- magError => magnitude_error
- magNst => drop
- status => review_status
- locationSource => drop
- magSource => drop

## Format Date

Now, let's extract each part: `year`, `month`, `day`, and `time`.

In [11]:
# Split the parsed `date` into separate year / month / day / time columns
event_datetime = earthquake_tn['date'].dt
for part in ('year', 'month', 'day', 'time'):
    earthquake_tn[part] = getattr(event_datetime, part)

earthquake_tn.head()

Unnamed: 0,date,latitude,longitude,depth,magnitude,news_source,latest_updated,place,seismic_type,location_error,depth_error,magnitude_error,review_status,year,month,day,time
0,2020-10-23 04:09:18.260,36.332833,-89.484667,4.93,1.65,nm,2020-10-23T16:52:09.410Z,"5 km SSW of Tiptonville, Tennessee",earthquake,0.33,0.27,0.068,reviewed,2020,10,23,04:09:18.260000
1,2020-10-21 21:49:45.720,36.0705,-89.802333,6.84,2.14,nm,2020-10-22T12:49:53.160Z,"2 km NNE of Cooter, Missouri",earthquake,0.17,0.49,0.092,reviewed,2020,10,21,21:49:45.720000
2,2020-10-21 20:28:42.930,36.525,-89.569167,6.85,1.69,nm,2020-10-22T14:45:56.660Z,"3 km E of Marston, Missouri",earthquake,0.34,0.63,0.167,reviewed,2020,10,21,20:28:42.930000
3,2020-10-21 15:38:34.380,36.867167,-83.242833,9.37,2.25,se,2020-10-21T18:59:51.529Z,"4 km W of Evarts, Kentucky",earthquake,0.59,1.91,0.049,reviewed,2020,10,21,15:38:34.380000
4,2020-10-21 09:44:44.720,36.4865,-81.110667,0.74,-0.67,se,2020-10-21T12:31:57.890Z,"2 km SSE of Sparta, North Carolina",earthquake,0.49,1.78,0.007,reviewed,2020,10,21,09:44:44.720000


## Cleanup `place`

### 1/ extract state

In [12]:
# Make a function to extract last element from place column
def split_place_to_state(st):
    """Return the state portion of a `place` description.

    The state is the text after the last comma, e.g.
    '2 km SSE of Sparta, North Carolina' -> 'North Carolina'. A value
    without any comma (e.g. 'Arkansas') is returned whole. Surrounding
    whitespace is stripped in both cases.

    Parameters
    ----------
    st : str
        Raw value from the `place` column.

    Returns
    -------
    str
        The last comma-separated segment of `st`, stripped.
    """
    # Take the last segment rather than special-casing "exactly two parts":
    # a description that itself contains a comma still yields the state
    # instead of falling back to the first segment.
    return st.split(',')[-1].strip()

In [13]:
# Apply it to the dataframe: one state value per `place` entry
place_values = earthquake_tn["place"]
earthquake_tn["state"] = place_values.map(split_place_to_state)
earthquake_tn.head()

Unnamed: 0,date,latitude,longitude,depth,magnitude,news_source,latest_updated,place,seismic_type,location_error,depth_error,magnitude_error,review_status,year,month,day,time,state
0,2020-10-23 04:09:18.260,36.332833,-89.484667,4.93,1.65,nm,2020-10-23T16:52:09.410Z,"5 km SSW of Tiptonville, Tennessee",earthquake,0.33,0.27,0.068,reviewed,2020,10,23,04:09:18.260000,Tennessee
1,2020-10-21 21:49:45.720,36.0705,-89.802333,6.84,2.14,nm,2020-10-22T12:49:53.160Z,"2 km NNE of Cooter, Missouri",earthquake,0.17,0.49,0.092,reviewed,2020,10,21,21:49:45.720000,Missouri
2,2020-10-21 20:28:42.930,36.525,-89.569167,6.85,1.69,nm,2020-10-22T14:45:56.660Z,"3 km E of Marston, Missouri",earthquake,0.34,0.63,0.167,reviewed,2020,10,21,20:28:42.930000,Missouri
3,2020-10-21 15:38:34.380,36.867167,-83.242833,9.37,2.25,se,2020-10-21T18:59:51.529Z,"4 km W of Evarts, Kentucky",earthquake,0.59,1.91,0.049,reviewed,2020,10,21,15:38:34.380000,Kentucky
4,2020-10-21 09:44:44.720,36.4865,-81.110667,0.74,-0.67,se,2020-10-21T12:31:57.890Z,"2 km SSE of Sparta, North Carolina",earthquake,0.49,1.78,0.007,reviewed,2020,10,21,09:44:44.720000,North Carolina


In [14]:
# We only want to keep Tennessee.
# This is a substring match, so values such as "eastern Tennessee" or
# "Tennessee-North Carolina border region" are kept as well.
# .copy() makes the result an independent frame, so adding columns to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
earthquake_tn = earthquake_tn[earthquake_tn["state"].str.contains("Tennessee")].copy()
earthquake_tn.head()

Unnamed: 0,date,latitude,longitude,depth,magnitude,news_source,latest_updated,place,seismic_type,location_error,depth_error,magnitude_error,review_status,year,month,day,time,state
0,2020-10-23 04:09:18.260,36.332833,-89.484667,4.93,1.65,nm,2020-10-23T16:52:09.410Z,"5 km SSW of Tiptonville, Tennessee",earthquake,0.33,0.27,0.068,reviewed,2020,10,23,04:09:18.260000,Tennessee
6,2020-10-21 03:00:08.800,36.313667,-89.536167,9.5,1.23,nm,2020-10-21T13:18:24.430Z,"7 km NW of Ridgely, Tennessee",earthquake,0.68,0.51,0.032,reviewed,2020,10,21,03:00:08.800000,Tennessee
8,2020-10-20 18:50:21.800,36.2675,-89.523333,8.84,2.33,nm,2020-10-21T12:29:35.220Z,"3 km W of Ridgely, Tennessee",earthquake,0.15,0.3,0.142,reviewed,2020,10,20,18:50:21.800000,Tennessee
11,2020-10-19 20:19:38.490,36.224,-82.817167,17.27,2.31,se,2020-10-23T06:52:55.336Z,"6 km N of Greeneville, Tennessee",earthquake,0.91,1.69,0.099,reviewed,2020,10,19,20:19:38.490000,Tennessee
12,2020-10-19 05:15:25.090,36.2095,-89.476167,6.33,1.59,nm,2020-10-19T13:03:45.780Z,"6 km S of Ridgely, Tennessee",earthquake,0.26,0.56,0.086,reviewed,2020,10,19,05:15:25.090000,Tennessee


### 2/ extract city:

In [15]:
# Make a function to extract the middle element from place column
def split_place_to_city(st):
    """Return the reference city from a `place` description.

    For a value shaped like '5 km SSW of Tiptonville, Tennessee' this is the
    text between 'of ' and the last comma ('Tiptonville').

    Parameters
    ----------
    st : str
        Raw value from the `place` column.

    Returns
    -------
    str or None
        The captured city text, or None when `st` is not a string or does
        not contain the 'of <city>,' pattern (e.g. 'eastern Tennessee').
    """
    # Non-string input has no city to extract.
    if not isinstance(st, str):
        return None
    # Explicit "no match" check instead of a blanket `except`, so unrelated
    # errors are not silently swallowed.
    match = re.search(r"of\s(.+),", st)
    return match.group(1) if match else None


In [16]:
# Derive the `city` column; values are converted to str first so the helper
# always receives text
place_as_text = earthquake_tn["place"].apply(str)
earthquake_tn["city"] = place_as_text.map(split_place_to_city)
earthquake_tn.head()

Unnamed: 0,date,latitude,longitude,depth,magnitude,news_source,latest_updated,place,seismic_type,location_error,depth_error,magnitude_error,review_status,year,month,day,time,state,city
0,2020-10-23 04:09:18.260,36.332833,-89.484667,4.93,1.65,nm,2020-10-23T16:52:09.410Z,"5 km SSW of Tiptonville, Tennessee",earthquake,0.33,0.27,0.068,reviewed,2020,10,23,04:09:18.260000,Tennessee,Tiptonville
6,2020-10-21 03:00:08.800,36.313667,-89.536167,9.5,1.23,nm,2020-10-21T13:18:24.430Z,"7 km NW of Ridgely, Tennessee",earthquake,0.68,0.51,0.032,reviewed,2020,10,21,03:00:08.800000,Tennessee,Ridgely
8,2020-10-20 18:50:21.800,36.2675,-89.523333,8.84,2.33,nm,2020-10-21T12:29:35.220Z,"3 km W of Ridgely, Tennessee",earthquake,0.15,0.3,0.142,reviewed,2020,10,20,18:50:21.800000,Tennessee,Ridgely
11,2020-10-19 20:19:38.490,36.224,-82.817167,17.27,2.31,se,2020-10-23T06:52:55.336Z,"6 km N of Greeneville, Tennessee",earthquake,0.91,1.69,0.099,reviewed,2020,10,19,20:19:38.490000,Tennessee,Greeneville
12,2020-10-19 05:15:25.090,36.2095,-89.476167,6.33,1.59,nm,2020-10-19T13:03:45.780Z,"6 km S of Ridgely, Tennessee",earthquake,0.26,0.56,0.086,reviewed,2020,10,19,05:15:25.090000,Tennessee,Ridgely


In [17]:
# Places for which no city could be extracted, with how often each occurs
city_is_missing = earthquake_tn["city"].isna()
earthquake_tn.loc[city_is_missing, "place"].value_counts()

eastern Tennessee                         45
Tennessee                                 18
Tennessee-North Carolina border region     2
Tennessee-Virginia border region           1
Name: place, dtype: int64

In [18]:
# Export: write the cleaned Tennessee data to disk
clean_csv_path = '../data/earthquake_tn_01_clean.csv'
earthquake_tn.to_csv(clean_csv_path, index=False)