In [1]:
import numpy as np

# Read the semicolon-delimited sample file into a NumPy structured array.
# names=True takes the field names from the header row and dtype=None asks
# NumPy to infer a type for each column individually.
data = np.genfromtxt(
    'data/example_data.csv',
    delimiter=';',
    names=True,
    dtype=None,
    encoding='UTF',
)
data

array([('2018-10-13 11:10:23.560', '262km NW of Ozernovskiy, Russia', 'mww', 6.7, 'green', 1),
       ('2018-10-13 04:34:15.580', '25km E of Bitung, Indonesia', 'mww', 5.2, 'green', 0),
       ('2018-10-13 00:13:46.220', '42km WNW of Sola, Vanuatu', 'mww', 5.7, 'green', 0),
       ('2018-10-12 21:09:49.240', '13km E of Nueva Concepcion, Guatemala', 'mww', 5.7, 'green', 0),
       ('2018-10-12 02:52:03.620', '128km SE of Kimbe, Papua New Guinea', 'mww', 5.6, 'green', 1)],
      dtype=[('time', '<U23'), ('place', '<U37'), ('magType', '<U3'), ('mag', '<f8'), ('alert', '<U5'), ('tsunami', '<i4')])

In [2]:
data.shape

(5,)

In [3]:
data.dtype

dtype([('time', '<U23'), ('place', '<U37'), ('magType', '<U3'), ('mag', '<f8'), ('alert', '<U5'), ('tsunami', '<i4')])

In [4]:
%%timeit
# Baseline timing: find the largest magnitude with a pure-Python pass over the
# rows of the structured array (field index 3 is 'mag' in data.dtype).
max([row[3] for row in data])

4.12 µs ± 1.06 µs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [5]:
# Pivot the row-oriented structured array into one NumPy array per field,
# keyed by the field name.
array_dict = {}
for position, field in enumerate(data.dtype.names):
    array_dict[field] = np.array([record[position] for record in data])
array_dict

{'time': array(['2018-10-13 11:10:23.560', '2018-10-13 04:34:15.580',
        '2018-10-13 00:13:46.220', '2018-10-12 21:09:49.240',
        '2018-10-12 02:52:03.620'], dtype='<U23'),
 'place': array(['262km NW of Ozernovskiy, Russia', '25km E of Bitung, Indonesia',
        '42km WNW of Sola, Vanuatu',
        '13km E of Nueva Concepcion, Guatemala',
        '128km SE of Kimbe, Papua New Guinea'], dtype='<U37'),
 'magType': array(['mww', 'mww', 'mww', 'mww', 'mww'], dtype='<U3'),
 'mag': array([6.7, 5.2, 5.7, 5.7, 5.6]),
 'alert': array(['green', 'green', 'green', 'green', 'green'], dtype='<U5'),
 'tsunami': array([1, 0, 0, 0, 1])}

In [6]:
%%timeit
# Same maximum as the row-wise version, but computed with the ndarray's own
# max() method on the per-column 'mag' array.
array_dict['mag'].max()

2.33 µs ± 439 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [7]:
# Position of the largest magnitude. Computed once up front instead of once
# per column, since it does not change while we walk the columns.
strongest = array_dict['mag'].argmax()

# Pull that position out of every column to rebuild the full record. Only the
# column arrays are needed, so iterate over the values rather than the items.
# Note: mixing strings and numbers in one array coerces everything to strings.
np.array([column[strongest] for column in array_dict.values()])

array(['2018-10-13 11:10:23.560', '262km NW of Ozernovskiy, Russia',
       'mww', '6.7', 'green', '1'], dtype='<U31')

In [None]:
# The previous cells show that these operations are possible with NumPy alone, but the amount of manual work involved suggests there may be a better way.

In [8]:
import pandas as pd

place = pd.Series(array_dict['place'], name='place')
place

0          262km NW of Ozernovskiy, Russia
1              25km E of Bitung, Indonesia
2                42km WNW of Sola, Vanuatu
3    13km E of Nueva Concepcion, Guatemala
4      128km SE of Kimbe, Papua New Guinea
Name: place, dtype: object

In [9]:
place.name

'place'

In [10]:
place.dtype

dtype('O')

In [11]:
place.shape

(5,)

In [12]:
place.values

array(['262km NW of Ozernovskiy, Russia', '25km E of Bitung, Indonesia',
       '42km WNW of Sola, Vanuatu',
       '13km E of Nueva Concepcion, Guatemala',
       '128km SE of Kimbe, Papua New Guinea'], dtype=object)

In [13]:
place_index = place.index
place_index

RangeIndex(start=0, stop=5, step=1)

In [14]:
place_index.values

array([0, 1, 2, 3, 4], dtype=int64)

In [15]:
place_index.dtype

dtype('int64')

In [16]:
place_index.shape

(5,)

In [17]:
place_index.is_unique

True

In [18]:
np.array([1, 1, 1]) + np.array([-1, 0, 1])

array([0, 1, 2])

In [19]:
# Five evenly spaced values from 0 to 10: [0, 2.5, 5, 7.5, 10]
numbers = np.linspace(start=0, stop=10, num=5)

# x keeps the default index 0..4, while y is explicitly labelled 1..5
x = pd.Series(data=numbers)
y = pd.Series(data=numbers, index=[1, 2, 3, 4, 5])

# Addition aligns on index labels, so a label present in only one of the two
# Series (0 and 5 here) yields NaN in the result.
x.add(y)

0     NaN
1     2.5
2     7.5
3    12.5
4    17.5
5     NaN
dtype: float64

In [None]:
# The next cells introduce the concept of a Series in the pandas library and illustrate how many operations can be performed with shorter commands.

In [20]:
# Build a DataFrame from the dict of per-column arrays; each dict key becomes
# a column name.
df = pd.DataFrame(array_dict) 

# this will also work with the first representation
# (the NumPy structured array loaded with np.genfromtxt)
# df = pd.DataFrame(data)

df

Unnamed: 0,time,place,magType,mag,alert,tsunami
0,2018-10-13 11:10:23.560,"262km NW of Ozernovskiy, Russia",mww,6.7,green,1
1,2018-10-13 04:34:15.580,"25km E of Bitung, Indonesia",mww,5.2,green,0
2,2018-10-13 00:13:46.220,"42km WNW of Sola, Vanuatu",mww,5.7,green,0
3,2018-10-12 21:09:49.240,"13km E of Nueva Concepcion, Guatemala",mww,5.7,green,0
4,2018-10-12 02:52:03.620,"128km SE of Kimbe, Papua New Guinea",mww,5.6,green,1


In [21]:
df.dtypes

time        object
place       object
magType     object
mag        float64
alert       object
tsunami      int32
dtype: object

In [22]:
df.values

array([['2018-10-13 11:10:23.560', '262km NW of Ozernovskiy, Russia',
        'mww', 6.7, 'green', 1],
       ['2018-10-13 04:34:15.580', '25km E of Bitung, Indonesia', 'mww',
        5.2, 'green', 0],
       ['2018-10-13 00:13:46.220', '42km WNW of Sola, Vanuatu', 'mww',
        5.7, 'green', 0],
       ['2018-10-12 21:09:49.240',
        '13km E of Nueva Concepcion, Guatemala', 'mww', 5.7, 'green', 0],
       ['2018-10-12 02:52:03.620', '128km SE of Kimbe, Papua New Guinea',
        'mww', 5.6, 'green', 1]], dtype=object)

In [23]:
df.columns

Index(['time', 'place', 'magType', 'mag', 'alert', 'tsunami'], dtype='object')

In [24]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [25]:
df.shape

(5, 6)

In [26]:
df + df

Unnamed: 0,time,place,magType,mag,alert,tsunami
0,2018-10-13 11:10:23.5602018-10-13 11:10:23.560,"262km NW of Ozernovskiy, Russia262km NW of Oze...",mwwmww,13.4,greengreen,2
1,2018-10-13 04:34:15.5802018-10-13 04:34:15.580,"25km E of Bitung, Indonesia25km E of Bitung, I...",mwwmww,10.4,greengreen,0
2,2018-10-13 00:13:46.2202018-10-13 00:13:46.220,"42km WNW of Sola, Vanuatu42km WNW of Sola, Van...",mwwmww,11.4,greengreen,0
3,2018-10-12 21:09:49.2402018-10-12 21:09:49.240,"13km E of Nueva Concepcion, Guatemala13km E of...",mwwmww,11.4,greengreen,0
4,2018-10-12 02:52:03.6202018-10-12 02:52:03.620,"128km SE of Kimbe, Papua New Guinea128km SE of...",mwwmww,11.2,greengreen,2


In [None]:
#The rest of this section introduces the concept of DataFrames. These are essentially groups of series that make up a table.

In [27]:
import datetime as dt
import numpy as np
import pandas as pd

In [28]:
np.random.seed(0) # set a seed for reproducibility
pd.Series(np.random.rand(5), name='random')

0    0.548814
1    0.715189
2    0.602763
3    0.544883
4    0.423655
Name: random, dtype: float64

In [29]:
pd.Series(np.linspace(0, 10, num=5)).to_frame()

Unnamed: 0,0
0,0.0
1,2.5
2,5.0
3,7.5
4,10.0


In [30]:
# Fix the global RNG state so both the random column and the coin flips below
# come out the same on every run.
np.random.seed(0)

# Column data. The entries are evaluated top to bottom, so the RNG is consumed
# by 'random' first and by 'truth' second.
columns = {
    'random': np.random.rand(5),
    'text': ['hot', 'warm', 'cool', 'cold', None],
    'truth': [np.random.choice([True, False]) for _ in range(5)],
}

# Five consecutive daily timestamps ending on 2019-04-21, used as the row index
dates = pd.date_range(
    end=dt.date(2019, 4, 21), periods=5, freq='1D', name='date'
)

pd.DataFrame(columns, index=dates)

Unnamed: 0_level_0,random,text,truth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-04-17,0.548814,hot,False
2019-04-18,0.715189,warm,True
2019-04-19,0.602763,cool,True
2019-04-20,0.544883,cold,False
2019-04-21,0.423655,,True


In [31]:
pd.DataFrame([
    {'mag': 5.2, 'place': 'California'},
    {'mag': 1.2, 'place': 'Alaska'},
    {'mag': 0.2, 'place': 'California'},
])

Unnamed: 0,mag,place
0,5.2,California
1,1.2,Alaska
2,0.2,California


In [32]:
# One (n, n squared, n cubed) tuple for each n in 0..4
list_of_tuples = [
    tuple(n ** power for power in (1, 2, 3))
    for n in range(5)
]
list_of_tuples

[(0, 0, 0), (1, 1, 1), (2, 4, 8), (3, 9, 27), (4, 16, 64)]

In [33]:
pd.DataFrame(
    list_of_tuples, 
    columns=['n', 'n_squared', 'n_cubed']
)

Unnamed: 0,n,n_squared,n_cubed
0,0,0,0
1,1,1,1
2,2,4,8
3,3,9,27
4,4,16,64


In [34]:
pd.DataFrame(
    np.array([
        [0, 0, 0],
        [1, 1, 1],
        [2, 4, 8],
        [3, 9, 27],
        [4, 16, 64]
    ]), columns=['n', 'n_squared', 'n_cubed']
)

Unnamed: 0,n,n_squared,n_cubed
0,0,0,0
1,1,1,1
2,2,4,8
3,3,9,27
4,4,16,64


In [35]:
!wc -l data/earthquakes.csv

9333 data/earthquakes.csv


In [36]:
!ls -lh data | grep earthquakes.csv

-rw-r--r-- 1 marc 197121 3.4M Aug 18 22:17 earthquakes.csv


In [37]:
files = !ls -lh data
[file for file in files if 'earthquake' in file]

['-rw-r--r-- 1 marc 197121 3.4M Aug 18 22:17 earthquakes.csv']

In [38]:
!head -n 2 data/earthquakes.csv

alert,cdi,code,detail,dmin,felt,gap,ids,mag,magType,mmi,net,nst,place,rms,sig,sources,status,time,title,tsunami,type,types,tz,updated,url
,,37389218,https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ci37389218&format=geojson,0.008693,,85.0,",ci37389218,",1.35,ml,,ci,26.0,"9km NE of Aguanga, CA",0.19,28,",ci,",automatic,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475395144,https://earthquake.usgs.gov/earthquakes/eventpage/ci37389218


In [39]:
!tail -n 1 data/earthquakes.csv

,,38063935,https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ci38063935&format=geojson,0.01698,,39.0,",ci38063935,",0.66,ml,,ci,24.0,"9km NE of Aguanga, CA",0.1,7,",ci,",reviewed,1537228864470,"M 0.7 - 9km NE of Aguanga, CA",0,earthquake,",focal-mechanism,geoserve,nearby-cities,origin,phase-data,scitech-link,",-480.0,1537305830770,https://earthquake.usgs.gov/earthquakes/eventpage/ci38063935


In [40]:
!awk -F',' '{print NF; exit}' data/earthquakes.csv

26


In [41]:
headers = !head -n 1 data/earthquakes.csv
len(headers[0].split(','))

26

In [42]:
df = pd.read_csv('data/earthquakes.csv')

In [43]:
# pd.read_csv also accepts a URL in place of a local path. The adjacent string
# literals are concatenated by Python into a single address. This rebinds df,
# replacing the frame read from the local file in the previous cell.
df = pd.read_csv(
    'https://github.com/stefmolin/'
    'Hands-On-Data-Analysis-with-Pandas-2nd-edition'
    '/blob/master/ch_02/data/earthquakes.csv?raw=True'
)

In [44]:
df.to_csv('output.csv', index=False)

In [45]:
import sqlite3
from contextlib import closing

# Using a sqlite3 connection as a context manager only commits (or rolls back)
# the transaction; it does NOT close the connection. closing() guarantees the
# database handle is released, while the inner `connection` context keeps the
# commit-on-success / rollback-on-error behaviour.
with closing(sqlite3.connect('data/quakes.db')) as connection, connection:
    # Load the CSV and (re)create the 'tsunamis' table from it, without
    # writing the DataFrame index as a column.
    pd.read_csv('data/tsunamis.csv').to_sql(
        'tsunamis', connection, index=False, if_exists='replace'
    )

In [46]:
import sqlite3
from contextlib import closing

# A sqlite3 connection used directly in a `with` statement is not closed on
# exit (only its transaction is finalised), so wrap it in closing() to release
# the database handle once the query has been read into memory.
with closing(sqlite3.connect('data/quakes.db')) as connection:
    tsunamis = pd.read_sql('SELECT * FROM tsunamis', connection)

tsunamis.head()

Unnamed: 0,alert,type,title,place,magType,mag,time
0,,earthquake,"M 5.0 - 165km NNW of Flying Fish Cove, Christm...","165km NNW of Flying Fish Cove, Christmas Island",mww,5.0,1539459504090
1,green,earthquake,"M 6.7 - 262km NW of Ozernovskiy, Russia","262km NW of Ozernovskiy, Russia",mww,6.7,1539429023560
2,green,earthquake,"M 5.6 - 128km SE of Kimbe, Papua New Guinea","128km SE of Kimbe, Papua New Guinea",mww,5.6,1539312723620
3,green,earthquake,"M 6.5 - 148km S of Severo-Kuril'sk, Russia","148km S of Severo-Kuril'sk, Russia",mww,6.5,1539213362130
4,green,earthquake,"M 6.2 - 94km SW of Kokopo, Papua New Guinea","94km SW of Kokopo, Papua New Guinea",mww,6.2,1539208835130


In [None]:
# The previous cells showed how to create a DataFrame in several ways, including reading from a CSV file and from a URL. They also showed how to write to and read from a database using SQL commands.

In [47]:
import datetime as dt
import pandas as pd
import requests

# Query window: the 30 days ending yesterday
yesterday = dt.date.today() - dt.timedelta(days=1)
api = 'https://earthquake.usgs.gov/fdsnws/event/1/query'
payload = {
    'format': 'geojson',
    'starttime': yesterday - dt.timedelta(days=30),
    'endtime': yesterday
}

# Always pass a timeout: without one, requests waits indefinitely on a stalled
# connection and the cell would hang the kernel. The value is in seconds.
response = requests.get(api, params=payload, timeout=60)

# let's make sure the request was OK
response.status_code

200

In [48]:
earthquake_json = response.json()
earthquake_json.keys()

dict_keys(['type', 'metadata', 'features', 'bbox'])

In [49]:
earthquake_json['metadata']

{'generated': 1725126023000,
 'url': 'https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=2024-07-31&endtime=2024-08-30',
 'title': 'USGS Earthquakes',
 'status': 200,
 'api': '1.14.1',
 'count': 9918}

In [50]:
type(earthquake_json['features'])

list

In [51]:
earthquake_json['features'][0]

{'type': 'Feature',
 'properties': {'mag': 1.3,
  'place': '39 km N of Valdez, Alaska',
  'time': 1724975989344,
  'updated': 1724976075079,
  'tz': None,
  'url': 'https://earthquake.usgs.gov/earthquakes/eventpage/ak024b4m3ph2',
  'detail': 'https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ak024b4m3ph2&format=geojson',
  'felt': None,
  'cdi': None,
  'mmi': None,
  'alert': None,
  'status': 'automatic',
  'tsunami': 0,
  'sig': 26,
  'net': 'ak',
  'code': '024b4m3ph2',
  'ids': ',ak024b4m3ph2,',
  'sources': ',ak,',
  'types': ',origin,phase-data,',
  'nst': None,
  'dmin': None,
  'rms': 0.26,
  'gap': None,
  'magType': 'ml',
  'type': 'earthquake',
  'title': 'M 1.3 - 39 km N of Valdez, Alaska'},
 'geometry': {'type': 'Point', 'coordinates': [-146.4831, 61.4815, 32.1]},
 'id': 'ak024b4m3ph2'}

In [52]:
# Keep only the 'properties' mapping of each GeoJSON feature
earthquake_properties_data = []
for quake in earthquake_json['features']:
    earthquake_properties_data.append(quake['properties'])

# A list of dicts becomes one row per dict, with the dict keys as columns
df = pd.DataFrame(earthquake_properties_data)
df.head()

Unnamed: 0,mag,place,time,updated,tz,url,detail,felt,cdi,mmi,...,ids,sources,types,nst,dmin,rms,gap,magType,type,title
0,1.3,"39 km N of Valdez, Alaska",1724975989344,1724976075079,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",ak024b4m3ph2,",",ak,",",origin,phase-data,",,,0.26,,ml,earthquake,"M 1.3 - 39 km N of Valdez, Alaska"
1,1.94,"6 km SSW of Pāhala, Hawaii",1724975340200,1724975432020,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",hv74430752,",",hv,",",origin,phase-data,",15.0,0.1031,0.16,231.0,md,earthquake,"M 1.9 - 6 km SSW of Pāhala, Hawaii"
2,0.7,"18 km ESE of Anza, CA",1724974870130,1725023751311,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",ci40719471,",",ci,",",nearby-cities,origin,phase-data,scitech-link,",23.0,0.06188,0.12,60.0,ml,earthquake,"M 0.7 - 18 km ESE of Anza, CA"
3,1.14,"6 km WNW of Cobb, CA",1724974358680,1724980036557,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",nc75055081,",",nc,",",nearby-cities,origin,phase-data,scitech-link,",17.0,0.00314,0.01,67.0,md,earthquake,"M 1.1 - 6 km WNW of Cobb, CA"
4,1.2,"8 km ENE of Goldfield, Nevada",1724974241410,1724976718761,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",nn00883437,",",nn,",",origin,phase-data,",14.0,0.19,0.144,93.8,ml,earthquake,"M 1.2 - 8 km ENE of Goldfield, Nevada"


In [53]:
df.to_csv('earthquakes.csv', index=False)

In [None]:
#The previous cells showed how to pull data from an API and use that to create a DataFrame using Pandas.

In [None]:
import numpy as np
import pandas as pd

df = pd.read_csv('data/earthquakes.csv')

In [54]:
df.empty

False

In [55]:
df.shape

(9918, 26)

In [56]:
df.columns

Index(['mag', 'place', 'time', 'updated', 'tz', 'url', 'detail', 'felt', 'cdi',
       'mmi', 'alert', 'status', 'tsunami', 'sig', 'net', 'code', 'ids',
       'sources', 'types', 'nst', 'dmin', 'rms', 'gap', 'magType', 'type',
       'title'],
      dtype='object')

In [57]:
df.head()

Unnamed: 0,mag,place,time,updated,tz,url,detail,felt,cdi,mmi,...,ids,sources,types,nst,dmin,rms,gap,magType,type,title
0,1.3,"39 km N of Valdez, Alaska",1724975989344,1724976075079,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",ak024b4m3ph2,",",ak,",",origin,phase-data,",,,0.26,,ml,earthquake,"M 1.3 - 39 km N of Valdez, Alaska"
1,1.94,"6 km SSW of Pāhala, Hawaii",1724975340200,1724975432020,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",hv74430752,",",hv,",",origin,phase-data,",15.0,0.1031,0.16,231.0,md,earthquake,"M 1.9 - 6 km SSW of Pāhala, Hawaii"
2,0.7,"18 km ESE of Anza, CA",1724974870130,1725023751311,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",ci40719471,",",ci,",",nearby-cities,origin,phase-data,scitech-link,",23.0,0.06188,0.12,60.0,ml,earthquake,"M 0.7 - 18 km ESE of Anza, CA"
3,1.14,"6 km WNW of Cobb, CA",1724974358680,1724980036557,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",nc75055081,",",nc,",",nearby-cities,origin,phase-data,scitech-link,",17.0,0.00314,0.01,67.0,md,earthquake,"M 1.1 - 6 km WNW of Cobb, CA"
4,1.2,"8 km ENE of Goldfield, Nevada",1724974241410,1724976718761,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",nn00883437,",",nn,",",origin,phase-data,",14.0,0.19,0.144,93.8,ml,earthquake,"M 1.2 - 8 km ENE of Goldfield, Nevada"


In [58]:
df.tail(2)

Unnamed: 0,mag,place,time,updated,tz,url,detail,felt,cdi,mmi,...,ids,sources,types,nst,dmin,rms,gap,magType,type,title
9916,0.68,"9 km SSW of Idyllwild, CA",1722384315870,1722385429908,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",ci40677231,",",ci,",",nearby-cities,origin,phase-data,scitech-link,",24.0,0.05325,0.13,106.0,ml,earthquake,"M 0.7 - 9 km SSW of Idyllwild, CA"
9917,0.42,"2 km NNW of The Geysers, CA",1722384012050,1724104840895,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",nc75041627,",",nc,",",nearby-cities,origin,phase-data,scitech-link,",24.0,0.008375,0.02,70.0,md,earthquake,"M 0.4 - 2 km NNW of The Geysers, CA"


In [59]:
df.dtypes

mag        float64
place       object
time         int64
updated      int64
tz          object
url         object
detail      object
felt       float64
cdi        float64
mmi        float64
alert       object
status      object
tsunami      int64
sig          int64
net         object
code        object
ids         object
sources     object
types       object
nst        float64
dmin       float64
rms        float64
gap        float64
magType     object
type        object
title       object
dtype: object

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9918 entries, 0 to 9917
Data columns (total 26 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   mag      9915 non-null   float64
 1   place    9918 non-null   object 
 2   time     9918 non-null   int64  
 3   updated  9918 non-null   int64  
 4   tz       0 non-null      object 
 5   url      9918 non-null   object 
 6   detail   9918 non-null   object 
 7   felt     897 non-null    float64
 8   cdi      897 non-null    float64
 9   mmi      114 non-null    float64
 10  alert    47 non-null     object 
 11  status   9918 non-null   object 
 12  tsunami  9918 non-null   int64  
 13  sig      9918 non-null   int64  
 14  net      9918 non-null   object 
 15  code     9918 non-null   object 
 16  ids      9918 non-null   object 
 17  sources  9918 non-null   object 
 18  types    9918 non-null   object 
 19  nst      8710 non-null   float64
 20  dmin     8707 non-null   float64
 21  rms      9908 

In [61]:
df.describe()

Unnamed: 0,mag,time,updated,felt,cdi,mmi,tsunami,sig,nst,dmin,rms,gap
count,9915.0,9918.0,9918.0,897.0,897.0,114.0,9918.0,9918.0,8710.0,8707.0,9908.0,8708.0
mean,1.559839,1723646000000.0,1723901000000.0,57.717949,2.293757,3.379035,0.000907,57.358036,25.843513,0.341005,0.246787,106.606732
std,1.123445,747269500.0,737724600.0,895.854706,1.415367,1.574974,0.030112,86.26946,21.416429,1.863213,0.245321,63.651314
min,-1.33,1722384000000.0,1722385000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0
25%,0.86,1723015000000.0,1723339000000.0,1.0,1.0,2.67625,0.0,11.0,11.0,0.01318,0.1,62.0
50%,1.4,1723622000000.0,1724090000000.0,1.0,2.2,3.5175,0.0,30.0,20.0,0.04691,0.17,88.0
75%,1.94,1724261000000.0,1724477000000.0,3.0,3.1,4.24425,0.0,58.0,34.0,0.1,0.27,134.0
max,7.1,1724976000000.0,1725122000000.0,24211.0,8.3,7.22,1.0,1099.0,330.0,112.8,2.34,350.0


In [62]:
df.describe(percentiles=[0.05, 0.95])

Unnamed: 0,mag,time,updated,felt,cdi,mmi,tsunami,sig,nst,dmin,rms,gap
count,9915.0,9918.0,9918.0,897.0,897.0,114.0,9918.0,9918.0,8710.0,8707.0,9908.0,8708.0
mean,1.559839,1723646000000.0,1723901000000.0,57.717949,2.293757,3.379035,0.000907,57.358036,25.843513,0.341005,0.246787,106.606732
std,1.123445,747269500.0,737724600.0,895.854706,1.415367,1.574974,0.030112,86.26946,21.416429,1.863213,0.245321,63.651314
min,-1.33,1722384000000.0,1722385000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0
5%,0.1,1722509000000.0,1722562000000.0,0.0,0.0,0.0,0.0,0.0,5.0,0.002896,0.02,36.0
50%,1.4,1723622000000.0,1724090000000.0,1.0,2.2,3.5175,0.0,30.0,20.0,0.04691,0.17,88.0
95%,4.3,1724858000000.0,1724954000000.0,45.0,4.8,5.8474,0.0,285.45,64.0,1.6891,0.8065,243.0
max,7.1,1724976000000.0,1725122000000.0,24211.0,8.3,7.22,1.0,1099.0,330.0,112.8,2.34,350.0


In [63]:
# Summarise only the object-dtype (string) columns. `np.object` was merely a
# deprecated alias for the builtin `object` and was removed in NumPy 1.24, so
# use the builtin directly; it selects the same columns.
df.describe(include=object)

Unnamed: 0,place,tz,url,detail,alert,status,net,code,ids,sources,types,magType,type,title
count,9918,0.0,9918,9918,47,9918,9918,9918,9918,9918,9918,9915,9918,9918
unique,4884,0.0,9918,9918,2,2,15,9907,9918,73,57,9,6,7657
top,"6 km S of Volcano, Hawaii",,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,green,reviewed,ci,2024qylq,",pr71456968,",",ci,",",origin,phase-data,",ml,earthquake,"M 1.9 - 6 km S of Volcano, Hawaii"
freq,143,,1,1,46,6685,2250,2,1,2140,5736,6901,9737,22


In [64]:
df.describe(include='all')

Unnamed: 0,mag,place,time,updated,tz,url,detail,felt,cdi,mmi,...,ids,sources,types,nst,dmin,rms,gap,magType,type,title
count,9915.0,9918,9918.0,9918.0,0.0,9918,9918,897.0,897.0,114.0,...,9918,9918,9918,8710.0,8707.0,9908.0,8708.0,9915,9918,9918
unique,,4884,,,0.0,9918,9918,,,,...,9918,73,57,,,,,9,6,7657
top,,"6 km S of Volcano, Hawaii",,,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",pr71456968,",",ci,",",origin,phase-data,",,,,,ml,earthquake,"M 1.9 - 6 km S of Volcano, Hawaii"
freq,,143,,,,1,1,,,,...,1,2140,5736,,,,,6901,9737,22
mean,1.559839,,1723646000000.0,1723901000000.0,,,,57.717949,2.293757,3.379035,...,,,,25.843513,0.341005,0.246787,106.606732,,,
std,1.123445,,747269500.0,737724600.0,,,,895.854706,1.415367,1.574974,...,,,,21.416429,1.863213,0.245321,63.651314,,,
min,-1.33,,1722384000000.0,1722385000000.0,,,,0.0,0.0,0.0,...,,,,0.0,0.0,0.0,14.0,,,
25%,0.86,,1723015000000.0,1723339000000.0,,,,1.0,1.0,2.67625,...,,,,11.0,0.01318,0.1,62.0,,,
50%,1.4,,1723622000000.0,1724090000000.0,,,,1.0,2.2,3.5175,...,,,,20.0,0.04691,0.17,88.0,,,
75%,1.94,,1724261000000.0,1724477000000.0,,,,3.0,3.1,4.24425,...,,,,34.0,0.1,0.27,134.0,,,


In [65]:
df.felt.describe()

count      897.000000
mean        57.717949
std        895.854706
min          0.000000
25%          1.000000
50%          1.000000
75%          3.000000
max      24211.000000
Name: felt, dtype: float64

In [66]:
df.alert.unique()

array([None, 'green', 'yellow'], dtype=object)

In [67]:
df.alert.value_counts()

green     46
yellow     1
Name: alert, dtype: int64

In [None]:
# The previous cells dealt with inspecting the data in a DataFrame using simple DataFrame attributes and methods that provide powerful summaries with little code.

In [68]:
import pandas as pd

df = pd.read_csv('data/earthquakes.csv')

In [69]:
df.mag

0       1.35
1       1.29
2       3.42
3       0.44
4       2.16
        ... 
9327    0.62
9328    1.00
9329    2.40
9330    1.10
9331    0.66
Name: mag, Length: 9332, dtype: float64

In [70]:
df['mag']

0       1.35
1       1.29
2       3.42
3       0.44
4       2.16
        ... 
9327    0.62
9328    1.00
9329    2.40
9330    1.10
9331    0.66
Name: mag, Length: 9332, dtype: float64

In [71]:
df[['mag', 'title']]

Unnamed: 0,mag,title
0,1.35,"M 1.4 - 9km NE of Aguanga, CA"
1,1.29,"M 1.3 - 9km NE of Aguanga, CA"
2,3.42,"M 3.4 - 8km NE of Aguanga, CA"
3,0.44,"M 0.4 - 9km NE of Aguanga, CA"
4,2.16,"M 2.2 - 10km NW of Avenal, CA"
...,...,...
9327,0.62,"M 0.6 - 9km ENE of Mammoth Lakes, CA"
9328,1.00,"M 1.0 - 3km W of Julian, CA"
9329,2.40,"M 2.4 - 35km NNE of Hatillo, Puerto Rico"
9330,1.10,"M 1.1 - 9km NE of Aguanga, CA"


In [72]:
# Select title and time, plus every column whose name starts with 'mag'
df[['title', 'time', *(name for name in df.columns if name.startswith('mag'))]]

Unnamed: 0,title,time,mag,magType
0,"M 1.4 - 9km NE of Aguanga, CA",1539475168010,1.35,ml
1,"M 1.3 - 9km NE of Aguanga, CA",1539475129610,1.29,ml
2,"M 3.4 - 8km NE of Aguanga, CA",1539475062610,3.42,ml
3,"M 0.4 - 9km NE of Aguanga, CA",1539474978070,0.44,ml
4,"M 2.2 - 10km NW of Avenal, CA",1539474716050,2.16,md
...,...,...,...,...
9327,"M 0.6 - 9km ENE of Mammoth Lakes, CA",1537230228060,0.62,md
9328,"M 1.0 - 3km W of Julian, CA",1537230135130,1.00,ml
9329,"M 2.4 - 35km NNE of Hatillo, Puerto Rico",1537229908180,2.40,md
9330,"M 1.1 - 9km NE of Aguanga, CA",1537229545350,1.10,ml


In [73]:
[col for col in df.columns if col.startswith('mag')]

['mag', 'magType']

In [74]:
# Fixed columns followed by the dynamically matched 'mag*' columns; the
# parentheses replace the backslash line continuation.
(
    ['title', 'time']
    + [name for name in df.columns if name.startswith('mag')]
)

['title', 'time', 'mag', 'magType']

In [75]:
df[
    ['title', 'time']
    + [col for col in df.columns if col.startswith('mag')]
]

Unnamed: 0,title,time,mag,magType
0,"M 1.4 - 9km NE of Aguanga, CA",1539475168010,1.35,ml
1,"M 1.3 - 9km NE of Aguanga, CA",1539475129610,1.29,ml
2,"M 3.4 - 8km NE of Aguanga, CA",1539475062610,3.42,ml
3,"M 0.4 - 9km NE of Aguanga, CA",1539474978070,0.44,ml
4,"M 2.2 - 10km NW of Avenal, CA",1539474716050,2.16,md
...,...,...,...,...
9327,"M 0.6 - 9km ENE of Mammoth Lakes, CA",1537230228060,0.62,md
9328,"M 1.0 - 3km W of Julian, CA",1537230135130,1.00,ml
9329,"M 2.4 - 35km NNE of Hatillo, Puerto Rico",1537229908180,2.40,md
9330,"M 1.1 - 9km NE of Aguanga, CA",1537229545350,1.10,ml


In [76]:
df[100:103]

Unnamed: 0,alert,cdi,code,detail,dmin,felt,gap,ids,mag,magType,...,sources,status,time,title,tsunami,type,types,tz,updated,url
100,,,20280310,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,",ak20280310,",1.2,ml,...,",ak,",automatic,1539435449480,"M 1.2 - 25km NW of Ester, Alaska",0,earthquake,",geoserve,origin,",-540.0,1539443551010,https://earthquake.usgs.gov/earthquakes/eventp...
101,,,73096756,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.01355,,185.0,",nc73096756,",0.59,md,...,",nc,",automatic,1539435391320,"M 0.6 - 8km ESE of Mammoth Lakes, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,scit...",-480.0,1539439802162,https://earthquake.usgs.gov/earthquakes/eventp...
102,,,37388730,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.02987,,39.0,",ci37388730,",1.33,ml,...,",ci,",automatic,1539435293090,"M 1.3 - 8km ENE of Aguanga, CA",0,earthquake,",focal-mechanism,geoserve,nearby-cities,origin...",-480.0,1539435940470,https://earthquake.usgs.gov/earthquakes/eventp...


In [77]:
df[['title', 'time']][100:103]

Unnamed: 0,title,time
100,"M 1.2 - 25km NW of Ester, Alaska",1539435449480
101,"M 0.6 - 8km ESE of Mammoth Lakes, CA",1539435391320
102,"M 1.3 - 8km ENE of Aguanga, CA",1539435293090


In [78]:
# Slicing rows then selecting columns gives the same frame as selecting
# columns then slicing rows.
rows_then_columns = df[100:103][['title', 'time']]
columns_then_rows = df[['title', 'time']][100:103]
rows_then_columns.equals(columns_then_rows)

True

In [79]:
# Deliberate demonstration of chained indexing: df[110:113] is evaluated first
# and the assignment then targets that intermediate object, which is why pandas
# emits SettingWithCopyWarning here. The warning itself recommends a single
# .loc[row_indexer, col_indexer] assignment instead.
df[110:113]['title'] = df[110:113]['title'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[110:113]['title'] = df[110:113]['title'].str.lower()


In [80]:
df[110:113]['title']

110               m 1.1 - 35km s of ester, alaska
111    m 1.9 - 93km wnw of arctic village, alaska
112      m 0.9 - 20km wsw of smith valley, nevada
Name: title, dtype: object

In [81]:
# Assign through a single .loc[rows, column] call so pandas writes to df itself.
# .loc slices by label and includes the end label, so 110:112 covers rows
# 110, 111 and 112.
df.loc[110:112, 'title'] = df.loc[110:112, 'title'].str.lower()
df.loc[110:112, 'title']

110               m 1.1 - 35km s of ester, alaska
111    m 1.9 - 93km wnw of arctic village, alaska
112      m 0.9 - 20km wsw of smith valley, nevada
Name: title, dtype: object

In [82]:
df.loc[:,'title']

0                  M 1.4 - 9km NE of Aguanga, CA
1                  M 1.3 - 9km NE of Aguanga, CA
2                  M 3.4 - 8km NE of Aguanga, CA
3                  M 0.4 - 9km NE of Aguanga, CA
4                  M 2.2 - 10km NW of Avenal, CA
                          ...                   
9327        M 0.6 - 9km ENE of Mammoth Lakes, CA
9328                 M 1.0 - 3km W of Julian, CA
9329    M 2.4 - 35km NNE of Hatillo, Puerto Rico
9330               M 1.1 - 9km NE of Aguanga, CA
9331               M 0.7 - 9km NE of Aguanga, CA
Name: title, Length: 9332, dtype: object

In [83]:
df.loc[10:15, ['title', 'mag']]

Unnamed: 0,title,mag
10,"M 0.5 - 10km NE of Aguanga, CA",0.5
11,"M 2.8 - 53km SE of Punta Cana, Dominican Republic",2.77
12,"M 0.5 - 9km NE of Aguanga, CA",0.5
13,"M 4.5 - 120km SSW of Banda Aceh, Indonesia",4.5
14,"M 2.1 - 14km NW of Parkfield, CA",2.13
15,"M 2.0 - 156km WNW of Haines Junction, Canada",2.0


In [84]:
df.iloc[10:15, [19, 8]]

Unnamed: 0,title,mag
10,"M 0.5 - 10km NE of Aguanga, CA",0.5
11,"M 2.8 - 53km SE of Punta Cana, Dominican Republic",2.77
12,"M 0.5 - 9km NE of Aguanga, CA",0.5
13,"M 4.5 - 120km SSW of Banda Aceh, Indonesia",4.5
14,"M 2.1 - 14km NW of Parkfield, CA",2.13


In [85]:
df.iloc[10:15, 6:10]

Unnamed: 0,gap,ids,mag,magType
10,57.0,",ci37389162,",0.5,ml
11,186.0,",pr2018286010,",2.77,md
12,76.0,",ci37389146,",0.5,ml
13,157.0,",us1000hbti,",4.5,mb
14,71.0,",nc73096921,",2.13,md


In [86]:
# Position-based slices exclude the end point, label-based slices include it,
# so these two selections cover the same rows and columns.
by_position = df.iloc[10:15, 6:10]
by_label = df.loc[10:14, 'gap':'magType']
by_position.equals(by_label)

True

In [87]:
df.at[10, 'mag']

0.5

In [88]:
df.iat[10, 8]

0.5

In [89]:
df.mag > 2

0       False
1       False
2        True
3       False
4        True
        ...  
9327    False
9328    False
9329     True
9330    False
9331    False
Name: mag, Length: 9332, dtype: bool

In [90]:
df[df.mag >= 7.0]

Unnamed: 0,alert,cdi,code,detail,dmin,felt,gap,ids,mag,magType,...,sources,status,time,title,tsunami,type,types,tz,updated,url
837,green,4.1,1000haa3,https://earthquake.usgs.gov/fdsnws/event/1/que...,1.763,3.0,14.0,",us1000haa3,pt18283003,at00pgehsk,",7.0,mww,...,",us,pt,at,",reviewed,1539204500290,"M 7.0 - 117km E of Kimbe, Papua New Guinea",1,earthquake,",dyfi,finite-fault,general-text,geoserve,groun...",600.0,1539378744253,https://earthquake.usgs.gov/earthquakes/eventp...
5263,red,8.4,1000h3p4,https://earthquake.usgs.gov/fdsnws/event/1/que...,1.589,18.0,27.0,",us1000h3p4,us1000h4p4,",7.5,mww,...,",us,us,",reviewed,1538128963480,"M 7.5 - 78km N of Palu, Indonesia",1,earthquake,",dyfi,finite-fault,general-text,geoserve,groun...",480.0,1539123134531,https://earthquake.usgs.gov/earthquakes/eventp...


In [91]:
df.loc[
    df.mag >= 7.0,
    ['alert', 'mag', 'magType', 'title', 'tsunami', 'type']
]

Unnamed: 0,alert,mag,magType,title,tsunami,type
837,green,7.0,mww,"M 7.0 - 117km E of Kimbe, Papua New Guinea",1,earthquake
5263,red,7.5,mww,"M 7.5 - 78km N of Palu, Indonesia",1,earthquake


In [92]:
df.loc[
    (df.tsunami == 1) & (df.alert == 'red'),
    ['alert', 'mag', 'magType', 'title', 'tsunami', 'type']
]

Unnamed: 0,alert,mag,magType,title,tsunami,type
5263,red,7.5,mww,"M 7.5 - 78km N of Palu, Indonesia",1,earthquake


In [93]:
df.loc[
    (df.tsunami == 1) | (df.alert == 'red'),
    ['alert', 'mag', 'magType', 'title', 'tsunami', 'type']
]

Unnamed: 0,alert,mag,magType,title,tsunami,type
36,,5.0,mww,"M 5.0 - 165km NNW of Flying Fish Cove, Christm...",1,earthquake
118,green,6.7,mww,"M 6.7 - 262km NW of Ozernovskiy, Russia",1,earthquake
501,green,5.6,mww,"M 5.6 - 128km SE of Kimbe, Papua New Guinea",1,earthquake
799,green,6.5,mww,"M 6.5 - 148km S of Severo-Kuril'sk, Russia",1,earthquake
816,green,6.2,mww,"M 6.2 - 94km SW of Kokopo, Papua New Guinea",1,earthquake
...,...,...,...,...,...,...
8561,,5.4,mb,"M 5.4 - 228km S of Taron, Papua New Guinea",1,earthquake
8624,,5.1,mb,"M 5.1 - 278km SE of Pondaguitan, Philippines",1,earthquake
9133,green,5.1,ml,"M 5.1 - 64km SSW of Kaktovik, Alaska",1,earthquake
9175,,5.2,mb,"M 5.2 - 126km N of Dili, East Timor",1,earthquake


In [94]:
# Boolean masks: place mentions Alaska, and the alert level is not missing
in_alaska = df.place.str.contains('Alaska')
has_alert = df.alert.notnull()

df.loc[
    in_alaska & has_alert,
    ['alert', 'mag', 'magType', 'title', 'tsunami', 'type']
]

Unnamed: 0,alert,mag,magType,title,tsunami,type
1015,green,5.0,ml,"M 5.0 - 61km SSW of Chignik Lake, Alaska",1,earthquake
1273,green,4.0,ml,"M 4.0 - 71km SW of Kaktovik, Alaska",1,earthquake
1795,green,4.0,ml,"M 4.0 - 60km WNW of Valdez, Alaska",1,earthquake
2752,green,4.0,ml,"M 4.0 - 67km SSW of Kaktovik, Alaska",1,earthquake
3260,green,3.9,ml,"M 3.9 - 44km N of North Nenana, Alaska",0,earthquake
4101,green,4.2,ml,"M 4.2 - 131km NNW of Arctic Village, Alaska",0,earthquake
6897,green,3.8,ml,"M 3.8 - 80km SSW of Kaktovik, Alaska",0,earthquake
8524,green,3.8,ml,"M 3.8 - 69km SSW of Kaktovik, Alaska",0,earthquake
9133,green,5.1,ml,"M 5.1 - 64km SSW of Kaktovik, Alaska",1,earthquake


In [95]:
# Earthquakes in California with a magnitude above 3.8. The place names are
# inconsistent ("CA" vs. "California"), so accept either spelling, but only at
# the end of the string. The alternation is grouped so that `$` applies to both
# spellings; ungrouped, a bare "CA" anywhere in the place name would match.
# The group is non-capturing because str.contains() only needs a yes/no match.
df.loc[
    (df.place.str.contains(r'(?:CA|California)$')) & (df.mag > 3.8),
    ['alert', 'mag', 'magType', 'title', 'tsunami', 'type']
]

Unnamed: 0,alert,mag,magType,title,tsunami,type
1465,green,3.83,mw,"M 3.8 - 109km WNW of Trinidad, CA",0,earthquake
2414,green,3.83,mw,"M 3.8 - 5km SW of Tres Pinos, CA",1,earthquake


In [96]:
# Magnitudes in the closed interval [6.5, 7.5]; both endpoints are included.
df.loc[
    (df.mag >= 6.5) & (df.mag <= 7.5),
    ['alert', 'mag', 'magType', 'title', 'tsunami', 'type']
]

Unnamed: 0,alert,mag,magType,title,tsunami,type
118,green,6.7,mww,"M 6.7 - 262km NW of Ozernovskiy, Russia",1,earthquake
799,green,6.5,mww,"M 6.5 - 148km S of Severo-Kuril'sk, Russia",1,earthquake
837,green,7.0,mww,"M 7.0 - 117km E of Kimbe, Papua New Guinea",1,earthquake
4363,green,6.7,mww,"M 6.7 - 263km NNE of Ndoi Island, Fiji",1,earthquake
5263,red,7.5,mww,"M 7.5 - 78km N of Palu, Indonesia",1,earthquake


In [97]:
df.loc[
    df.magType.isin(['mw', 'mwb']),
    ['alert', 'mag', 'magType', 'title', 'tsunami', 'type']
]

Unnamed: 0,alert,mag,magType,title,tsunami,type
995,,3.35,mw,"M 3.4 - 9km WNW of Cobb, CA",0,earthquake
1465,green,3.83,mw,"M 3.8 - 109km WNW of Trinidad, CA",0,earthquake
2414,green,3.83,mw,"M 3.8 - 5km SW of Tres Pinos, CA",1,earthquake
4988,green,4.41,mw,"M 4.4 - 1km SE of Delta, B.C., MX",1,earthquake
6307,green,5.8,mwb,"M 5.8 - 297km NNE of Ndoi Island, Fiji",0,earthquake
8257,green,5.7,mwb,"M 5.7 - 175km SSE of Lambasa, Fiji",0,earthquake


In [98]:
[df.mag.idxmin(), df.mag.idxmax()]

[2409, 5263]

In [99]:
df.loc[
    [df.mag.idxmin(), df.mag.idxmax()],
    ['alert', 'mag', 'magType', 'title', 'tsunami', 'type']
]

Unnamed: 0,alert,mag,magType,title,tsunami,type
2409,,-1.26,ml,"M -1.3 - 41km ENE of Adak, Alaska",0,earthquake
5263,red,7.5,mww,"M 7.5 - 78km N of Palu, Indonesia",1,earthquake


In [100]:
df.filter(items=['mag', 'magType']).head()

Unnamed: 0,mag,magType
0,1.35,ml
1,1.29,ml
2,3.42,ml
3,0.44,ml
4,2.16,md


In [101]:
df.filter(like='mag').head()

Unnamed: 0,mag,magType
0,1.35,ml
1,1.29,ml
2,3.42,ml
3,0.44,ml
4,2.16,md


In [102]:
df.filter(regex=r'^t').head()

Unnamed: 0,time,title,tsunami,type,types,tz
0,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0
1,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0
2,1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,earthquake,",dyfi,focal-mechanism,geoserve,nearby-cities,o...",-480.0
3,1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0
4,1539474716050,"M 2.2 - 10km NW of Avenal, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,scit...",-480.0


In [103]:
# Use `place` as the row labels so filter() can match on them, keep the rows
# whose label contains "Japan", then narrow the result to three columns.
(
    df.set_index('place')
    .filter(like='Japan', axis='index')
    .filter(items=['mag', 'magType', 'title'])
    .head()
)

Unnamed: 0_level_0,mag,magType,title
place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"160km NNW of Nago, Japan",4.6,mb,"M 4.6 - 160km NNW of Nago, Japan"
"7km ESE of Asahi, Japan",5.2,mww,"M 5.2 - 7km ESE of Asahi, Japan"
"14km E of Tomakomai, Japan",4.5,mwr,"M 4.5 - 14km E of Tomakomai, Japan"
"139km WSW of Naze, Japan",4.7,mb,"M 4.7 - 139km WSW of Naze, Japan"
"53km ESE of Kamaishi, Japan",4.6,mb,"M 4.6 - 53km ESE of Kamaishi, Japan"


In [104]:
df.set_index('place').title.filter(like='Japan').head()

place
160km NNW of Nago, Japan          M 4.6 - 160km NNW of Nago, Japan
7km ESE of Asahi, Japan            M 5.2 - 7km ESE of Asahi, Japan
14km E of Tomakomai, Japan      M 4.5 - 14km E of Tomakomai, Japan
139km WSW of Naze, Japan          M 4.7 - 139km WSW of Naze, Japan
53km ESE of Kamaishi, Japan    M 4.6 - 53km ESE of Kamaishi, Japan
Name: title, dtype: object

In [None]:
# The previous cells illustrate how to subset data using DataFrame methods in pandas. There are often multiple ways to select the same data.

In [105]:
import pandas as pd

df = pd.read_csv(
    'data/earthquakes.csv', 
    usecols=['time', 'title', 'place', 'magType', 'mag', 'alert', 'tsunami']
)

In [106]:
df['source'] = 'USGS API'
df.head()

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API
2,,3.42,ml,"8km NE of Aguanga, CA",1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,USGS API
3,,0.44,ml,"9km NE of Aguanga, CA",1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,USGS API
4,,2.16,md,"10km NW of Avenal, CA",1539474716050,"M 2.2 - 10km NW of Avenal, CA",0,USGS API


In [107]:
# Add a Boolean column marking the rows whose magnitude is below zero.
df['mag_negative'] = df.mag.lt(0)
df.head()

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API,False
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API,False
2,,3.42,ml,"8km NE of Aguanga, CA",1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,USGS API,False
3,,0.44,ml,"9km NE of Aguanga, CA",1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,USGS API,False
4,,2.16,md,"10km NW of Avenal, CA",1539474716050,"M 2.2 - 10km NW of Avenal, CA",0,USGS API,False


In [108]:
df.place.str.extract(r', (.*$)')[0].sort_values().unique()

array(['Afghanistan', 'Alaska', 'Argentina', 'Arizona', 'Arkansas',
       'Australia', 'Azerbaijan', 'B.C., MX', 'Barbuda', 'Bolivia',
       'Bonaire, Saint Eustatius and Saba ', 'British Virgin Islands',
       'Burma', 'CA', 'California', 'Canada', 'Chile', 'China',
       'Christmas Island', 'Colombia', 'Colorado', 'Costa Rica',
       'Dominican Republic', 'East Timor', 'Ecuador', 'Ecuador region',
       'El Salvador', 'Fiji', 'Greece', 'Greenland', 'Guam', 'Guatemala',
       'Haiti', 'Hawaii', 'Honduras', 'Idaho', 'Illinois', 'India',
       'Indonesia', 'Iran', 'Iraq', 'Italy', 'Jamaica', 'Japan', 'Kansas',
       'Kentucky', 'Kyrgyzstan', 'Martinique', 'Mauritius', 'Mayotte',
       'Mexico', 'Missouri', 'Montana', 'NV', 'Nevada', 'New Caledonia',
       'New Hampshire', 'New Mexico', 'New Zealand', 'Nicaragua',
       'North Carolina', 'Northern Mariana Islands', 'Oklahoma', 'Oregon',
       'Pakistan', 'Papua New Guinea', 'Peru', 'Philippines',
       'Puerto Rico', 'Roman

In [109]:
# Reduce `place` to a state/country name. The steps are order-dependent: each
# replacement runs on the output of the previous one. `regex` is passed
# explicitly on every call because its default differs between pandas versions;
# the plain-text replacements are marked regex=False so they are always literal.
df['parsed_place'] = df.place.str.replace(
    r'.* of ', '', regex=True # remove everything up to and including the last " of "
).str.replace(
    'the ', '', regex=False # remove "the "
).str.replace(
    r'CA$', 'California', regex=True # fix California
).str.replace(
    r'NV$', 'Nevada', regex=True # fix Nevada
).str.replace(
    r'MX$', 'Mexico', regex=True # fix Mexico
).str.replace(
    r' region$', '', regex=True # chop off endings with " region"
).str.replace(
    'northern ', '', regex=False # remove "northern "
).str.replace(
    'Fiji Islands', 'Fiji', regex=False # line up the Fiji places
).str.replace(
    r'^.*, ', '', regex=True # remove everything up to and including the last ", "
).str.strip() # remove any extra spaces

In [110]:
df.parsed_place.sort_values().unique()

array(['Afghanistan', 'Alaska', 'Argentina', 'Arizona', 'Arkansas',
       'Ascension Island', 'Australia', 'Azerbaijan', 'Balleny Islands',
       'Barbuda', 'Bolivia', 'British Virgin Islands', 'Burma',
       'California', 'Canada', 'Carlsberg Ridge',
       'Central East Pacific Rise', 'Central Mid-Atlantic Ridge', 'Chile',
       'China', 'Christmas Island', 'Colombia', 'Colorado', 'Costa Rica',
       'Dominican Republic', 'East Timor', 'Ecuador', 'El Salvador',
       'Fiji', 'Greece', 'Greenland', 'Guam', 'Guatemala', 'Haiti',
       'Hawaii', 'Honduras', 'Idaho', 'Illinois', 'India',
       'Indian Ocean Triple Junction', 'Indonesia', 'Iran', 'Iraq',
       'Italy', 'Jamaica', 'Japan', 'Kansas', 'Kentucky',
       'Kermadec Islands', 'Kuril Islands', 'Kyrgyzstan', 'Martinique',
       'Mauritius', 'Mayotte', 'Mexico', 'Mid-Indian Ridge', 'Missouri',
       'Montana', 'Nevada', 'New Caledonia', 'New Hampshire',
       'New Mexico', 'New Zealand', 'Nicaragua', 'North Carolina',


In [111]:
df.assign(
    in_ca=df.parsed_place.str.endswith('California'),
    in_alaska=df.parsed_place.str.endswith('Alaska')
).sample(5, random_state=0)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parsed_place,in_ca,in_alaska
7207,,4.8,mwr,"73km SSW of Masachapa, Nicaragua",1537749595210,"M 4.8 - 73km SSW of Masachapa, Nicaragua",0,USGS API,False,Nicaragua,False,False
4755,,1.09,ml,"28km NNW of Packwood, Washington",1538227540460,"M 1.1 - 28km NNW of Packwood, Washington",0,USGS API,False,Washington,False,False
4595,,1.8,ml,"77km SSW of Kaktovik, Alaska",1538259609862,"M 1.8 - 77km SSW of Kaktovik, Alaska",0,USGS API,False,Alaska,False,True
3566,,1.5,ml,"102km NW of Arctic Village, Alaska",1538464751822,"M 1.5 - 102km NW of Arctic Village, Alaska",0,USGS API,False,Alaska,False,True
2182,,0.9,ml,"26km ENE of Pine Valley, CA",1538801713880,"M 0.9 - 26km ENE of Pine Valley, CA",0,USGS API,False,California,True,False


In [112]:
df.assign(
    in_ca=df.parsed_place == 'California',
    in_alaska=df.parsed_place == 'Alaska',
    neither=lambda x: ~x.in_ca & ~x.in_alaska
).sample(5, random_state=0)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parsed_place,in_ca,in_alaska,neither
7207,,4.8,mwr,"73km SSW of Masachapa, Nicaragua",1537749595210,"M 4.8 - 73km SSW of Masachapa, Nicaragua",0,USGS API,False,Nicaragua,False,False,True
4755,,1.09,ml,"28km NNW of Packwood, Washington",1538227540460,"M 1.1 - 28km NNW of Packwood, Washington",0,USGS API,False,Washington,False,False,True
4595,,1.8,ml,"77km SSW of Kaktovik, Alaska",1538259609862,"M 1.8 - 77km SSW of Kaktovik, Alaska",0,USGS API,False,Alaska,False,True,False
3566,,1.5,ml,"102km NW of Arctic Village, Alaska",1538464751822,"M 1.5 - 102km NW of Arctic Village, Alaska",0,USGS API,False,Alaska,False,True,False
2182,,0.9,ml,"26km ENE of Pine Valley, CA",1538801713880,"M 0.9 - 26km ENE of Pine Valley, CA",0,USGS API,False,California,True,False,False


In [113]:
# Partition the rows on the tsunami flag and compare the sizes of the two parts.
tsunami = df.loc[df.tsunami.eq(1)]
no_tsunami = df.loc[df.tsunami.eq(0)]

tsunami.shape, no_tsunami.shape

((61, 10), (9271, 10))

In [114]:
pd.concat([tsunami, no_tsunami]).shape

(9332, 10)

In [115]:
# DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat() is the supported way to stack the rows of two DataFrames and
# produces the same combined shape.
pd.concat([tsunami, no_tsunami]).shape

(9332, 10)

In [116]:
additional_columns = pd.read_csv(
    'data/earthquakes.csv', usecols=['tz', 'felt', 'ids']
)
pd.concat([df.head(2), additional_columns.head(2)], axis=1)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parsed_place,felt,ids,tz
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API,False,California,,",ci37389218,",-480.0
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API,False,California,,",ci37389202,",-480.0


In [117]:
additional_columns = pd.read_csv(
    'data/earthquakes.csv', usecols=['tz', 'felt', 'ids', 'time'], index_col='time'
)
pd.concat([df.head(2), additional_columns.head(2)], axis=1)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parsed_place,felt,ids,tz
0,,1.35,ml,"9km NE of Aguanga, CA",1539475000000.0,"M 1.4 - 9km NE of Aguanga, CA",0.0,USGS API,False,California,,,
1,,1.29,ml,"9km NE of Aguanga, CA",1539475000000.0,"M 1.3 - 9km NE of Aguanga, CA",0.0,USGS API,False,California,,,
1539475129610,,,,,,,,,,,,",ci37389202,",-480.0
1539475168010,,,,,,,,,,,,",ci37389218,",-480.0


In [118]:
pd.concat(
    [tsunami.head(2), no_tsunami.head(2).assign(type='earthquake')], join='inner'
)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parsed_place
36,,5.0,mww,"165km NNW of Flying Fish Cove, Christmas Island",1539459504090,"M 5.0 - 165km NNW of Flying Fish Cove, Christm...",1,USGS API,False,Christmas Island
118,green,6.7,mww,"262km NW of Ozernovskiy, Russia",1539429023560,"M 6.7 - 262km NW of Ozernovskiy, Russia",1,USGS API,False,Russia
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API,False,California
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API,False,California


In [119]:
pd.concat(
    [tsunami.head(2), no_tsunami.head(2).assign(type='earthquake')], join='inner', ignore_index=True
)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parsed_place
0,,5.0,mww,"165km NNW of Flying Fish Cove, Christmas Island",1539459504090,"M 5.0 - 165km NNW of Flying Fish Cove, Christm...",1,USGS API,False,Christmas Island
1,green,6.7,mww,"262km NW of Ozernovskiy, Russia",1539429023560,"M 6.7 - 262km NW of Ozernovskiy, Russia",1,USGS API,False,Russia
2,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API,False,California
3,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API,False,California


In [120]:
del df['source']
df.columns

Index(['alert', 'mag', 'magType', 'place', 'time', 'title', 'tsunami',
       'mag_negative', 'parsed_place'],
      dtype='object')

In [121]:
try:
    del df['source']
except KeyError:
    # handle the error here
    print('not there anymore')

not there anymore


In [122]:
mag_negative = df.pop('mag_negative')
df.columns

Index(['alert', 'mag', 'magType', 'place', 'time', 'title', 'tsunami',
       'parsed_place'],
      dtype='object')

In [123]:
mag_negative.value_counts()

False    8841
True      491
Name: mag_negative, dtype: int64

In [124]:
df[mag_negative].head()

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,parsed_place
39,,-0.1,ml,"6km NW of Lemmon Valley, Nevada",1539458844506,"M -0.1 - 6km NW of Lemmon Valley, Nevada",0,Nevada
49,,-0.1,ml,"6km NW of Lemmon Valley, Nevada",1539455017464,"M -0.1 - 6km NW of Lemmon Valley, Nevada",0,Nevada
135,,-0.4,ml,"10km SSE of Beatty, Nevada",1539422175717,"M -0.4 - 10km SSE of Beatty, Nevada",0,Nevada
161,,-0.02,md,"20km SSE of Ronan, Montana",1539412475360,"M -0.0 - 20km SSE of Ronan, Montana",0,Montana
198,,-0.2,ml,"60km N of Pahrump, Nevada",1539398340822,"M -0.2 - 60km N of Pahrump, Nevada",0,Nevada


In [125]:
df.drop([0, 1]).head(2)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,parsed_place
2,,3.42,ml,"8km NE of Aguanga, CA",1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,California
3,,0.44,ml,"9km NE of Aguanga, CA",1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,California


In [134]:
# Everything except the five columns we want to keep gets dropped; the list
# preserves the order in which the columns appear in df.
cols_to_drop = [
    name for name in df.columns
    if name not in {'alert', 'mag', 'title', 'time', 'tsunami'}
]
df.drop(columns=cols_to_drop).head()

Unnamed: 0,alert,mag,time,title,tsunami
0,,1.35,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0
1,,1.29,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0
2,,3.42,1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0
3,,0.44,1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0
4,,2.16,1539474716050,"M 2.2 - 10km NW of Avenal, CA",0


In [135]:
df.drop(columns=cols_to_drop).equals(
    df.drop(cols_to_drop, axis=1)
)

True

In [136]:
# inplace=True modifies df itself instead of returning a new DataFrame, so the
# head() call below already shows df without the dropped columns.
df.drop(columns=cols_to_drop, inplace=True)
df.head()

Unnamed: 0,alert,mag,time,title,tsunami
0,,1.35,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0
1,,1.29,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0
2,,3.42,1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0
3,,0.44,1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0
4,,2.16,1539474716050,"M 2.2 - 10km NW of Avenal, CA",0


In [None]:
# The previous cells dealt with adding data to and removing data from DataFrames, including how to handle cases where the data does not line up exactly.