# Importing and inspecting data
v.ekc

In [1]:
import numpy as np
import pandas as pd

## Import data

In [2]:
# read a csv in your working directory
df = pd.read_csv('earthquakes.csv')
df.head(2)

Unnamed: 0,alert,cdi,code,detail,dmin,felt,gap,ids,mag,magType,...,sources,status,time,title,tsunami,type,types,tz,updated,url
0,,,37389218,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.008693,,85.0,",ci37389218,",1.35,ml,...,",ci,",automatic,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475395144,https://earthquake.usgs.gov/earthquakes/eventp...
1,,,37389202,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.02003,,79.0,",ci37389202,",1.29,ml,...,",ci,",automatic,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475253925,https://earthquake.usgs.gov/earthquakes/eventp...


In [3]:
# read a csv online
df = pd.read_csv('https://github.com/bethanyj0/data271_sp24/blob/main/demos/earthquakes.csv?raw=True')
df.head(2)

Unnamed: 0,alert,cdi,code,detail,dmin,felt,gap,ids,mag,magType,...,sources,status,time,title,tsunami,type,types,tz,updated,url
0,,,37389218,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.008693,,85.0,",ci37389218,",1.35,ml,...,",ci,",automatic,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475395144,https://earthquake.usgs.gov/earthquakes/eventp...
1,,,37389202,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.02003,,79.0,",ci37389202,",1.29,ml,...,",ci,",automatic,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475253925,https://earthquake.usgs.gov/earthquakes/eventp...


In [4]:
# read in an excel file
# NOTE: reading .xlsx files needs the optional `openpyxl` package
# (install it with pip or conda); without it this cell raises the ImportError shown below.
df = pd.read_excel('earthquakes.xlsx',sheet_name = 'earthquakes')
df.head(2)

ImportError: `Import openpyxl` failed.  Use pip or conda to install the openpyxl package.

In [5]:
# Read in an excel file online
# NOTE: like the local read, this needs the optional `openpyxl` package
# (install it with pip or conda); without it this cell raises the ImportError shown below.
df = pd.read_excel('https://github.com/bethanyj0/data271_sp24/blob/main/demos/earthquakes.xlsx?raw=True',sheet_name = 'earthquakes')
df.head(2)

ImportError: `Import openpyxl` failed.  Use pip or conda to install the openpyxl package.

In [6]:
# To use one of the columns as your row index
df = pd.read_csv('earthquakes.csv',index_col='code')
df.head(2)

Unnamed: 0_level_0,alert,cdi,detail,dmin,felt,gap,ids,mag,magType,mmi,...,sources,status,time,title,tsunami,type,types,tz,updated,url
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
37389218,,,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.008693,,85.0,",ci37389218,",1.35,ml,,...,",ci,",automatic,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475395144,https://earthquake.usgs.gov/earthquakes/eventp...
37389202,,,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.02003,,79.0,",ci37389202,",1.29,ml,,...,",ci,",automatic,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475253925,https://earthquake.usgs.gov/earthquakes/eventp...


In [7]:
df.index.name

'code'

### Initial Inspection of the Data

In [None]:
# Is the data frame empty? Did import fail?
df.empty

In [None]:
# display the top few rows
df.head(3)

In [None]:
# inspecting the last three rows
df.tail(3)

In [None]:
# info() gives more information, including the number of non-nulls
df.info()

## Pandas Methods

In [None]:
# To reset the index
df.reset_index().head(2)

In [None]:
# Doesn't update the original
df.head(2)

In [None]:
# Reset the index in the original (rebind `df` to the result rather than mutating in place).
# Only reset while 'code' is still the index, so re-running this cell
# does not insert a spurious 'index' column.
if df.index.name == 'code':
    df = df.reset_index()
df.head(2)

In [None]:
# obtain summary statistics for numeric columns
df.describe()

In [None]:
# if we would like to just describe one column, such as mag (magnitude)
df.mag.describe()

In [None]:
# we can look for unique values in a column
df.status.unique()

In [None]:
# Get the number of rows in each category
df.status.value_counts()

In [None]:
# Get the number of non-null values in a column
df.felt.count()

In [None]:
# mean of a column
df.mag.mean()

In [None]:
# median
df.mag.median()

In [None]:
# quantile
df.mag.quantile(0.5)

In [None]:
# sum of a column
df.mag.sum()

In [None]:
# min of a column
df.mag.min()

In [None]:
# max of a column
df.mag.max()

In [None]:
# POSITION of maximum (can also use min)
df.mag.argmax()

In [None]:
# INDEX LABEL of maximum (can also use min)
df.mag.idxmax()

In [None]:
# Sort values in a series
df.mag.sort_values()

In [None]:
# Sort the rows of a dataframe by the values in one column (here: mag)
df.sort_values(by='mag')

In [None]:
# Certain numeric methods won't automatically work on dataframes
#df.max()

In [None]:
# Several columns can be aggregated in one call as long as all of them are numeric
df[['mag', 'gap']].max()

In [None]:
# Get the average of one column based on another column 
df.groupby('status')['mag'].mean()

In [None]:
# Get the average of multiple columns based on another column 
df.groupby('status')[['mag','gap']].mean()

### Selecting subsets

In [None]:
# select all columns with object datatypes
df.select_dtypes(object)

In [None]:
# select all columns with ints
df.select_dtypes(int)

In [None]:
# select all columns with numeric datatypes
df.select_dtypes('number')

### Filtering DataFrames with boolean indexing

In [None]:
# boolean mask: retain only the rows whose magnitude is 7 or above
df.loc[df['mag'] >= 7]

In [None]:
# key columns for earthquakes that either caused a tsunami or had magnitude >= 7
df.loc[
    df['tsunami'].eq(1) | df['mag'].ge(7),
    ['mag', 'title', 'tsunami', 'place'],
].head(3)

In [None]:
# Rows whose place text contains the word 'California'
df.loc[df['place'].str.contains('California'), ['mag', 'title', 'tsunami', 'place']]

In [None]:
# We might have missed some-- the USGS has tagged some locations as California and some as CA.
# \b keeps 'CA' from matching inside longer upper-case words, and na=False treats
# rows with a missing place as non-matches instead of producing NaN in the mask.
cali_df = df.loc[
    df.place.str.contains(r'\bCA\b|California', na=False),
    ['mag', 'title', 'tsunami', 'place']
]
cali_df.head(3)

In [None]:
# if we just want the columns related to magnitude
# Row filter: 'CA' as a whole word or 'California'; na=False makes rows with a
# missing place count as non-matches instead of producing NaN in the mask.
# Column filter: every column whose name contains 'mag'.
df.loc[
    df.place.str.contains(r'\bCA\b|California', na=False),
    [col for col in df.columns if 'mag' in col]
].head(3)

## Activity 
Create a DataFrame with two rows and two columns. The columns should be `mag` and `place`. The first row should contain the information for the smallest earthquake in California (lowest magnitude) and the second row should contain the information for the largest earthquake in California (highest magnitude).

In [None]:
# idxmin()/idxmax() return index labels (not positions), which is what .loc expects
cali_df.loc[
    [cali_df['mag'].idxmin(), cali_df['mag'].idxmax()],
    ['mag', 'place'],
]

How many earthquakes in the dataset had a red alert?

In [None]:
# .get() returns 0 instead of raising KeyError when no row has a red alert
df.alert.value_counts().get('red', 0)

How many Oregon earthquakes are in the dataset?

In [None]:
# \b stops 'OR' from matching inside longer upper-case words; na=False treats
# rows with a missing place as non-matches instead of producing NaN in the mask.
or_df = df.loc[
    df.place.str.contains(r'\bOR\b|Oregon', na=False),
    ['mag', 'title', 'tsunami', 'place']
]
# the number of rows is the number of Oregon earthquakes
len(or_df)