# More Pandas
If you want to type along with me, use [this notebook](https://humboldt.cloudbank.2i2c.cloud/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fbethanyj0%2Fdata271_sp25&branch=main&urlpath=tree%2Fdata271_sp25%2Flectures%2Fdata271_lec14_live.ipynb) instead. 
If you don't want to type and want to follow along just by executing the cells, stay in this notebook.

In [None]:
import numpy as np
import pandas as pd
import warnings
# Suppress every warning category for the rest of the session to keep cell output uncluttered.
# NOTE(review): this also hides pandas deprecation/performance warnings -- confirm that is intended.
warnings.filterwarnings("ignore")

### Import data

In [None]:
# Load the earthquake data from a CSV file in the current working directory
df = pd.read_csv(filepath_or_buffer='earthquakes.csv')
# Preview the first two rows
df.head(n=2)

### Selecting subsets based on datatype

In [None]:
# Keep only the columns stored with the object dtype
df.select_dtypes(include=object)

In [None]:
# Keep only the integer-typed columns
df.select_dtypes(include=int)

In [None]:
# Keep every numeric column by passing the 'number' dtype alias
df.select_dtypes(include='number')

### Filtering rows or columns with boolean indexing

In [None]:
# tempting syntax, but doesn't work: Python evaluates 'mag' >= 7 first, and comparing
# a str with an int raises TypeError before pandas is ever involved
#df['mag' >= 7]

In [None]:
# Boolean indexing: keep only the rows where the mask is True (mag of at least 7)
df.loc[df['mag'] >= 7]

In [None]:
# Combine conditions with & (and) or | (or); each condition needs its own parentheses
# Show a few columns for earthquakes that reached magnitude 7+ OR caused a tsunami
df.loc[
    (df['mag'] >= 7) | (df['tsunami'] == 1),
    ['mag', 'title', 'tsunami', 'place']
].head(n=3)

In [None]:
# Use str.contains to filter strings.
# na=False maps a missing `place` to False so the mask stays strictly boolean;
# a mask that contains NaN cannot be used for boolean indexing with .loc.
df.loc[
    df.place.str.contains('California', na=False),
    ['mag', 'title', 'tsunami', 'place']
].shape

In [None]:
# Also supports regex -- the USGS has tagged some locations as California and some as CA.
# na=False maps a missing `place` to False so the mask stays strictly boolean;
# a mask that contains NaN cannot be used for boolean indexing with .loc.
cali_df = df.loc[
    df.place.str.contains('CA|California', regex=True, na=False),
    ['mag', 'title', 'tsunami', 'place']
]
cali_df.shape

In [None]:
# A list comprehension can build the column selection: here, every column whose name contains 'mag'.
# na=False on the row mask maps a missing `place` to False so the mask stays strictly boolean.
df.loc[
    df.place.str.contains('CA|California', regex=True, na=False),
    [col for col in df.columns if 'mag' in col]
].head(3)

In [None]:
# Another way: select the columns with a boolean mask built from the column names.
# na=False on the row mask maps a missing `place` to False so the mask stays strictly boolean.
df.loc[
    df.place.str.contains('CA|California', regex=True, na=False),
    df.columns.str.contains('.*mag.*',regex=True)
]

In [None]:
# Recall .between: by default both endpoints (6.5 and 7.5) are included
df.loc[
    df['mag'].between(left=6.5, right=7.5),
    ['mag', 'magType', 'title', 'tsunami', 'type']
]

In [None]:
# Recall isin: keep the rows whose magType is one of the listed values
df.loc[
    df['magType'].isin(values=['mw', 'mwb']),
    ['mag', 'magType', 'title', 'tsunami', 'type']
]

### Additional methods to filter columns

In [None]:
# (rows, columns) of the full DataFrame, as a baseline for the drop examples
df.shape

In [None]:
# Drop a single column and check the shape of the returned frame
df.drop(columns=['ids']).shape

In [None]:
# Drop several columns at once by passing a list of names
df.drop(columns = ['ids', 'detail']).shape

In [None]:
# Keep only the columns whose name matches the regex (here: any name containing 'mag')
df.filter(regex='.*mag.*')

In [None]:
# drop() and filter() returned new objects above, so the original is unchanged
# (assign the result back, or pass inplace=True to drop, if you want to keep the change)
df.shape

### Broadcasting and Elementwise Arithmetic

In [None]:
# A scalar is broadcast to every element of a Pandas Series
df['mag'] * 3

In [None]:
# A scalar is broadcast to every cell of a Pandas DataFrame as well
df.loc[:, ['mag', 'gap']] * 3

In [None]:
# Two small frames for exploring elementwise arithmetic:
# they share columns B and C, and row labels 1 and 2
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [3, 4, 5],
    'C': [5, 6, 7],
})
df2 = pd.DataFrame(
    {
        'B': [5, 6, 7],
        'C': [7, 8, 9],
        'D': [9, 10, 11],
    },
    index=[1, 2, 3],
)

In [None]:
# df1 has columns A, B, C and the default row labels 0, 1, 2
df1

In [None]:
# df2 has columns B, C, D and row labels 1, 2, 3
df2

In [None]:
# Elementwise arithmetic aligns on row and column labels, not positions;
# any label present in only one of the operands yields NaN
df1 + df2

In [None]:
# A Series is treated like a row: its index is matched against the column labels
srs = pd.Series(data=[1, 2, 3], index=['A', 'B', 'C'])
df1 + srs

In [None]:
# With non-matching labels nothing lines up: the default Series index (0, 1, 2) shares
# no label with the columns (A, B, C), so the result is filled with NaN instead of sums
srs = pd.Series(data=[1, 2, 3])
df1 + srs

In [None]:
# The arithmetic methods (such as add) take an axis argument; axis='index' matches the
# Series labels against the row labels instead of the column labels
df1.add(srs, axis='index')

## MultiIndex DataFrames

In [None]:
# Original DataFrame: the athlete events read with the default integer row index
olympics = pd.read_csv(filepath_or_buffer='athlete_events.csv')
olympics.head(n=5)

In [None]:
# Without a MultiIndex, selecting the United States entries for 2012 needs a mask on both columns
olympics.loc[(olympics['Team'] == 'United States') & (olympics['Year'] == 2012)]

In [None]:
# Build a two-level (Team, Year) row index while reading the file
# NOTE(review): rows keep file order, so the MultiIndex is presumably unsorted -- consider
# .sort_index() if label lookups get slow; confirm against the data
multi_olympics = pd.read_csv(
    filepath_or_buffer='athlete_events.csv',
    index_col=['Team', 'Year'],
)
multi_olympics.head(n=5)

In [None]:
# Indexing with MultiIndex dataframes: give one label per index level (Team, then Year)
multi_olympics.loc['United States',2012]

In [None]:
# To just use the first level index (Team), pass a single label
multi_olympics.loc['United States']

In [None]:
# To use the second level index (Year): slice(None) keeps every Team, 2012 selects the Year
multi_olympics.loc[(slice(None), 2012),:]

In [None]:
# Alternative for the second level: xs takes a cross-section on the named level
multi_olympics.xs(2012, level='Year')

In [None]:
# Swapping the levels puts Year first, so a plain .loc label lookup works on it
multi_olympics.swaplevel(i="Team", j="Year").loc[2012]

## Plotting with Pandas

In [None]:
# Histogram of the mag column
df.plot(y='mag', kind='hist');

In [None]:
# Line plot of mag against time
# NOTE(review): `time` is plotted exactly as stored; if it is a raw integer timestamp,
# converting it with pd.to_datetime first would give a readable axis -- confirm
df.plot(x='time', y='mag', kind='line');

In [None]:
# Scatter plot of mag against gap
df.plot(x='gap', y='mag', kind='scatter');

In [None]:
# Bar chart of how many rows fall under each status value
df.value_counts(subset='status').plot(kind='bar');

In [None]:
# another notation: df.plot.<kind>(...) is equivalent to df.plot(kind='<kind>', ...)
df.plot.hist(y='mag');

## Activity 
Create a DataFrame with two rows and two columns. The columns should be `mag` and `place`. The first row should contain the information for the smallest earthquake in California (lowest magnitude) and the second row should contain the information for the largest earthquake in California (highest magnitude).

In [None]:
# This allows us to index with loc


How many earthquakes in the dataset had a red alert?

How many Oregon earthquakes are in the dataset?