# Effective Pandas

* Install
* Jupyter
* Pandas Overview
  * Load
  * Inspect
  * Tweak
  * Stats
  * Plotting
  * Filtering
  * Missing Data
  * Grouping
  * Joining
* Summary

## Install Pandas

I recommend using conda, or installing Python from python.org and working in a virtual environment.

### Conda

* Install Anaconda (for Python 3) from anaconda.com
* Launch Anaconda Prompt (or terminal) and create an environment:
      conda create --name pandasclass python=3.7
* Activate the environment:
      conda activate pandasclass
* Install libraries:
      conda install notebook pandas seaborn xlrd openpyxl scipy
* Launch Jupyter:
      jupyter notebook
      
### Python.org 

* Install Python 3
* Launch a terminal or command prompt and create a virtual environment:
      python3 -m venv env
* Activate virtual environment 
  * Windows:
        env\Scripts\activate
  * Unix (Mac/Linux):
        source env/bin/activate
* Install libraries:
      pip install notebook pandas seaborn xlrd openpyxl scipy
* Launch Jupyter:
      jupyter notebook

## Jupyter

Two modes

### Command Mode

* A - Above
* B - Below
* CTL-Enter - Run
* C,X,V - Copy, cut, paste
* II - Interrupt Kernel
* 00 - Restart Kernel

### Edit Mode

* TAB - Completion
* Shift-TAB - Documentation (hit 4x to popup)
* ESC - Back to command mode w/o running
* CTL-Enter - Run

### Hints

* Add ? to functions and methods to see docs
* Add ?? to functions and methods to see source
* Add line magic to make matplotlib plots show up:
      %matplotlib inline

## Quick Example

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np

In [None]:
url = 'https://github.com/mattharrison/datasets/raw/master/data/Presidents.xls'
pres = pd.read_excel(url)

In [None]:
pres

In [None]:
# Inspect data
pres.columns

In [None]:
pres.sample(10)

In [None]:
pres.dtypes

In [None]:
pres['% popular'].value_counts(dropna=False)

In [None]:
pres['% popular'].replace('NA()', '', regex=False).unique()

In [None]:
# Tweak
def clean_cols(val):
    """Return the column label ``val`` with awkward characters substituted.

    Spaces become ``_``, ``#`` becomes ``Num`` and ``%`` becomes ``Per``.
    """
    substitutions = str.maketrans({' ': '_', '#': 'Num', '%': 'Per'})
    return val.translate(substitutions)

def clean_pop(df):
    """Return the ``Per_popular`` column of ``df`` as floats with gaps filled.

    The literal placeholder ``'NA()'`` is treated as missing; every missing
    entry is then replaced by the mean of the remaining values.
    """
    raw = df['Per_popular']
    numeric = raw.replace('NA()', np.nan).astype(float)
    average = numeric.mean()
    return numeric.fillna(average)

# Build the cleaned frame as one chain; rename/assign return new frames,
# so the raw ``pres`` frame is not modified.
pres2 = (pres
  # run every column label through clean_cols
  .rename(columns=clean_cols)
  # date: the inauguration year parsed as a four-digit year ('%Y');
  # Per_popular: replaced by the result of clean_pop on the renamed frame
  .assign(date=lambda df_:pd.to_datetime(df_.Year_first_inaugurated, format='%Y'),
         Per_popular=clean_pop)
  # Disabled step. NOTE(review): presumably off because Years_in_office has
  # missing values (see the isna() check in the filtering cells) -- confirm.
  #.assign(Years_in_office = lambda df_:df_.Years_in_office.astype(int))
)

In [None]:
# Stats
pres2.describe()

In [None]:
pres2.corr()

In [None]:
pres2.corr().style.background_gradient(cmap='RdBu', vmin=-1, vmax=1)

In [None]:
(pres2
 .Age_at_inauguration
 .plot.bar()
)

In [None]:
(pres2
 .set_index('President')
 .Age_at_inauguration
 .plot.barh(figsize=(6,6))
)

In [None]:
(pres2
 .set_index('President')
 .Age_at_inauguration
 .plot.barh(figsize=(6,6))
)

In [None]:
(pres2
 .Political_Party
 .value_counts()
 .plot.pie(figsize=(6,4))
)

In [None]:
(pres2
 .Political_Party
 .value_counts()
 .plot.pie(figsize=(6,4))
)

In [None]:
(pres2
 .Political_Party
 .value_counts()
 .plot.barh(figsize=(6,4))
)

In [None]:
(pres2
 .Occupation
 .value_counts()
 .plot.barh(figsize=(6,4))
)

In [None]:
(pres2
 .College
 .value_counts()
 .plot.barh(figsize=(6,4))
)

In [None]:
# Filtering
pres2[pres2.Years_in_office < 4]

In [None]:
pres2.Years_in_office.value_counts(dropna=False)

In [None]:
pres2[pres2.Years_in_office.isna()]

In [None]:
(pres2
.groupby('Political_Party')
.mean())

## Loading Data

In [None]:
url = 'https://github.com/mattharrison/datasets/raw/master/data/alta-noaa-1980-2019.csv'
alta = pd.read_csv(url)

In [None]:
alta

In [None]:
pd.re

In [None]:
# save as Excel
alta.to_excel('alta.xlsx')

In [None]:
pd.read_excel?

In [None]:
alta2 = pd.read_excel('alta.xlsx', index_col=0)

In [None]:
alta2

In [None]:
# save as SQL
import sqlite3
con = sqlite3.connect('alta.db')
con.execute('DROP TABLE IF EXISTS alta')
alta.to_sql('alta', con)

In [None]:
alta2 = pd.read_sql('SELECT * from alta', con)

In [None]:
alta2

## Inspecting Data

In [None]:
alta.columns

In [None]:
alta.SNOW.

In [None]:
alta["SNOW"]

In [None]:
alta.dtypes

In [None]:
alta.info()

In [None]:
# property vs method
alta.info

In [None]:
# Intentionally wrong: dtypes is a property (used without parentheses in an
# earlier cell), so calling its result raises a TypeError.
alta.dtypes()

In [None]:
alta.head()

In [None]:
alta.head().T

In [None]:
alta.sample(5)

In [None]:
alta.shape

In [None]:
alta.index

In [None]:
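# doesn't show :( -- Jupyter only auto-displays a cell's last top-level expression,
# not an expression nested inside a `with` block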
with pd.option_context('display.min_rows', 3):
    alta

In [None]:
from IPython.display import display
with pd.option_context('display.min_rows', 3):
    display(alta)

In [None]:
from IPython.display import display
with pd.option_context('display.max_columns', 40):
    display(alta.T)

## Tweaking Data

In [None]:
alta.columns

In [None]:
# generally rename columns so I get attribute access (gives me completion in Jupyter)
# can use a dictionary or a function
def clean_col(name):
    """Return the column label ``name`` converted to lowercase."""
    lowered = name.lower()
    return lowered

(alta
 .rename(columns=clean_col)
)

In [None]:
alta.dtypes

In [None]:
pd.to_datetime(alta.DATE)

In [None]:
alta.NAME.value_counts()

In [None]:
# many string utilities
alta.DATE.str.

In [None]:
# string manipulation using .str
alta.DATE.str.slice(0,4)

In [None]:
# string manipulation then convert to number
alta.DATE.str.slice(0,4).astype(int)

In [None]:
# in this case can also convert to date and pull off year attribute from .dt
pd.to_datetime(alta.DATE).dt.year

In [None]:
def to_celsius(val):
    """Convert ``val`` from degrees Fahrenheit to degrees Celsius."""
    offset = val - 32
    return offset * 5 / 9

alta.TOBS.apply(to_celsius)

In [None]:
%%timeit
alta.TOBS.apply(to_celsius)

In [None]:
%%timeit
(alta.TOBS - 32) * 5/9

In [None]:
def clean_col(name):
    """Lowercase a single column label."""
    label = name.lower()
    return label


def tweak_alta(df):
    """Return a tidied version of the raw Alta weather frame ``df``.

    Column labels are lowercased, ``date`` is parsed to datetimes, and a
    Celsius version of ``tobs`` is added as ``tobs_c``.  Only the columns
    listed in ``wanted`` are returned, so the ``year`` column computed along
    the way is not part of the result.
    """
    wanted = ['date', 'prcp', 'snow', 'snwd', 'tmax', 'tmin', 'tobs', 'tobs_c']
    lowered = df.rename(columns=clean_col)
    dated = lowered.assign(date=pd.to_datetime(lowered.date))
    derived = dated.assign(year=dated.date.dt.year,
                           tobs_c=(dated.tobs - 32) * 5/9)
    return derived[wanted]
alta2 = tweak_alta(alta)

In [None]:
alta2

## Stats

In [None]:
alta2.describe()

In [None]:
# various aggregation functions (min, max, mean, median, kurt, skew)
alta2.mean()

In [None]:
alta2.snow.quantile(.7)

In [None]:
alta2.snow.quantile([.1,.9])

In [None]:
alta2.agg(['kurt', 'skew', 'mean', 'median'])

In [None]:
alta2.corr()

In [None]:
alta2.corr().style.background_gradient(cmap='RdBu', vmin=-1, vmax=1)

In [None]:
alta2.prcp.corr(alta2.snow)

## Plotting

In [None]:
%matplotlib inline

In [None]:
alta2.plot()

In [None]:
alta2.dtypes

In [None]:
(alta2
.drop(columns='date')
.plot())

In [None]:
# plots index in x, so why not put the date in the index?
(alta2
.set_index('date')
.plot())

In [None]:
# plots index in x, so why not put the date in the index?
(alta2
.set_index('date')
.plot())

In [None]:
# plots index in x, so why not put the date in the index?
(alta2
.set_index('date')
.filter(regex=r'^t.*', axis='columns')
.plot())

In [None]:
(alta2
.set_index('date')
.filter(regex=r'^t.*', axis='columns')
.plot(figsize=(10,4)))

In [None]:
(alta2
.set_index('date')
.filter(regex=r'^t.*', axis='columns')
.resample('W')
.mean()
.plot(figsize=(10,4))
)

In [None]:
# use the docs to get links like this
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
alta2.resample?

In [None]:
_ = (alta2
.set_index('date')
.filter(regex=r'^t.*', axis='columns')
.resample('M')
.mean()
.iloc[-100:]
.plot(figsize=(10,4))
)

In [None]:
import matplotlib.font_manager as fm
fm.fontManager.ttflist
# append to this list if you want to add a font
#fm.fontManager.ttflist += fm.createFontList(['thsarabunnew-webfont.ttf'])

In [None]:
import matplotlib.pyplot as plt
font = 'Sinkin Sans'
with plt.style.context('fivethirtyeight'):    
    with plt.style.context({'font.family':font}):
        _ = (alta2
        .set_index('date')
        .filter(regex=r'^t.*', axis='columns')
        .resample('M')
        .mean()
        .iloc[-100:]
        .plot(figsize=(10,4), title='Temperatures at Alta')
        .legend(bbox_to_anchor=(1,1))
        )

In [None]:
# bar plot - avg per month
(alta2
 .groupby(alta2.date.dt.month_name())
 .snow
 .mean()
 .plot.barh()
)

In [None]:
# bar plot - avg per month
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
          'August', 'September', 'October', 'November', 'December']
MonthType = pd.CategoricalDtype(categories=months, ordered=True)
_ = (alta2
 .assign(month=alta2.date.dt.month_name().astype(MonthType))
 .groupby('month')
 .snow
 .mean()
 .plot.barh()
)

In [None]:
# add style, title, clean ylabel
with plt.style.context('fivethirtyeight'):
    with plt.style.context({'font.family':font}):
        ax = (alta2
         .assign(month=alta2.date.dt.month_name().astype(MonthType))
         .groupby('month')
         .snow
         .mean()
         .plot.barh(title="Average Daily Snowfall(in) at Alta")
        )
        ax.set_ylabel('')

In [None]:
# pie chart

months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
          'August', 'September', 'October', 'November', 'December']
MonthType = pd.CategoricalDtype(categories=months, ordered=True)
_ = (alta2
 .assign(month=alta2.date.dt.month_name().astype(MonthType))
 .groupby('month')
 .snow
 .mean()
 .plot.pie()
)

In [None]:
# scatter plot
(alta2
  .plot.scatter(x='tobs', y='snwd'))

In [None]:
(alta2[alta2.tobs>100])

In [None]:
# scatter plot
(alta2
  .sample(2_000)
  .plot.scatter(x='tobs', y='snwd', alpha=.2))

In [None]:
# scatter plot
(alta2
  .sample(2_000)
  .plot.scatter(x='tobs', y='snow', alpha=.2))

In [None]:
alta2.query('snow > 0').tobs.max()

In [None]:
# scatter plot
with plt.style.context('fivethirtyeight'):
    (alta2
      .sample(2_000)
      .plot.scatter(x='tobs', y='snow', alpha=.2,
                   title='Snowfall vs Temp (F)'))

In [None]:
(alta2
  .query('snow > 0')
  .tobs
  .plot.hist())

In [None]:
with plt.style.context('fivethirtyeight'):
    with plt.style.context({'font.family':font}):
        (alta2
          .query('snow > 0')
          .tobs
          .plot.hist(title='Temp (F) during Snowfall'))

## Filtering

In [None]:
alta2[alta2.tobs > 100]

In [None]:
alta2.query('tobs > 100')

In [None]:
months = ['December', 'January']
alta2.query('date.dt.month_name().isin(@months)').date.dt.month_name().value_counts()

In [None]:
months = ['December', 'January']
alta2.query('date.dt.month_name().isin(@months) and snow > 10').describe()

In [None]:
jan_dec = alta2.date.dt.month_name().isin(months)
gt_10 = alta2.snow > 10
# Intentionally wrong: Python's `and` needs a single True/False, which a
# boolean Series cannot provide, so pandas raises ValueError. Use `&` instead.
alta2[jan_dec and gt_10]

In [None]:
jan_dec = alta2.date.dt.month_name().isin(months)
gt_10 = alta2.snow > 10
alta2[jan_dec & gt_10]

In [None]:
# Intentionally wrong: `&` binds tighter than `>`, so this is evaluated as
# (isin(...) & alta2.snow) > 10. Parenthesize the comparison to fix it.
alta2[alta2.date.dt.month_name().isin(months) & alta2.snow > 10]

In [None]:
alta2[alta2.date.dt.month_name().isin(months) & (alta2.snow > 10)]

In [None]:
alta2.date.dt.month_name().isin(months) & alta2.snow

In [None]:
alta2.date.dt.month_name().isin(months) & alta2.snow > 10

In [None]:
# say I want to plot snow against date
# pull out a single column
(alta2
  .set_index('date')
  .snow)

In [None]:
# say I want to plot snow against date
# pull out a single column
(alta2
  .set_index('date')
  ['snow'])

In [None]:
# say I want to plot snow against date
# pull out a single column - Not a series
(alta2
  .set_index('date')
  [['snow']])

In [None]:
# say I want to plot snow against date
# pull out a single column
(alta2
  .set_index('date')
  .loc[:, 'snow'])

In [None]:
# say I want to plot snow against date
# pull out a single column
(alta2
  .set_index('date')
  .iloc[:, 2])

### LOC Indexing

In [None]:
# by label
# dates support "Partial String Indexing"
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#partial-string-indexing
# This will go to end of 2016. Use datetime objects to specify exact match (or longer string)
(alta2
  .set_index('date')
  .loc['2015':'2016', 'snow'])

In [None]:
# by label
# dates support "Partial String Indexing"
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#partial-string-indexing
# This will go to end of 2016. Use datetime objects to specify exact match (or longer string)
(alta2
  .set_index('date')
  .loc['2015':'2016', 'snow':'snwd'])

In [None]:
# by label
# with lists & partial strings
(alta2
  .set_index('date')
  .loc[['2015','2016'], ['snow','snwd']])

In [None]:
# by label
# with lists
(alta2
  .set_index('date')
  .loc[['2015-11-01','2016-07-04'], ['snow','snwd']])

In [None]:
# by label
# with lists
import datetime
(alta2
  .set_index('date')
  .loc[[datetime.datetime(2015,11,1), datetime.datetime(2016,7,1)],
       ['snow','snwd']])

In [None]:
# by label
# select all rows
(alta2
  .set_index('date')
  .loc[:, ['snow','snwd']])

In [None]:
# by label
# use boolean array
gt10 = (alta2
       .set_index('date')
       .snow > 10)
(alta2
  .set_index('date')
  .loc[gt10, ['snow','snwd']])

In [None]:
alta2.loc?

In [None]:
# by function
# use boolean array
(alta2
  .set_index('date')
  .loc[lambda df_:df_.snow > 10, ['snow','snwd']])

In [None]:
# by function
# use boolean array - useful when intermediate dataframe/series has changed
(alta2
  .set_index('date')
  .rename(columns=str.upper)
  .loc[lambda df_:df_.SNOW > 10])

### ILOC Indexing

In [None]:
# iloc is by position (all rows, third column)
(alta2
  .set_index('date')
  .iloc[:, 2])

In [None]:
# iloc is by position (rows 12,500 up to but not including 13,500, third column)
(alta2
  .set_index('date')
  .iloc[12_500:13_500:, 2])

In [None]:
# useful for pulling off first/last 
(alta2
  .iloc[-200:])

In [None]:
# note this follows "half-open interval" includes start index but not end
(alta2
  .iloc[10:20, 2:6])

In [None]:
# by contrast loc is the closed interval (includes 20 and tmin)
(alta2
  .loc[10:20, 'snow':'tmin'])

In [None]:
# doesn't work with boolean arrays
gt10 = alta2.snow > 10
(alta2
 .iloc[gt10])

In [None]:
# however does work with numpy?!
gt10 = alta2.snow > 10
(alta2
 .iloc[gt10.to_numpy()])

## Missing Data

In [None]:
alta2.isna()

In [None]:
# columns with missing data (apply any to axis 0)
alta2.isna().any()

In [None]:
# columns with missing data
alta2.isna().sum()

In [None]:
# columns with missing data
alta2.isna().mul(100).mean()

In [None]:
# where any part of row is missing
alta2.isna().any(axis=1)

In [None]:
# where any part of row is missing
alta2[alta2.isna().any(axis=1)]

In [None]:
# see where snow depth is missing
alta2.query('snwd.isna()')

In [None]:
(alta2
 .set_index('date')
 .loc['1980-04':'1980-05']
 .snwd
 .plot()
)

In [None]:
# dealing with missing data
# Talk to an SME!
# - Drop
# - Impute
# - (Add indicator column)
(alta2
 .set_index('date')
 .loc['1980-04':'1980-05']
 .snwd
 .dropna()
 .plot()
)

In [None]:
alta2.snwd.fillna?

In [None]:
# dealing with missing data
# Talk to an SME!
# - Drop
# - Impute
# - (Add indicator column)
(alta2
 .set_index('date')
 .loc['1980-04':'1980-05']
 .snwd
 .fillna(alta2.snwd.mean())
 .plot()
)

In [None]:
# dealing with missing data
# Talk to an SME!
# - Drop
# - Impute
# - (Add indicator column)
(alta2
 .set_index('date')
 .loc['1980-04':'1980-05']
 .snwd
 .interpolate()
 .plot()
)

In [None]:
# dealing with missing data
# Talk to an SME!
# - Drop
# - Impute
# - (Add indicator column)
(alta2
 .set_index('date')
 .loc['1980-04':'1980-05']
 .snwd
 .ffill()
 .plot()
)

In [None]:
# dealing with missing data
# Talk to an SME!
# - Drop
# - Impute
# - (Add indicator column)
(alta2
 .set_index('date')
 .loc['1980-04':'1980-05']
 .snwd
 .bfill()
 .plot()
)

In [None]:
# Add indicator column
(alta2
.assign(snwd_missing=alta2.snwd.isna(),
        snwd=lambda df_:df_.snwd.interpolate())
)

In [None]:
# dealing with missing data
# Talk to an SME!
# - Drop
# - Impute
# - (Add indicator column)
(alta2
 .set_index('date')
 .loc[:'1984-08']
 .snwd
 .interpolate()
 .plot(figsize=(10,4))
)

In [None]:
# dealing with missing data
# Talk to an SME!
# - Drop
# - Impute
# - (Add indicator column)
(alta2
 .set_index('date')
 .loc['1984-08':'1990-08']
 .snwd
 .interpolate()
 .plot(figsize=(10,4))
)

In [None]:
# dealing with missing data
# Talk to an SME!
# - Drop
# - Impute
# - (Add indicator column)
(alta2
 .set_index('date')
 .loc['1984-08':'1985-08']
 .snwd
 #.bfill()
 .plot(figsize=(10,4))
)

## Grouping

In [None]:
# sum of the snowfall by year
(alta2
  .groupby(alta2.date.dt.year)
  .snow
  .sum()
)

In [None]:
# sum of the snowfall by year
(alta2
  .groupby(alta2.date.dt.year)
  .snow
  .sum()
  .plot()
)

In [None]:
# maximum snow depth by year
(alta2
  .groupby(alta2.date.dt.year)
  .snwd
  .max()
)

In [None]:
# maximum snow depth by year
(alta2
  .groupby(alta2.date.dt.year)
  .snwd
  .max()
  .plot()
)

In [None]:
# maximum snow depth by month
(alta2
  .groupby(alta2.date.dt.month)
  .snwd
  .max()
  .plot()
)

In [None]:
# maximum snow depth by month of each year
(alta2
  .groupby(pd.Grouper(key='date', freq='m'))
  .snwd
  .max()
  .plot()
)

In [None]:
pd.tseries.offsets.ccalendar.MONTH_ALIASES

In [None]:
pd.offsets.YearOffset(month=9)

In [None]:
# total snowfall by ski season (years ending in June)
(alta2
  .groupby(pd.Grouper(key='date', freq='A-JUN'))
  .snow
  .sum()
  .plot()
)

In [None]:
# groupby by multiple columns
(alta2
 .groupby([alta2.date.dt.year, alta2.date.dt.month])
 .snwd
 .mean()
)

In [None]:
# groupby by multiple columns
(alta2
 .groupby([alta2.date.dt.year, alta2.date.dt.month])
 .snwd
 .mean()
 .unstack()
)

In [None]:
# groupby by multiple columns
(alta2
 .groupby([alta2.date.dt.year, alta2.date.dt.month])
 .snwd
 .mean()
 .unstack()
 .plot()
)

In [None]:
# groupby by multiple columns
with plt.style.context('fivethirtyeight'):
    with plt.style.context({'font.family':font}):
        (alta2
         .groupby([alta2.date.dt.year.rename('year'), alta2.date.dt.month.rename('month')])
         .snwd
         .mean()
         .unstack()
         .rename(columns=dict(enumerate(
             'Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec'.split(','), 1)))
         .interpolate()
         .plot(cmap='jet', figsize=(12,4), title='Average Monthly Snowdepth (in)')
         .legend(bbox_to_anchor=(1,1))
        )

In [None]:
# groupby by multiple columns
(alta2
 .groupby([alta2.date.dt.year.rename('year'), alta2.date.dt.month.rename('month')])
 .snwd
 .mean()
 .unstack()
 .rename(columns=dict(enumerate(
     'Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec'.split(','), 1)))
 .interpolate()
 .plot.bar(cmap='jet', figsize=(12,4))
 .legend(bbox_to_anchor=(1,1))
)

In [None]:
# can have multiple columns
(alta2
 .groupby([alta2.date.dt.year.rename('year'), alta2.date.dt.month.rename('month')])
 [['snwd', 'snow']]
 .mean()
)

In [None]:
# can have multiple columns and multiple aggregations
def first(s):
    """Return the value at position 0 of ``s``."""
    leading = s.iloc[0]
    return leading
(alta2
 .groupby([alta2.date.dt.year.rename('year'), alta2.date.dt.month.rename('month')])
 [['snwd', 'snow']]
 .agg([first, 'median', 'std', 'max'])
)

In [None]:
# or per column aggregations
def first(s):
    """Return the positionally first element of ``s`` (via ``.iloc``)."""
    head_value = s.iloc[0]
    return head_value
(alta2
 .groupby([alta2.date.dt.year.rename('year'), alta2.date.dt.month.rename('month')])
 .agg({'snow':['mean'], 'snwd':['max', 'min'], 'tobs':[first]})
)

In [None]:
# flatten columns
def first(s):
    """Pick the element at position 0 of ``s``, regardless of its label."""
    initial = s.iloc[0]
    return initial

def to_flat_cols(df_):
    """Return ``df_`` with multi-level column labels joined into single strings.

    Each tuple label such as ``('snwd', 'max')`` becomes ``'snwd_max'``.
    Labels that are not tuples (already-flat columns) are kept as they are,
    and non-string level values are converted with ``str`` before joining.
    The input frame is not modified; a relabelled shallow copy is returned.
    """
    cols = ['_'.join(map(str, cs)) if isinstance(cs, tuple) else cs
            for cs in df_.columns.to_flat_index()]
    # Relabel a shallow copy so the caller's frame keeps its original columns.
    flat = df_.copy(deep=False)
    flat.columns = cols
    return flat

(alta2
 .groupby([alta2.date.dt.year.rename('year'), alta2.date.dt.month.rename('month')])
 .agg({'snow':['mean'], 'snwd':['max', 'min'], 'tobs':[first]})
 .pipe(to_flat_cols)
)

In [None]:
# flatten index
def first(s):
    """Return whatever sits at integer position 0 of ``s``."""
    value = s.iloc[0]
    return value

def to_flat_cols(df_):
    """Return ``df_`` with multi-level column labels joined into single strings.

    Each tuple label such as ``('snwd', 'max')`` becomes ``'snwd_max'``.
    Labels that are not tuples (already-flat columns) are kept as they are,
    and non-string level values are converted with ``str`` before joining.
    The input frame is not modified; a relabelled shallow copy is returned.
    """
    cols = ['_'.join(map(str, cs)) if isinstance(cs, tuple) else cs
            for cs in df_.columns.to_flat_index()]
    # Relabel a shallow copy so the caller's frame keeps its original columns.
    flat = df_.copy(deep=False)
    flat.columns = cols
    return flat

(alta2
 .groupby([alta2.date.dt.year.rename('year'), alta2.date.dt.month.rename('month')])
 .agg({'snow':['mean'], 'snwd':['max', 'min'], 'tobs':[first]})
 .pipe(to_flat_cols)
 .reset_index()
)

## Joining Data

In [None]:
# going to join with particulate matter data for 2004
# http://www.airmonitoring.utah.gov/dataarchive/archpm25.htm
# and see if there is a correlation
url = 'http://www.airmonitoring.utah.gov/dataarchive/2004-PM2.5.csv'
part_df = pd.read_csv(url)

In [None]:
part_df

In [None]:
part_df.HW.value_counts()

In [None]:
part_df.HW.str.extract(r'([^0-9\-\.])', expand=False).value_counts()

In [None]:
part_df[~part_df.HW.str.extract(r'([^0-9\-\.])', expand=False).isna()]#.value_counts()

In [None]:
# going to join with particulate matter data for 2004
# http://www.airmonitoring.utah.gov/dataarchive/archpm25.htm
# and see if there is a correlation
url = 'http://www.airmonitoring.utah.gov/dataarchive/2004-PM2.5.csv'
part_df = pd.read_csv(url, skiprows=2, parse_dates=[0])
part_df

In [None]:
part_df.dtypes

In [None]:
alta2

In [None]:
part_df2 = part_df.groupby(pd.Grouper(key='Date', freq='D')).mean().reset_index()
part_df2

In [None]:
part_df2.set_index('Date').plot()

In [None]:
# Pitfall: join() lines rows up by index (plain row numbers in both frames),
# not by the date columns -- the merge() cells below match on the dates instead.
alta2.join(part_df2)

In [None]:
(alta2
 .join(part_df2)
 .set_index('date')
 .loc[lambda df_:df_.index.year==2004]
 .isna().mul(100).mean()
)

In [None]:
(alta2
 .merge(part_df2, left_on='date', right_on='Date')
 .set_index('date')
 .loc[lambda df_:df_.index.year==2004]
 .isna().mul(100).mean()
)

In [None]:
(alta2
 .merge(part_df2, left_on='date', right_on='Date')
 .corr()
 .style.background_gradient(cmap='RdBu', vmin=-1, vmax=1)
)

In [None]:
(alta2
 .merge(part_df2, left_on='date', right_on='Date')
 .plot.scatter(x='UG/M3', y='snwd', alpha=.2)
)

In [None]:
(alta2
 .merge(part_df2, left_on='date', right_on='Date')
 .set_index('date')
 [['snwd', 'UG/M3']]
 .plot()
)

In [None]:
# join types demo
from IPython.display import display
df1 = pd.DataFrame({'name': ['Fred', 'Suzy', 'Suzy', 'Bob'],
                   'pet':['Dog', 'Dog', 'Cat', 'Fish']})

df2 = pd.DataFrame({'Name': ['Suzy', 'Suzy', 'Suzy', 'Fred', 'Joe', 'Joe'],
                   'Color': ['Black', 'Blue', 'Red', 'Green', 'Yellow', 'Blue']})
display(df1)
display(df2)

In [None]:
# Fails: df1 has 'name' while df2 has 'Name', so there is no shared column
# for merge() to use as the key.
df1.merge(df2)

In [None]:
# default "how" is inner
df1.merge(df2.assign(name=df2.Name))

In [None]:
# left join: keep every row of df1, even names with no match in df2
df1.merge(df2.assign(name=df2.Name), how='left')

In [None]:
# right join: keep every row of df2, even names with no match in df1
df1.merge(df2.assign(name=df2.Name), how='right')

In [None]:
# outer join: keep the names from both frames
df1.merge(df2.assign(name=df2.Name), how='outer')

In [None]:
# validate='1:1' checks that the merge key is unique on both sides;
# it raises MergeError here because 'Suzy' is repeated
df1.merge(df2.assign(name=df2.Name), how='left', validate='1:1')

In [None]:
(df1
 .drop_duplicates(subset='name')
)

In [None]:
# drop duplicate names on both sides first so that validate='1:1' passes
(df1
 .drop_duplicates(subset='name')
 .merge(df2
        .drop_duplicates(subset='Name')
        .assign(name=df2.Name), how='left', validate='1:1')
)