# Seaborn Intro

In [None]:
import re

import pandas as pd
import seaborn as sns
import seaborn.objects as so

In [None]:
!pip install -U seaborn

In [None]:
sns.__version__

## Getting the Data

In [None]:
# Read the raw vehicles data straight from its URL (a zipped CSV hosted on GitHub).
df = pd.read_csv('https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip')
# Trailing bare expression: shown as the cell's output.
df

In [None]:
def to_camelcase(val):
    """Convert a camelCase / PascalCase name to lower-case snake_case.

    NOTE: despite the function's name, the result is snake_case: every
    upper-case ASCII letter gets an underscore in front of it, the whole
    string is lower-cased, and any leading underscores are stripped
    (e.g. ``'fuelCost08'`` -> ``'fuel_cost08'``).
    """
    # Step 1: mark each capital letter with a preceding underscore.
    underscored = re.sub(r'([A-Z])', r'_\1', val)
    # Step 2: normalise case, then drop the underscore(s) left at the front.
    lowered = underscored.lower()
    return lowered.lstrip('_')

def tweak_autos(autos):
    """Return a trimmed copy of the raw vehicles frame with compact dtypes.

    Keeps the columns listed in ``cols``, derives ``automatic``, ``speeds``
    and ``ffs`` from the text columns ``trany`` and ``eng_dscr`` (both of
    which are dropped afterwards), and casts the remaining columns to small
    numeric, categorical and timezone-aware datetime dtypes.

    Parameters
    ----------
    autos : pandas.DataFrame
        Raw frame; it must contain every column named in ``cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``autos`` itself is not modified.
    """
    cols = ['barrels08', 'city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr',
        'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year', 'VClass']
    return (autos
     [cols]
     .assign(# missing cylinder counts / displacements are stored as 0
             cylinders=autos.cylinders.fillna(0).astype('int8'),
             displ=autos.displ.fillna(0).astype('float16'),
             drive=autos.drive.fillna('Other').astype('category'),
             automatic=autos.trany.str.contains('Auto'),
             # Capture the whole run of digits. With r'(\d)+' the group only
             # keeps the LAST digit matched, so '10-spd' was read as 0.
             # expand=False returns a Series rather than a one-column frame;
             # rows without any digit fall back to 20.
             speeds=(autos.trany.str.extract(r'(\d+)', expand=False)
                     .fillna('20').astype('int8')),
             # Swap the EDT/EST abbreviations for numeric UTC offsets so the
             # strings parse as offset-aware, then show them in New York time.
             createdOn=pd.to_datetime(autos.createdOn.replace({' EDT': '-04:00',
                ' EST': '-05:00'}, regex=True), utc=True).dt.tz_convert('America/New_York'),
             ffs=autos.eng_dscr.str.contains('FFS')
            )
     .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16',
              'range': 'int16',  'year': 'int16', 'make': 'category', 'VClass': 'category'})
     # the raw text columns are no longer needed once the flags are derived
     .drop(columns=['trany', 'eng_dscr'])
    )

# Cleaned frame that the cells below plot from.
autos = tweak_autos(df)

In [None]:
autos

## Create an Object for Help

In [None]:
# Empty Plot object, kept only so its methods can be inspected in the cells below.
p = so.Plot()

In [None]:
p.

In [None]:
# List the attribute names available on the Plot object.
print(dir(p))

In [None]:
p.add?

In [None]:
so.Bar?

In [None]:
so.Bars?

In [None]:
# List the names exposed by the seaborn.objects module.
print(dir(so))

## Categoricals

In [None]:
# Column dtypes of the cleaned frame.
autos.dtypes

In [None]:
# Number of rows per vehicle class.
autos.VClass.value_counts()

In [None]:
# VClass on the x axis, drawn as so.Bar marks through the so.Hist stat.
(so
 .Plot(autos, x="VClass")
 .add(so.Bar(), so.Hist())
)

In [None]:
# Same Bar + Hist layer, with VClass mapped to y instead of x.
(so
 .Plot(autos, y="VClass")
 .add(so.Bar(), so.Hist())
)

In [None]:
p.add?

In [None]:
def limit_top_n(df, col, n=10, default='Other'):
    """Collapse the less frequent values of ``df[col]`` into one label.

    The ``n`` first entries of ``df[col].value_counts()`` are kept as-is;
    every other value is replaced by ``default``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    col : str
        Name of the column to collapse.
    n : int, default 10
        How many of the most frequent values to keep.
    default : default 'Other'
        Replacement label for everything outside the top ``n``.

    Returns
    -------
    pandas.Series
        New Series; for a categorical column the result stays categorical
        with unused categories removed.
    """
    ser = df[col]
    is_cat = isinstance(ser.dtype, pd.CategoricalDtype)
    top_n = ser.value_counts().index[:n]
    # A categorical can only hold declared categories, so register the
    # replacement label first. add_categories raises ValueError for a label
    # that already exists (e.g. a column that already has 'Other'), so it is
    # only added when it is missing.
    if is_cat and default not in ser.cat.categories:
        ser = ser.cat.add_categories([default])
    res = ser.where(ser.isin(top_n), default)
    if is_cat:
        # drop the categories that were collapsed away
        res = res.cat.remove_unused_categories()
    return res
# Preview with the defaults: the first 10 value_counts entries are kept, the rest become 'Other'.
limit_top_n(autos, 'VClass')

In [None]:
# Bar + Hist plot again, with VClass first collapsed by limit_top_n (default n and label).
(so
 .Plot(autos.assign(VClass=limit_top_n(autos, 'VClass')), y="VClass")
 .add(so.Bar(), so.Hist())
)

In [None]:
# facet: split the collapsed-VClass bar plot by year, wrapping after 5 facets.
# (The trailing comment on the facet line shows an optional `order=` restriction.)
(so
 .Plot(autos.assign(VClass=limit_top_n(autos, 'VClass')), y="VClass")
 .facet('year', wrap=5)#, order=[1990, 2000, 2010])
 .add(so.Bar(), so.Hist(), orient='y')
)

In [None]:
# facet limit: `order` names the four years to facet on; wrap=2 facets per row/column.
(so
 .Plot(autos.assign(VClass=limit_top_n(autos, 'VClass')), y="VClass")
 .facet('year', wrap=2, order=[1990, 2000, 2010, 2015])
 .add(so.Bar(), so.Hist(), orient='y')
)

## Categorical Exercise
* Create a categorical plot from the data

## Continuous Variable

In [None]:
# city08 on x: so.Hist stat drawn with a so.Line mark.
(so
 .Plot(autos, x='city08')
 .add(so.Line(), so.Hist())
)

In [None]:
# Same so.Hist stat, drawn with so.Bars marks instead of a line.
(so
 .Plot(autos, x='city08')
 .add(so.Bars(), so.Hist())
)

In [None]:
# Add .limit to restrict the x axis to 0-40.
(so
 .Plot(autos, x='city08')
 .add(so.Bars(), so.Hist())
 .limit(x=[0,40])
)

In [None]:
# change bins (doesn't take limit into account):
# bins=40 is passed to so.Hist, while .limit only sets the visible x range.
(so
 .Plot(autos, x='city08')
 .add(so.Bars(), so.Hist(bins=40))
 .limit(x=[0,40])
)

In [None]:
# Faceting: one column facet for each listed year (every 5 years, 1985-2015), wrapped after 4.
(so
 .Plot(autos, x='city08')
 .facet(col='year', order=[1985, 1990, 1995, 2000, 2005, 2010, 2015], wrap=4)
 .add(so.Bars(), so.Hist(bins=40))
 .limit(x=[0,40]) 
)

In [None]:
# color by year: `year` is mapped to the color property on the Plot itself.
(so
 .Plot(autos, x='city08', color='year')
 .add(so.Bars(), so.Hist(bins=40))
 .limit(x=[0,40]) 
)

In [None]:
# Try out a line plot: so.Lines marks over so.Hist with 100 bins, still colored by year.
(so
 .Plot(autos, x='city08', color='year')
 .add(so.Lines(), so.Hist(bins=100))
 .limit(x=[0,40]) 
)

## Continuous Exercise
* Choose a numeric column to visualize

## Continuous - Continuous

In [None]:
# displ (x) against comb08 (y), drawn with so.Dots marks.
(so
 .Plot(autos, x='displ', y='comb08')
 .add(so.Dots())
)

In [None]:
# Same dots, with the y scale set to 'log'.
(so
 .Plot(autos, x='displ', y='comb08')
 .add(so.Dots())
 .scale(y='log')
)

In [None]:
# Dots drawn with alpha=.01.
(so
 .Plot(autos, x='displ', y='comb08')
 .add(so.Dots(alpha=.01))
)

In [None]:
# Two layers: the dots, plus a so.Line mark over so.PolyFit() with its default settings.
(so
 .Plot(autos, x='displ', y='comb08')
 .add(so.Dots(alpha=.01))
 .add(so.Line(), so.PolyFit())
)

In [None]:
# Fit layer drawn in red, using PolyFit(order=1).
(so
 .Plot(autos, x='displ', y='comb08')
 .add(so.Dots(alpha=.01))
 .add(so.Line(color='red'), so.PolyFit(order=1))
)

In [None]:
# color by cylinders: the mapping is set on the Plot, so both layers receive it
(so
 .Plot(autos, x='displ', y='comb08', color='cylinders')
 .add(so.Dots(alpha=.01))
 .add(so.Line(), so.PolyFit())
)

In [None]:
# change colormap with `.scale`: the color scale is set to 'viridis'
(so
 .Plot(autos, x='displ', y='comb08', color='cylinders')
 .add(so.Dots(alpha=.01))
 .add(so.Line(), so.PolyFit())
 .scale(color='viridis')
)

In [None]:
# Correlation between cylinders and comb08.
autos.cylinders.corr(autos.comb08)

In [None]:
# side exploration to check out the relationship between
# displ and cylinders (as a single correlation number)

autos.cylinders.corr(autos.displ)

In [None]:
# side exploration to check out the relationship between
# displ and cylinders (as a scatter with a PolyFit line)
(so
 .Plot(autos, x='displ', y='cylinders')
 .add(so.Dots(alpha=.01))
 .add(so.Line(), so.PolyFit())
)

In [None]:
# color by year: the mapping is set on the Plot, so the fit layer receives it too
(so
 .Plot(autos, x='displ', y='comb08', color='year')
 .add(so.Dots(alpha=.01))
 .add(so.Line(), so.PolyFit(order=1))
)

In [None]:
# color by year but...
# add line fit for everything (note color parameter for last add):
# color=None removes the color mapping from that layer only
(so
 .Plot(autos, x='displ', y='comb08', color='year')
 .add(so.Dots(alpha=.01))
 .add(so.Line(), so.PolyFit(order=1), color=None)
)

In [None]:
# Row counts for make == 'Ford' (True) versus every other make (False).
(autos.make == 'Ford').value_counts()

In [None]:
# compare Ford to everything else
import numpy as np

# A temporary `ford` column labels each row 'Ford' or 'Other';
# that column drives the color mapping for both layers.
(so
 .Plot(
     autos.assign(ford=np.where(autos.make == 'Ford', 'Ford', 'Other')),
     x='displ',
     y='comb08',
     color='ford',
 )
 .add(so.Dots(alpha=.01))
 .add(so.Line(), so.PolyFit(order=1))
)

In [None]:
# plot top makes in facet (top 5 makes via limit_top_n, the rest as 'Other')
# first add plots everything in each cell (because of col=None)
# second add plots the make in red
(so
 .Plot(autos.assign(top_makes=limit_top_n(autos, 'make', n=5)),
       x='displ', y='comb08')
 .facet('top_makes', wrap=3)
 .add(so.Dots(alpha=.01, pointsize=2), col=None)
 .add(so.Dots(color='red', alpha=.1, pointsize=3))
)

In [None]:
# make y log-scale (same two layers as the previous facet plot)
(so
 .Plot(autos.assign(top_makes=limit_top_n(autos, 'make', n=5)),
       x='displ', y='comb08')
 .facet('top_makes', wrap=3)
 .add(so.Dots(alpha=.01, pointsize=2), col=None)
 .add(so.Dots(color='red', alpha=.1, pointsize=3))
 .scale(y='log')
)

In [None]:
# make x and y log-scale
# (note log of 0 disappears!)
(so
 .Plot(autos.assign(top_makes=limit_top_n(autos, 'make', n=5)),
       x='displ', y='comb08')
 .facet('top_makes', wrap=3)
 .add(so.Dots(alpha=.01, pointsize=2), col=None)
 .add(so.Dots(color='red', alpha=.1, pointsize=3))
 .scale(y='log', x='log')
)

In [None]:
# color by year
# first layer: fixed grey dots in every facet (col=None)
# second layer: no fixed color (the 'red' is commented out), so the Plot's year mapping can apply

(so
 .Plot(autos.assign(top_makes=limit_top_n(autos, 'make', n=5)),
       x='displ', y='comb08', color='year')
 .facet('top_makes', wrap=3)
 .add(so.Dots(color='grey', 
              alpha=.01, pointsize=2), col=None)
 .add(so.Dots(#color='red',
              alpha=.1, pointsize=3))
 .scale(y='log', x='log', color='viridis')
)

In [None]:
p.pair?

In [None]:
# Use .pair to facet: y='city08' is paired with each of the four listed x columns (wrap=2),
# colored by the collapsed top_makes column
(so
 .Plot(autos.assign(top_makes=limit_top_n(autos, 'make', n=5)), 
       y='city08', color='top_makes')
 .pair(x=['cylinders', 'speeds', 'displ', 'barrels08'], wrap=2)
 .add(so.Dots(alpha=.3))
)

In [None]:
# color by year: same paired layout, with color mapped to year and alpha=.1
# (the top_makes column is still added here but not referenced by this plot)
(so
 .Plot(autos.assign(top_makes=limit_top_n(autos, 'make', n=5)), y='city08', color='year')
 .pair(x=['cylinders', 'speeds', 'displ', 'barrels08'], wrap=2)
 .add(so.Dots(alpha=.1))
)

## Cont/cont Exercise
* Create a plot to compare two continuous variables

## Time Data

In [None]:
# highway08 against year: so.Agg() stat drawn as a so.Line mark.
(so
 .Plot(autos, x='year', y='highway08')
 .add(so.Line(), so.Agg())  # mean
)

In [None]:
# Same aggregate, colored by the collapsed top_makes column (top 5 makes, rest 'Other').
(so
 .Plot(autos.assign(top_makes=limit_top_n(autos, 'make', n=5)),
       x='year', y='highway08', color='top_makes')
 .add(so.Line(), so.Agg())  # mean
)

In [None]:
# so.Agg with a named aggregation: func='max'.
(so
 .Plot(autos, x='year', y='highway08')
 .add(so.Line(), so.Agg(func='max'))
)

In [None]:
# so.Agg with a callable: the lambda returns the .8 quantile of the data it receives.
(so
 .Plot(autos, x='year', y='highway08')
 .add(so.Line(), so.Agg(func=lambda data: data.quantile(.8)))
)

In [None]:
# Facet the so.Agg() line by the collapsed top_makes column, wrapping after 3 facets.
(so
 .Plot(autos.assign(top_makes=limit_top_n(autos, 'make', n=5)),
       x='year', y='highway08')
 .facet('top_makes', wrap=3)
 .add(so.Line(), so.Agg())
)

In [None]:
# Faceted lines as before, plus a second layer with alpha=.2:
# col=None drops the facet variable for that layer and group='top_makes' groups it by make.
(so
 .Plot(autos.assign(top_makes=limit_top_n(autos, 'make', n=5)),
       x='year', y='highway08')
 .facet('top_makes', wrap=3)
 .add(so.Line(), so.Agg())
 .add(so.Line(alpha=.2), so.Agg(), group='top_makes', col=None)
)

## Time Series Exercise
* Create a time series plot from the data

## Extra

In [None]:
# Set ticks
(so
 .Plot(autos, x='displ', y='cylinders')
 .add(so.Dots(alpha=.01))
 .scale(x=so.Continuous().tick(at=[0,5,10]),
        y=so.Continuous().tick(at=[0,3,6,9,12]))
)

In [None]:
# Set limit: y runs from -1 to 9, with explicit y ticks and
# tick labels formatted by the '{x:.0f} cylinders' template
(so
 .Plot(autos, x='displ', y='cylinders')
 .add(so.Dots(alpha=.01))
 .scale(y=so.Continuous().tick(at=[0,1,2,4,6,8])
             .label(like='{x:.0f} cylinders'))
 .limit(y=(-1,9))
)

In [None]:
# Set title: same plot as the limit example, with a title added through .label(title=...)
(so
 .Plot(autos, x='displ', y='cylinders')
 .add(so.Dots(alpha=.01))
 .scale(y=so.Continuous().tick(at=[0,1,2,4,6,8])
             .label(like='{x:.0f} cylinders'))
 .limit(y=(-1,9))
 .label(title='Cylinders per Displacement')
)

In [None]:
# Looks like this is still WIP
# Plot on figure: create an 8x3 matplotlib figure, hand it to the Plot with .on(fig),
# then call .plot(). `ax` is not used below; only `fig` is passed on.
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8,3))
(so
 .Plot(autos, x='displ', y='cylinders')
 .add(so.Dots(alpha=.01))
 .scale(y=so.Continuous().tick(at=[0,1,2,4,6,8])
             .label(like='{x:.0f} cyl'))
 .limit(y=(-1,9))
 .label(title='Cylinders per Displacement')
 .on(fig)
 .plot()
)


## Summary

New Seaborn interface

* Is completely new
* Provides consistency
* Integrates with Pandas and Matplotlib
* Might change in future
