# Idiomatic Pandas
## Better Pandas Code


https://github.com/mattharrison/talks

## About Matt  Harrison @\_\_mharrison\_\_

* Author of Effective Pandas, Machine Learning Pocket Reference, and Learning Python for Data.
* Advisor at Ponder (creators of Modin) - sold to Snowflake
* Corporate trainer at MetaSnake. Taught Pandas to 1000's of students.
* Upcoming Live Courses <a href='https://store.metasnake.com'>at MetaSnake</a>

## Pandas Background

* 2000 NLP
* 2006 Created Python OLAP Engine
* 2009 Heard about Pandas
* Used Pandas for failure modeling, analytics, and ml
* 2016 Learning the Pandas Library
* 2019 Spark
* 2020 Pandas Cookbook
* 2021 Effective Pandas
* 2022 CuDf, Modin, Polars
* 2023 Pandas 2.0

## Outline of Opinions

* Load Data
* Types
* Chaining
* Mutation
* Apply
* Aggregation

## Data

In [None]:
!pip install -U pandas pyarrow

In [None]:
from IPython.display import display
import numpy as np
import pandas as pd
#import modin.pandas as pd

In [None]:
pd.__version__

In [None]:
pd.options.display.min_rows = 20

In [None]:
autos = pd.read_csv('https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip',
                   dtype_backend='pyarrow',
                   engine='pyarrow')

In [None]:
autos = pd.read_csv('/Users/matt/Downloads/vehicles.csv.zip',
                   dtype_backend='pyarrow',
                   engine='pyarrow')

In [None]:
# a glorious function
def tweak_autos(autos):
    """Return a cleaned, memory-efficient copy of the raw vehicles frame.

    Selects the columns of interest, fills missing ``cylinders`` and
    ``displ`` with 0, makes ``drive`` and ``make`` categorical, derives
    ``automatic``/``speeds`` from ``trany`` and ``ffs`` from ``eng_dscr``,
    parses ``createdOn`` into America/New_York time and downcasts the
    integer columns to small pyarrow-backed types.  ``trany`` and
    ``eng_dscr`` are left out of the result.
    """
    raw_columns = [
        'city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive',
        'eng_dscr', 'fuelCost08', 'make', 'model', 'trany', 'range',
        'createdOn', 'year',
    ]
    compact_dtypes = {
        'highway08': 'int8[pyarrow]',
        'city08': 'int16[pyarrow]',
        'comb08': 'int16[pyarrow]',
        'fuelCost08': 'int16[pyarrow]',
        'range': 'int16[pyarrow]',
        'year': 'int16[pyarrow]',
        'make': 'category',
    }
    output_columns = [
        'city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive',
        'fuelCost08', 'make', 'model', 'range', 'createdOn', 'year',
        'automatic', 'speeds', 'ffs',
    ]
    # zone abbreviations are swapped for numeric offsets so the %z
    # directive in the format below can parse them
    zone_to_offset = {' EDT': ' -0400', ' EST': ' -0500'}

    return (
        autos
        .loc[:, raw_columns]
        .assign(
            cylinders=autos.cylinders.fillna(0).astype('int8[pyarrow]'),
            displ=autos.displ.fillna(0).astype('float32[pyarrow]'),
            drive=autos.drive.fillna('Other').astype('category'),
            automatic=autos.trany.astype(str).str.contains('Auto'),
            # first run of digits in trany is the gear count; 20 marks "none found"
            speeds=(
                autos.trany
                .astype(str)
                .str.extract(r'(\d+)')
                .fillna('20')
                .astype('int8[pyarrow]')
            ),
            createdOn=(
                pd.to_datetime(
                    autos.createdOn.replace(zone_to_offset, regex=True),
                    format='%a %b %d %H:%M:%S %z %Y',
                    utc=True,
                )
                .dt.tz_convert('America/New_York')
            ),
            ffs=autos.eng_dscr.str.contains('FFS'),
        )
        .astype(compact_dtypes)
        .loc[:, output_columns]
    )

tweak_autos(autos)

In [None]:
autos.columns

In [None]:
# 68 Megs (w/ Pandas 1.x)
# 29 M (Pandas 2)
autos.memory_usage(deep=True).sum()

## Exercise

* View the documentation for the `.info` method (use `df.info??`)
* Run the `.info()` method
* Run the `.describe()` method

## Types
Getting the right types will enable analysis and correctness.

In [None]:
cols = ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr', 
        'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year']

In [None]:
autos[cols].dtypes

In [None]:
autos[cols].memory_usage(deep=False)

In [None]:
# 19 Megs (Pandas 1)
# 7 Megs (Pandas 2)
autos[cols].memory_usage(deep=True).sum()

### Ints

In [None]:
autos[cols].select_dtypes(int).describe()

In [None]:
# chaining
(autos
 [cols]
 .select_dtypes(int)
 .describe()
)

In [None]:
# can comb08 be an int8?
np.iinfo(np.int8)

In [None]:
# no but maybe a uint8
np.iinfo(np.uint8)

In [None]:
# chaining
(autos
 [cols]
 .astype({'highway08': 'int8[pyarrow]', 'city08': 'int16[pyarrow]', 'comb08': 'uint8[pyarrow]' })
 #.select_dtypes([int, 'int8[pyarrow]'])  # only int64 or int8
 .select_dtypes('integer')  # all integers
 .describe()
)

## Integer Exercise
* Find the 90% `.quantile` for the integer columns
* Find the *spearman* correlation coefficient (`.corr`) for the integer columns

## Other int types

In [None]:
# Fancy query to select columns where row "max" is < 255
(autos
 [cols]
 .describe()
 .loc[:, lambda a_df: a_df.loc['max'] < 255]
)

In [None]:
# chaining
# use 'integer' to see all int-like columns
(autos
 [cols]
 .astype({'highway08': 'int8[pyarrow]', 'city08': 'int16[pyarrow]', 'comb08': 'int16[pyarrow]', 'fuelCost08': 'int16[pyarrow]', 
          'range': 'int16[pyarrow]', 'year': 'int16[pyarrow]'})
 .select_dtypes(['integer'])  # see https://numpy.org/doc/stable/reference/arrays.scalars.html
 .describe()
)

In [None]:
# chaining
(autos
 [cols]
 .astype({'highway08': 'int8[pyarrow]', 'city08': 'int16[pyarrow]', 
          'comb08': 'int16[pyarrow]', 'fuelCost08': 'int16[pyarrow]', 
          'range': 'int16[pyarrow]', 'year': 'int16[pyarrow]'})
 .memory_usage(deep=True)
 .sum()  # was 19,647,323
)

### Floats

In [None]:
(autos
[cols]
.select_dtypes('float'))

In [None]:
# In pandas 1 this would be float
autos.cylinders.dtype

In [None]:
# surprise! cylinders looks int-like
autos.cylinders.describe()

In [None]:
# oops! missing values
autos.cylinders.value_counts(dropna=False)

In [None]:
# where are they missing?
(autos
  [cols]
  .query('cylinders.isna()')
)

In [None]:
# chaining - add cylinders and displ columns
(autos
 [cols]
 .assign(cylinders=autos.cylinders.fillna(0).astype('int8[pyarrow]'),
         displ=autos.displ.fillna(0))
 .astype({'highway08': 'int8[pyarrow]', 'city08': 'int16[pyarrow]', 'comb08': 'int16[pyarrow]', 
          'fuelCost08': 'int16[pyarrow]', 'range': 'int16[pyarrow]', 'year': 'int16[pyarrow]',  })
 .describe()
)

In [None]:
np.iinfo(np.uint16)

In [None]:
# use this to inspect float sizes
np.finfo(np.float16)

In [None]:
# Pyarrow doesn't have float16...
# use this to inspect float sizes
np.finfo(np.float32)

In [None]:
# chaining - convert displ to float32
(autos
 [cols]
 .assign(cylinders=autos.cylinders.fillna(0).astype('int8[pyarrow]'),
         displ=autos.displ.fillna(0).astype('float32[pyarrow]'))
 .astype({'highway08': 'int8[pyarrow]', 'city08': 'int16[pyarrow]', 'comb08': 'int16[pyarrow]', 
          'fuelCost08': 'int16[pyarrow]', 'range': 'int16[pyarrow]',  'year': 'int16[pyarrow]'})
)

In [None]:
# new memory usage
(autos
 .loc[:, cols]
 .assign(cylinders=autos.cylinders.fillna(0).astype('int8[pyarrow]'),
         displ=autos.displ.fillna(0).astype('float32[pyarrow]'))
 .astype({'highway08': 'int8[pyarrow]', 'city08': 'int16[pyarrow]', 'comb08': 'int16[pyarrow]',
          'fuelCost08': 'int16[pyarrow]', 'range': 'int16[pyarrow]',  'year': 'int16[pyarrow]'})
 .memory_usage(deep=True)
 .sum()  # was 19,647,323
)

## Float Exercise

* Find the `.count` of missing values for the float columns
* Find the Pearson correlation coefficient for float columns

### Objects

In [None]:
# pandas 1.x
(autos
 [cols]
 .select_dtypes(object)
)

In [None]:
# pandas 2
(autos
 [cols]
 .select_dtypes('string')
)

In [None]:
# looks categorical
# Note that missing values are EMPTY strings in Pandas 2.0 but not 2.1
(autos.drive.value_counts(dropna=False))

In [None]:
# where are the values missing for drive?
(autos
 [cols]
 .query('drive.isna()'))

In [None]:
# where are the values missing for drive?
(autos
 [cols]
 .query('drive == ""'))

In [None]:
autos.sample(10)

In [None]:
# drive and make (in .astype) to category
(autos
 [cols]
 .assign(cylinders=autos.cylinders.fillna(0).astype('int8[pyarrow]'),
         displ=autos.displ.fillna(0).astype('float32[pyarrow]'),
         #drive=autos.drive.replace('', 'Other').astype('category')
         drive=autos.drive.fillna('Other').astype('category')
        )
 .astype({'highway08': 'int8[pyarrow]', 'city08': 'int16[pyarrow]', 'comb08': 'int16[pyarrow]', 'fuelCost08': 'int16[pyarrow]', 
          'range': 'int16[pyarrow]',  'year': 'int16[pyarrow]', 'make': 'category'})
 .memory_usage(deep=True)
 .sum()  # was 19,647,323
)

In [None]:
# let's inspect trany
# looks like it has two pieces of information embedded in column
(autos.trany.value_counts(dropna=False))

In [None]:
# add automatic, speeds from trany, then drop trany
(autos
 [cols]
 .assign(cylinders=autos.cylinders.fillna(0).astype('int8[pyarrow]'),
         displ=autos.displ.fillna(0).astype('float32[pyarrow]'),
         drive=autos.drive.fillna('Other').astype('category'),
         automatic=autos.trany.str.contains('Auto'),
         # pyarrow doesn't like next line
         speeds=autos.trany.str.extract(r'(\d+)').fillna('20').astype('int8[pyarrow]')
        )
 .astype({'highway08': 'int8[pyarrow]', 'city08': 'int16[pyarrow]', 'comb08': 'int16[pyarrow]', 'fuelCost08': 'int16[pyarrow]', 
          'range': 'int16[pyarrow]',  'year': 'int16[pyarrow]', 'make': 'category'})
 .drop(columns=['trany'])
 .memory_usage(deep=True)
 .sum()  # was 19,647,323
)

In [None]:
# add automatic, speeds from trany, then drop trany
(autos
 [cols]
 .assign(cylinders=autos.cylinders.fillna(0).astype('int8[pyarrow]'),
         displ=autos.displ.fillna(0).astype('float32[pyarrow]'),
         drive=autos.drive.fillna('Other').astype('category'),
         automatic=autos.trany.str.contains('Auto'),
         # pyarrow doesn't like next line
         speeds=autos.trany.astype(str).str.extract(r'(\d+)').fillna('20').astype('int8[pyarrow]')
        )
 .astype({'highway08': 'int8[pyarrow]', 'city08': 'int16[pyarrow]', 'comb08': 'int16[pyarrow]', 'fuelCost08': 'int16[pyarrow]', 
          'range': 'int16[pyarrow]',  'year': 'int16[pyarrow]', 'make': 'category'})
 .drop(columns=['trany'])
 #.memory_usage(deep=True)
 #.sum()  # was 19,647,323
)

In [None]:
# note that we can add spacing to the column transformation
# to make it easier to read
(autos
 .trany
 .astype(str)
 .str.extract(r'(\d+)')
 .fillna('20')
 .astype('int8[pyarrow]')
)

## String Exercise
* Find the `.str` attributes of a string column
* Pull up the documentation for a string attribute (`.startswith`)

### Dates

In [None]:
autos.createdOn

In [None]:
pd.to_datetime(autos.createdOn)

In [None]:
# add a format (works for a few...)
pd.to_datetime(autos.createdOn.iloc[:10], format='%a %b %d %H:%M:%S %Z %Y')

In [None]:
pd.to_datetime(autos.createdOn, format='%a %b %d %H:%M:%S %Z %Y')

In [None]:
# Change problematic abbreviations to offsets
# Also need to convert to UTC (otherwise type is bad)
pd.to_datetime(autos
    .createdOn
    .replace({' EDT': ' -0400', ' EST': ' -0500'}, regex=True),
              format='%a %b %d %H:%M:%S %z %Y', utc=True)

In [None]:
# Change TZ
dates = (pd.to_datetime(autos
    .createdOn
    .replace({' EDT': ' -0400', ' EST': ' -0500'}, regex=True),
              format='%a %b %d %H:%M:%S %z %Y', utc=True)
 .dt.tz_convert('America/New_York')
)

dates

In [None]:
dates.dt.

In [None]:
# add createdOn
(autos
 [cols]
 .assign(cylinders=autos.cylinders.fillna(0).astype('int8[pyarrow]'),
         displ=autos.displ.fillna(0).astype('float32[pyarrow]'),
         drive=autos.drive.fillna('Other').astype('category'),
         automatic=autos.trany.str.contains('Auto'),
         speeds=autos.trany.astype(str).str.extract(r'(\d+)').fillna('20').astype('int8[pyarrow]'),
         createdOn=pd.to_datetime(autos
            .createdOn
            .replace({' EDT': ' -0400', ' EST': ' -0500'}, regex=True),
                      format='%a %b %d %H:%M:%S %z %Y', utc=True)
            .dt.tz_convert('America/New_York')
        )
 .astype({'highway08': 'int8[pyarrow]', 'city08': 'int16[pyarrow]', 'comb08': 'int16[pyarrow]', 'fuelCost08': 'int16[pyarrow]', 
          'range': 'int16[pyarrow]',  'year': 'int16[pyarrow]', 'make': 'category'})
 .drop(columns=['trany'])
 .memory_usage(deep=True)
 .sum()  # was 19,647,323
)

In [None]:
autos.eng_dscr.value_counts(dropna=False)

In [None]:
# add ffs (Feedback fuel system), drop eng_dscr
(autos
 [cols]
 .assign(cylinders=autos.cylinders.fillna(0).astype('int8[pyarrow]'),
         displ=autos.displ.fillna(0).astype('float32[pyarrow]'),
         drive=autos.drive.fillna('Other').astype('category'),
         automatic=autos.trany.astype(str).str.contains('Auto'),
         speeds=autos.trany.astype(str).str.extract(r'(\d+)').fillna('20').astype('int8[pyarrow]'),
         createdOn=pd.to_datetime(autos
            .createdOn
            .replace({' EDT': ' -0400', ' EST': ' -0500'}, regex=True),
                      format='%a %b %d %H:%M:%S %z %Y', utc=True)
            .dt.tz_convert('America/New_York'),
         ffs=autos.eng_dscr.str.contains('FFS')
        )
 .astype({'highway08': 'int8[pyarrow]', 'city08': 'int16[pyarrow]', 'comb08': 'int16[pyarrow]', 'fuelCost08': 'int16[pyarrow]', 
          'range': 'int16[pyarrow]', 'year': 'int16[pyarrow]', 'make': 'category'})
 #.drop(columns=['trany', 'eng_dscr'])
 .loc[:, ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive',
       'fuelCost08', 'make', 'model', 'range', 'createdOn', 'year',
       'automatic', 'speeds', 'ffs']]
 #.columns
 .memory_usage(deep=True)
 .sum()  # was 19,647,323
)

In [None]:
# a glorious function
def tweak_autos(autos):
    """Return a cleaned, memory-efficient copy of the raw vehicles frame.

    Wraps the chain built up in the preceding cells: selects the columns
    of interest, fills missing ``cylinders`` and ``displ`` with 0, makes
    ``drive`` and ``make`` categorical, derives ``automatic``/``speeds``
    from ``trany`` and ``ffs`` from ``eng_dscr``, parses ``createdOn``
    into America/New_York time and downcasts the integer columns to small
    pyarrow-backed types.  ``trany`` and ``eng_dscr`` are left out of the
    result.
    """
    raw_columns = [
        'city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive',
        'eng_dscr', 'fuelCost08', 'make', 'model', 'trany', 'range',
        'createdOn', 'year',
    ]
    compact_dtypes = {
        'highway08': 'int8[pyarrow]',
        'city08': 'int16[pyarrow]',
        'comb08': 'int16[pyarrow]',
        'fuelCost08': 'int16[pyarrow]',
        'range': 'int16[pyarrow]',
        'year': 'int16[pyarrow]',
        'make': 'category',
    }
    output_columns = [
        'city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive',
        'fuelCost08', 'make', 'model', 'range', 'createdOn', 'year',
        'automatic', 'speeds', 'ffs',
    ]
    # zone abbreviations are swapped for numeric offsets so the %z
    # directive in the format below can parse them
    zone_to_offset = {' EDT': ' -0400', ' EST': ' -0500'}

    return (
        autos
        .loc[:, raw_columns]
        .assign(
            cylinders=autos.cylinders.fillna(0).astype('int8[pyarrow]'),
            displ=autos.displ.fillna(0).astype('float32[pyarrow]'),
            drive=autos.drive.fillna('Other').astype('category'),
            automatic=autos.trany.astype(str).str.contains('Auto'),
            # first run of digits in trany is the gear count; 20 marks "none found"
            speeds=(
                autos.trany
                .astype(str)
                .str.extract(r'(\d+)')
                .fillna('20')
                .astype('int8[pyarrow]')
            ),
            createdOn=(
                pd.to_datetime(
                    autos.createdOn.replace(zone_to_offset, regex=True),
                    format='%a %b %d %H:%M:%S %z %Y',
                    utc=True,
                )
                .dt.tz_convert('America/New_York')
            ),
            ffs=autos.eng_dscr.str.contains('FFS'),
        )
        .astype(compact_dtypes)
        .loc[:, output_columns]
    )

tweak_autos(autos)

## Date Exercise
* List the attributes of the `.dt` attribute

## Chain

Chaining is also called "flow" programming. Rather than making intermediate variables, just leverage the fact that most operations return a new object and work on that.

The chain should read like a recipe of ordered steps.

(BTW, this is actually what we did above.)

<div class='alert alert-warning'>
    Hint: Leverage <tt>.pipe</tt> if you can't find a way to chain 😉🐼💪
</div>
    




In [None]:
# show debugging this

def tweak_autos(autos):
    """Return the most compact cleaned copy of the raw vehicles frame.

    Same recipe as the earlier versions (column selection, missing-value
    fills, ``automatic``/``speeds`` from ``trany``, ``createdOn`` parsed
    into America/New_York time, small pyarrow integer types), with three
    extra space savers: missing ``ffs`` values become ``False`` and both
    ``model`` and ``automatic`` are stored as categoricals alongside
    ``make`` and ``drive``.
    """
    raw_columns = [
        'city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive',
        'eng_dscr', 'fuelCost08', 'make', 'model', 'trany', 'range',
        'createdOn', 'year',
    ]
    compact_dtypes = {
        'highway08': 'int8[pyarrow]',
        'city08': 'int16[pyarrow]',
        'comb08': 'int16[pyarrow]',
        'fuelCost08': 'int16[pyarrow]',
        'range': 'int16[pyarrow]',
        'year': 'int16[pyarrow]',
        'make': 'category',
        'model': 'category',
        'automatic': 'category',
    }
    output_columns = [
        'city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive',
        'fuelCost08', 'make', 'model', 'range', 'createdOn', 'year',
        'automatic', 'speeds', 'ffs',
    ]
    # zone abbreviations are swapped for numeric offsets so the %z
    # directive in the format below can parse them
    zone_to_offset = {' EDT': ' -0400', ' EST': ' -0500'}

    return (
        autos
        .loc[:, raw_columns]
        .assign(
            cylinders=autos.cylinders.fillna(0).astype('int8[pyarrow]'),
            displ=autos.displ.fillna(0).astype('float32[pyarrow]'),
            drive=autos.drive.fillna('Other').astype('category'),
            automatic=autos.trany.astype(str).str.contains('Auto'),
            # first run of digits in trany is the gear count; 20 marks "none found"
            speeds=(
                autos.trany
                .astype(str)
                .str.extract(r'(\d+)')
                .fillna('20')
                .astype('int8[pyarrow]')
            ),
            createdOn=(
                pd.to_datetime(
                    autos.createdOn.replace(zone_to_offset, regex=True),
                    format='%a %b %d %H:%M:%S %z %Y',
                    utc=True,
                )
                .dt.tz_convert('America/New_York')
            ),
            # a missing engine description counts as "no FFS"
            ffs=autos.eng_dscr.str.contains('FFS').fillna(False),
        )
        .astype(compact_dtypes)
        .loc[:, output_columns]
    )

tweak_autos(autos)#.dtypes

In [None]:
# 1.5 Megs!
tweak_autos(autos).memory_usage(deep=True).sum()

In [None]:
# compare chain to this mess
# Imperative counter-example to the chained tweak_autos: every step gets its
# own throwaway variable and a1 is mutated in place.
# NOTE(review): a1 is a column selection of autos, so the in-place assignments
# below are the pattern that can trigger SettingWithCopyWarning.
a1 = autos[cols]
cyls = autos.cylinders.fillna(0)
cyls2 = cyls.astype('int8[pyarrow]')
a1['cylinders'] = cyls2
displ = a1.displ
displ2 = displ.fillna(0)
displ3 = displ2.astype('float32[pyarrow]')
a1.displ = displ3
# NOTE(review): replaces empty strings, whereas the chain uses .fillna('Other')
a1.drive = autos.drive.replace('', 'Other').astype('category')
a1['automatic'] = autos.trany.astype(str).str.contains('Auto')
speed = autos.trany.astype(str).str.extract(r'(\d+)')
speedfill = speed.fillna('20')
speedint = speedfill.astype('int8[pyarrow]')
a1['speeds'] = speedint
# NOTE(review): no explicit format / offset replacement here, and tz_localize
# instead of the chain's utc=True + tz_convert -- not the same conversion
a1.createdOn=pd.to_datetime(autos.createdOn).dt.tz_localize('America/New_York')
# NOTE(review): 'ffs' is not in cols, so this is attribute assignment on a
# name that is not yet a column -- presumably no 'ffs' column is created; confirm
a1.ffs=autos.eng_dscr.str.contains('FFS')
a1['highway08'] = autos.highway08.astype('int8[pyarrow]')
# NOTE(review): int8 here, while the chain casts city08 to int16[pyarrow]
a1['city08'] = autos.city08.astype('int8[pyarrow]')
a1['comb08'] = autos.comb08.astype('int16[pyarrow]')
a1['fuelCost08'] = autos.fuelCost08.astype('int16[pyarrow]')
a1['range'] = autos.range.astype('int16[pyarrow]')
# NOTE(review): 'year' is never downcast in this version, unlike in the chain
a1['make'] = autos.make.astype('category')
a3 = a1.drop(columns=['trany', 'eng_dscr'])

In [None]:
###### easy to debug
#  - assign to var (df3)
#  - comment out
#  - pipe to display


from IPython.display import display

def get_var(df, var_name):
    """Store *df* as a module-level global named *var_name* and return it.

    Returning the object unchanged makes this usable inside a chain via
    ``.pipe(get_var, 'name')`` to capture an intermediate result.
    """
    module_namespace = globals()
    module_namespace.update({var_name: df})
    return df

def tweak_autos(autos):
    """Debug-instrumented version of the cleaning chain.

    Prints the frame shape before column selection, after it, and after
    ``.assign``; stores the column-selected frame in the global ``df3``
    (via ``get_var``); and displays the frame right after ``.assign``.
    Each ``.pipe`` step returns the frame unchanged, so the chain keeps
    flowing.  ``cols`` is read from the enclosing (notebook) scope.
    """
    return (autos
    .pipe(lambda df: print(df.shape) or df)  # print() returns None, so `or df` passes the frame on
     [cols]
    .pipe(lambda df: print(df.shape) or df)  # shape after column selection
      # stash the intermediate frame in a global variable for later inspection
     .pipe(get_var, 'df3')
     .assign(cylinders=autos.cylinders.fillna(0).astype('int8[pyarrow]'),
             displ=autos.displ.fillna(0).astype('float32[pyarrow]'),
             drive=autos.drive.fillna('Other').astype('category'),
             automatic=autos.trany.astype(str).str.contains('Auto'),
             speeds=autos.trany.astype(str).str.extract(r'(\d+)').fillna('20').astype('int8[pyarrow]'),
             createdOn=pd.to_datetime(autos
                .createdOn
                .replace({' EDT': ' -0400', ' EST': ' -0500'}, regex=True),
                          format='%a %b %d %H:%M:%S %z %Y', utc=True)
                .dt.tz_convert('America/New_York'),
             ffs=autos.eng_dscr.str.contains('FFS')
            )
     # debug pipes: print the shape, then render the frame mid-chain
    .pipe(lambda df: print(df.shape) or df)
     .pipe(lambda df: display(df) or df)
     .astype({'highway08': 'int8[pyarrow]', 'city08': 'int16[pyarrow]', 'comb08': 'int16[pyarrow]', 'fuelCost08': 'int16[pyarrow]', 
              'range': 'int16[pyarrow]', 'year': 'int16[pyarrow]', 'make': 'category'})
     .loc[:, ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive',
       'fuelCost08', 'make', 'model', 'range', 'createdOn', 'year',
       'automatic', 'speeds', 'ffs']]
    )

tweak_autos(autos)

In [None]:
# inspect intermediate data frame
df3

## Chain Exercise
* Make a chain to 
  * Filter F150 models
  * With *highway08* greater than 20
  * Correlate the values
  * Select the *city08*, *year*, *cylinders*, *displ* columns

## More Chain

Chaining w/ Functions enables:

* Testing
* Deployment
* Reuse

## Don't Mutate

> "you are missing the point, inplace rarely actually does something inplace, you are thinking that you are saving memory but you are not."
>
> **jreback** - Pandas core dev



https://github.com/pandas-dev/pandas/issues/16529#issuecomment-676518136

* In general, no performance benefits
* Prohibits chaining
* ``SettingWithCopyWarning`` fun


## Don't Apply (if you can)

In [None]:
def tweak_autos(autos):
    """Return a cleaned, memory-efficient copy of the raw vehicles frame.

    Selects the columns of interest, fills missing ``cylinders`` and
    ``displ`` with 0, makes ``drive`` and ``make`` categorical, derives
    ``automatic``/``speeds`` from ``trany`` and ``ffs`` from ``eng_dscr``,
    parses ``createdOn`` into America/New_York time and downcasts the
    integer columns to small pyarrow-backed types.

    The column list is declared locally (as in the other definitions of
    this function) so the result does not depend on a ``cols`` variable
    left behind by an earlier notebook cell.
    """
    cols = ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr',
        'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year']
    return (autos
     [cols]
     .assign(cylinders=autos.cylinders.fillna(0).astype('int8[pyarrow]'),
             displ=autos.displ.fillna(0).astype('float32[pyarrow]'),
             drive=autos.drive.fillna('Other').astype('category'),
             automatic=autos.trany.astype(str).str.contains('Auto'),
             # first run of digits in trany is the gear count; 20 marks "none found"
             speeds=autos.trany.astype(str).str.extract(r'(\d+)').fillna('20').astype('int8[pyarrow]'),
             # zone abbreviations become numeric offsets so %z can parse them
             createdOn=pd.to_datetime(autos
                .createdOn
                .replace({' EDT': ' -0400', ' EST': ' -0500'}, regex=True),
                          format='%a %b %d %H:%M:%S %z %Y', utc=True)
                .dt.tz_convert('America/New_York'),
             ffs=autos.eng_dscr.str.contains('FFS')
            )
     .astype({'highway08': 'int8[pyarrow]', 'city08': 'int16[pyarrow]', 'comb08': 'int16[pyarrow]',
              'fuelCost08': 'int16[pyarrow]', 'range': 'int16[pyarrow]', 'year': 'int16[pyarrow]',
              'make': 'category'})
     .loc[:, ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive',
       'fuelCost08', 'make', 'model', 'range', 'createdOn', 'year',
       'automatic', 'speeds', 'ffs']]
    )


autos2 = tweak_autos(autos)

In [None]:
# try to be more Euro-centric
def to_lper100km(val):
    """Convert a miles-per-gallon figure to litres per 100 km.

    Plain arithmetic, so it works on a scalar as well as on a whole
    Series at once.
    """
    # 235.215 is the mpg (US gallon) <-> L/100 km conversion constant
    lper100km_times_mpg = 235.215
    return lper100km_times_mpg / val
autos2.city08.apply(to_lper100km)

In [None]:
# this gives the same results
235.215 / autos2.city08 

In [None]:
%%timeit
autos2.city08.apply(to_lper100km)

In [None]:
%%timeit
235.215 / autos2.city08 

In [None]:
# ~50x slower!
6_220 / 110

In [None]:
def is_american(val):
    """Return True when *val* is one of the US makes singled out in this talk."""
    us_makes = {'Chevrolet', 'Ford', 'Dodge', 'GMC', 'Tesla'}
    return val in us_makes

In [None]:
%%timeit
autos2.make.apply(is_american)

In [None]:
%%timeit
autos2.make.isin({'Chevrolet', 'Ford', 'Dodge', 'GMC', 'Tesla'})

In [None]:
autos3 = autos2.assign(make=autos2.make.astype(str))

In [None]:
%%timeit
# converted to string
autos3.make.isin({'Chevrolet', 'Ford', 'Dodge', 'GMC', 'Tesla'})

In [None]:
%%timeit
autos3.make.apply(is_american)

In [None]:
def country(val):
    """Map a make to ``'US'`` for the listed US makes, ``'Other'`` for anything else."""
    us_makes = {'Chevrolet', 'Ford', 'Dodge', 'GMC', 'Tesla'}
    return 'US' if val in us_makes else 'Other'

In [None]:
%%timeit
# Might be ok for strings, since they are not vectorized...
(autos2
 .assign(country=autos2.make.apply(country))
)

In [None]:
%%timeit
values = {'Chevrolet', 'Ford', 'Dodge', 'GMC', 'Tesla'}
(autos2
 .assign(country='US')
 .assign(country=lambda df_:df_.country.where(df_.make.isin(values), 'Other'))
)

In [None]:
%%timeit

(autos2
 .assign(country=np.select([autos2.make.isin({'Chevrolet', 'Ford', 'Dodge', 'GMC', 'Tesla'})], 
                           ['US'], 'Other'))
)

In [None]:
%%timeit

(autos2
 .assign(country=np.where(autos2.make.isin({'Chevrolet', 'Ford', 'Dodge', 'GMC', 'Tesla'}), 
                          'US', 'Other'))
)

## Apply Exercise
* Create a column, *mycomb*, that is the mean of *city08* and *highway08*

## Master Aggregation

Let's compare mileage by country by year...🤔

In [None]:
(autos2
   .groupby('year')
   .mean()
)

In [None]:
(autos2
   .groupby('year')
   .mean(numeric_only=True)
)

In [None]:
# watch order of column filtering/aggregation
(autos2
   .groupby('year')
   [['comb08', 'speeds']]
   .mean()
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('pandas1book') 
sns.set_context('talk')
plt.plot(range(10))

In [None]:
(autos2
   .groupby('year')
   [['comb08', 'speeds']]
   .mean()
   .plot()
)

In [None]:
(autos2
   .groupby('year')
   [['comb08', 'speeds', 'highway08']]
   #.mean()
   #.median()
   .quantile(.3)
   #.std()
   #.var()
   .plot()
)

In [None]:
# add country
(autos2
 .assign(country=autos2.make.apply(country))
 .groupby(['year', 'country'])
 [['comb08', 'speeds']]
 .mean()
)

In [None]:
# can go deeper and apply multiple aggregates
def second_to_last(ser):
    """Return the next-to-last value of *ser*.

    Meant as a custom aggregate for ``.agg``.  A group with fewer than
    two rows has no such value, so NaN is returned instead of letting
    ``.iloc[-2]`` raise ``IndexError`` in the middle of the aggregation.
    """
    if len(ser) < 2:
        return float('nan')
    return ser.iloc[-2]

(autos2
 .assign(country=autos2.make.apply(country))
 .groupby(['year', 'country'])
 [['comb08', 'speeds']]
 .agg(['min', 'mean', second_to_last])
)

In [None]:
# back to simpler example, adding plots
(autos2
 .assign(country=autos2.make.apply(country))
 .groupby(['year', 'country'])
 [['comb08', 'speeds']]
 .mean()
 #.plot()
)

In [None]:
(autos2
 .assign(country=autos2.make.apply(country))
 .groupby(['year', 'country'])
 [['comb08', 'speeds']]
 .mean()
 .unstack()
)

In [None]:
(autos2
 .assign(country=autos2.make.apply(country))
 .groupby(['year', 'country'])
 [['city08', 'speeds']]
 .mean()
 #.std()
 .unstack()
 .city08
 .plot()
 .legend(bbox_to_anchor=(1,1))
)

In [None]:
# smooth it out a bit w/ rolling
(autos2
 .assign(country=autos2.make.apply(country))
 .groupby(['year', 'country'])
 [['city08', 'speeds']]
 .mean()
 .unstack()
 .city08
 .rolling(3)
 .mean()
 .plot()
 .legend(bbox_to_anchor=(1,1))
)

In [None]:
# One more example of cleaning up with Matplotlib
makes = ['Tesla', 'Honda', 'Toyota', 'Ford']
(autos
 .query('make.isin(@makes)')
 ##.loc[autos.make.isin(makes)]
 .groupby(['year', 'make'])
 .city08
 .mean()
 .unstack()
 .loc[:, makes]
 .plot()
)

## Aggregation Exercise
* Find the median *city08* by *make*
* Plot the above
* Filter out "Toyota", "Honda", "Chevrolet", and "Porsche" and find the median *city08* by *make*
* Plot the above
* Find the max, mean, and min *city08* by decade

In [None]:
(autos
 .assign(decade=autos.year//10)
 .groupby('decade')
 .city08
 .agg(['min', 'mean', 'max'])
.plot()
)

## Example of Fancy Plot

In [None]:
# Line chart of mean city mileage per year for four makes, with the
# default chrome stripped and hand-placed labels.
sns.reset_defaults()
makes = ['Tesla', 'Honda', 'Toyota', 'Ford']
colors = ['#e3120b', '#76725e', '#b3b09e', '#d2d0c4']  # one per make, same order
fig, ax = plt.subplots(figsize=(6,4), dpi=100)
ax = (autos
 #.query('make.isin(@makes)')
 .loc[autos.make.isin(makes)]
 .groupby(['year', 'make'])
 .city08
 .mean()
 .unstack()
 .loc[:, makes]  # column order must match `colors`
 .plot(color=colors, legend=False, linewidth=3, ax=ax)
)
plt.rcParams["font.family"] = "Roboto"
plt.grid(axis='y')
plt.suptitle('Annual City Mileage', ha='left', x=.12)
# plain loop rather than a list comprehension: the calls are made only
# for their side effect, so there is no list worth building
for side in ['top', 'left', 'right']:
    ax.spines[side].set_visible(False)
ax.tick_params(left=False) # hide ticks
ax.set_xlabel('') # clear x label
ax.set_xticks(minor=True, ticks=range(1984,2020))
# set positions and labels for major ticks
ax.set_xticks(ticks=range(1985,2019,5))
ax.set_xticklabels(['1985', '90', '95', '2000', '05', '10', '2015'])
ax.set_yticks(ticks=range(0,121,20))
ax.set_yticklabels([]) # hide left hand side
for label in range(20,121,20): # my own vertically shifted y-labels
    ax.text(2022, label+3, f'{label}', color=colors[-3], ha='right')
for label, pos, color in zip(makes, [(2015, 110), # label makes
                              (2010, 33),
                              (1986, 20.5),
                              (1995, 10),
                             ], colors):
    ax.text(*pos, label, color=color, ha='left')
_ = ax.text(1982, -20, 'Source: fueleconomy.gov', ha='left')

In [None]:
fig

In [None]:
# a glorious function
def tweak_autos(autos):
    """Return a cleaned, memory-efficient copy of the raw vehicles frame.

    Selects the columns of interest, fills missing ``cylinders`` and
    ``displ`` with 0, makes ``drive`` and ``make`` categorical, derives
    ``automatic``/``speeds`` from ``trany`` and ``ffs`` from ``eng_dscr``,
    parses ``createdOn`` into America/New_York time and downcasts the
    integer columns to small pyarrow-backed types.  ``trany`` and
    ``eng_dscr`` are left out of the result.
    """
    raw_columns = [
        'city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive',
        'eng_dscr', 'fuelCost08', 'make', 'model', 'trany', 'range',
        'createdOn', 'year',
    ]
    compact_dtypes = {
        'highway08': 'int8[pyarrow]',
        'city08': 'int16[pyarrow]',
        'comb08': 'int16[pyarrow]',
        'fuelCost08': 'int16[pyarrow]',
        'range': 'int16[pyarrow]',
        'year': 'int16[pyarrow]',
        'make': 'category',
    }
    output_columns = [
        'city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive',
        'fuelCost08', 'make', 'model', 'range', 'createdOn', 'year',
        'automatic', 'speeds', 'ffs',
    ]
    # zone abbreviations are swapped for numeric offsets so the %z
    # directive in the format below can parse them
    zone_to_offset = {' EDT': ' -0400', ' EST': ' -0500'}

    return (
        autos
        .loc[:, raw_columns]
        .assign(
            cylinders=autos.cylinders.fillna(0).astype('int8[pyarrow]'),
            displ=autos.displ.fillna(0).astype('float32[pyarrow]'),
            drive=autos.drive.fillna('Other').astype('category'),
            automatic=autos.trany.astype(str).str.contains('Auto'),
            # first run of digits in trany is the gear count; 20 marks "none found"
            speeds=(
                autos.trany
                .astype(str)
                .str.extract(r'(\d+)')
                .fillna('20')
                .astype('int8[pyarrow]')
            ),
            createdOn=(
                pd.to_datetime(
                    autos.createdOn.replace(zone_to_offset, regex=True),
                    format='%a %b %d %H:%M:%S %z %Y',
                    utc=True,
                )
                .dt.tz_convert('America/New_York')
            ),
            ffs=autos.eng_dscr.str.contains('FFS'),
        )
        .astype(compact_dtypes)
        .loc[:, output_columns]
    )

tweak_autos(autos)

## Summary

* Correct types save space and enable convenient math, string, and date functionality
* Chaining operations will:
   * Make code readable
   * Remove bugs
   * Make debugging easier
* Don't mutate (there's no point). Embrace chaining.
* ``.apply`` is slow for math
* Aggregations are powerful. Play with them until they make sense
* Upcoming courses
* https://store.metasnake.com

Follow me on Twitter ``@__mharrison__``

Book giveaway!


In [None]:
import random
random.randrange(1,11)