### Pandas - pivot_table, crosstab

In [None]:
import pandas as pd
import numpy  as np
import seaborn as sns

# Use the fully-qualified option name. The bare 'precision' pattern is
# ambiguous on newer pandas (it also matches 'styler.format.precision')
# and raises OptionError there; 'display.precision' works on old and new.
pd.set_option('display.precision', 4)

In [None]:
# Nine-row toy frame: three string key columns (A, B, C) and two integer
# measure columns (D, E).
df = pd.DataFrame(dict(
    A=["Alice"] * 5 + ["Bob"] * 4,
    B=["one"] * 3 + ["two"] * 2 + ["one"] * 2 + ["two"] * 2,
    C=["small", "large", "large", "small", "small",
       "large", "small", "small", "large"],
    D=[1, 2, 2, 3, 3, 4, 5, 6, 7],
    E=list(range(10, 100, 10)),
))

df


- `index` - keys to group by on the pivot table's row index
- default aggregation function - the group mean

In [None]:
# Default aggfunc is the group mean. B and C hold strings, so restrict the
# aggregation to the numeric columns explicitly: pandas >= 2.0 raises
# instead of silently dropping columns that cannot be averaged.
df.pivot_table(index=['A'], values=['D', 'E'])

In [None]:
# Group sums. The string alias 'sum' selects the same groupby reduction
# that np.sum was mapped to, without the FutureWarning pandas >= 2.1 emits
# when a NumPy callable is passed as aggfunc.
df.pivot_table(index=['A'], aggfunc='sum')

In [None]:
# Row count per person: len is applied to each non-key column.
pd.pivot_table(df, index=['A'], aggfunc=len)

- `values` - the column(s) to aggregate

In [None]:
# Sum only column D per person. 'sum' is the string alias for the reduction
# np.sum mapped to; it avoids the callable-deprecation FutureWarning raised
# by pandas >= 2.1.
df.pivot_table(index=['A'], values='D',
               aggfunc='sum')

In [None]:
# Sum columns D and E per person. The 'sum' alias replaces the np.sum
# callable, which pandas >= 2.1 flags with a FutureWarning.
df.pivot_table(index=['A'], values=['D', 'E'],
               aggfunc='sum')

In [None]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Display the sample frame again for reference.
df

- `columns` - keys to group by on the pivot table's columns

In [None]:
# Spread the values of C across the columns; every remaining column is
# summed. 'sum' replaces the np.sum callable (FutureWarning on
# pandas >= 2.1) with the equivalent string alias.
# NOTE: in a notebook only the last expression of a cell is rendered, so
# this first table is computed but not shown.
df.pivot_table(index=['A'], columns=['C'],
               aggfunc='sum')

# Same layout, restricted to column D.
df.pivot_table(index=['A'], values='D',
               columns=['C'], aggfunc='sum')

- If a dict is passed for `aggfunc`, each key is a column to aggregate and each value is a function or a list of functions

In [None]:
# Per-column aggregation: total of D and average of E for each (A, C) pair.
pd.pivot_table(
    df,
    index=['A'],
    columns=['C'],
    aggfunc={'D': 'sum', 'E': 'mean'},
)

In [None]:
# Several aggregations per column: sum and mean of D, count and len of E.
# 'mean' replaces the np.mean callable (FutureWarning on pandas >= 2.1);
# the resulting column label is 'mean' either way.
df.pivot_table(index=['A'], columns=['C'],
               aggfunc={'D': ['sum', 'mean'],
                        'E': ['count', len]})

### multi-index

In [None]:
# Display the sample frame again before the multi-index examples.
df

- pass a list of keys to `index` to get a hierarchical (multi-level) row index

In [None]:
# Group means on a two-level (A, B) index. C holds strings, so name the
# numeric columns explicitly: pandas >= 2.0 raises instead of silently
# dropping columns that cannot be averaged.
df.pivot_table(index=['A', 'B'], values=['D', 'E'])

In [None]:
# Group sums on the (A, B) index. 'sum' is the string alias for the
# reduction np.sum mapped to and avoids the FutureWarning on pandas >= 2.1.
df.pivot_table(index=['A', 'B'], aggfunc='sum')

In [None]:
# (A, B) rows against C columns, summing the remaining columns.
# 'sum' replaces the np.sum callable, deprecated as aggfunc in pandas >= 2.1.
df.pivot_table(index=['A', 'B'], columns=['C'],
               aggfunc='sum')

In [None]:
# Row index of the pivoted table (built from the A and B keys).
# 'sum' replaces the np.sum callable, deprecated as aggfunc in pandas >= 2.1.
df.pivot_table(index=['A', 'B'], columns=['C'],
               aggfunc='sum').index

In [None]:
# Column index of the pivoted table (aggregated column crossed with C).
# 'sum' replaces the np.sum callable, deprecated as aggfunc in pandas >= 2.1.
df.pivot_table(index=['A', 'B'], columns=['C'],
               aggfunc='sum').columns

In [None]:
# Sum of D only, (A, B) rows against C columns.
# 'sum' replaces the np.sum callable, deprecated as aggfunc in pandas >= 2.1.
df.pivot_table(index=['A', 'B'], values=['D'],
               columns=['C'], aggfunc='sum')

### Margins

In [None]:
# margins=True appends an 'All' row and column holding the aggregate over
# each full row/column. 'sum' replaces the np.sum callable, deprecated as
# aggfunc in pandas >= 2.1.
df.pivot_table(index=['A', 'B'], values=['D'],
               columns=['C'],
               aggfunc='sum', margins=True)

In [None]:
# Count of D entries per (A, B) x C cell, with margin totals; cells with no
# data are filled with 0 rather than left missing.
pd.pivot_table(
    df,
    values=['D'],
    index=['A', 'B'],
    columns=['C'],
    aggfunc=len,
    fill_value=0,
    margins=True,
)

## Tips dataset

In [None]:
# Load the example dataset named "tips" through seaborn and preview its
# first rows.
tips = sns.load_dataset("tips")
tips.head()

In [None]:
# group means - default pivot_table aggregation type
#
# Only the numeric measures are averaged: pandas >= 2.0 raises rather than
# silently dropping columns that have no mean (smoker and time here --
# assumes the standard seaborn tips schema; confirm against tips.dtypes).
# observed=False spells out the current default for categorical keys, which
# pandas >= 2.2 otherwise warns about.
tips.pivot_table(index=['day', 'sex'],
                 values=['size', 'tip', 'total_bill'],
                 observed=False)

In [None]:
# Group means plus an 'All' margin row. The numeric measures are named
# explicitly because pandas >= 2.0 raises on columns that cannot be
# averaged (assumes the standard seaborn tips schema -- confirm against
# tips.dtypes). observed=False keeps the current default for categorical
# keys explicit.
tips.pivot_table(index=['day', 'sex'],
                 values=['size', 'tip', 'total_bill'],
                 margins=True,
                 observed=False)

In [None]:
# Mean total_bill per (day, sex), with an 'All' margin row.
pd.pivot_table(
    tips,
    values='total_bill',
    index=['day', 'sex'],
    margins=True,
)

In [None]:
# Overall mean of total_bill across every row of tips.
tips.total_bill.mean()

In [None]:
# Group sums of the numeric measures. They are listed explicitly because
# pandas >= 2.0 raises on columns that cannot be summed instead of dropping
# them (assumes the standard seaborn tips schema -- confirm against
# tips.dtypes). 'sum' replaces the np.sum callable deprecated as aggfunc in
# pandas >= 2.1; observed=False keeps the current categorical default.
tips.pivot_table(index=['day', 'sex'],
                 values=['size', 'tip', 'total_bill'],
                 aggfunc='sum',
                 observed=False)

In [None]:
# Group sums plus an 'All' margin row. Numeric measures are listed
# explicitly (pandas >= 2.0 raises on columns that cannot be summed;
# assumes the standard seaborn tips schema -- confirm against tips.dtypes).
# 'sum' replaces the deprecated np.sum callable; observed=False keeps the
# current categorical default explicit.
tips.pivot_table(index=['day', 'sex'],
                 values=['size', 'tip', 'total_bill'],
                 aggfunc='sum',
                 margins=True,
                 observed=False)

### crosstab(...)
 - Cross tabulation of two or more factors
 - Default - frequency table

In [None]:
# Display the sample frame again before the crosstab examples.
df

In [None]:
# Frequency table: number of rows for each (A, C) combination.
pd.crosstab(index=df['A'], columns=df['C'])

- normalize - for percentages rather than counts
- if passed ‘all’ or True, will normalize over all values

In [None]:
# Each cell as a fraction of the grand total.
pd.crosstab(index=df['A'], columns=df['C'], normalize=True)

In [None]:
# Row-wise normalization: each row is divided by its own total.

pd.crosstab(index=df['A'], columns=df['C'], normalize='index')

In [None]:
# Column-wise normalization: each column is divided by its own total.

pd.crosstab(index=df['A'], columns=df['C'], normalize='columns')

In [None]:
# Frequency table: number of rows for each (A, B) combination.
pd.crosstab(index=df['A'], columns=df['B'])

In [None]:
# With a third series and an aggregation function: sum of E per (A, B) cell.
# 'sum' replaces the np.sum callable, deprecated as aggfunc in pandas >= 2.1.

pd.crosstab(df.A, df.B, values=df.E, aggfunc='sum')

## Tips dataset

In [None]:
# Preview the first rows of tips again before the crosstab examples.
tips.head()

In [None]:
# Frequency table: number of rows for each (day, time) combination.
pd.crosstab(index=tips['day'], columns=tips['time'])

In [None]:
# equivalent groupby
#
# observed=False states the current default for categorical group keys
# explicitly; pandas >= 2.1 warns when it is left implicit because the
# default is scheduled to change.

(tips.groupby(['day', 'time'], observed=False)['day']
     .count()
     .unstack()
     .fillna(0))

In [None]:
# Without unstack: the same counts as a Series indexed by (day, time).
# observed=False states the current default for categorical group keys
# explicitly; pandas >= 2.1 warns when it is left implicit.

tips.groupby(['day', 'time'], observed=False)['day'].count().fillna(0)

In [None]:
# Same as above, counting the 'time' column instead of 'day'.
# observed=False states the current default for categorical group keys
# explicitly; pandas >= 2.1 warns when it is left implicit.

(tips.groupby(['day', 'time'], observed=False)['time']
     .count()
     .unstack()
     .fillna(0))

In [None]:
# pivot_table spelling of the same table: day rows against time columns,
# applying len to 'time' and filling empty cells with 0.

pd.pivot_table(
    tips,
    index='day',
    columns='time',
    aggfunc={'time': len},
    fill_value=0,
)

In [None]:
# Frequency table with row and column totals labelled "Total".

pd.crosstab(
    index=tips['day'],
    columns=tips['time'],
    margins=True,
    margins_name="Total",
)

In [None]:
# Summarization with crosstab: total tip for each (day, time) cell.
# 'sum' replaces the np.sum callable, deprecated as aggfunc in pandas >= 2.1.

pd.crosstab(tips.day, tips.time,
            values=tips.tip,
            aggfunc='sum')

In [None]:
# Margin totals with values and aggfunc: total_bill summed per (day, time)
# cell, plus row and column totals labelled "Total".
# 'sum' replaces the np.sum callable, deprecated as aggfunc in pandas >= 2.1.

pd.crosstab(tips.day, tips.time,
            values=tips.total_bill,
            aggfunc='sum',
            margins=True, margins_name="Total")