### Pandas - pivot_table, crosstab

In [1]:
import pandas as pd
import numpy  as np
import seaborn as sns

# Show floats with 4 decimal places in rendered frames.
# The fully qualified key is required: the bare 'precision' pattern also matches
# 'styler.format.precision' on current pandas and raises OptionError.
pd.set_option('display.precision', 4)

## Functions Covered

### Pivot_Table

df.pivot_table(index=['A'], aggfunc=np.sum)

df.pivot_table(index=['A'], values=['D','E'], aggfunc=np.sum)

df.pivot_table(index=['A'], columns=['C'], aggfunc={'D': ['sum', np.mean], 'E': ['count', len]})

df.pivot_table(index=['A', 'B'], aggfunc=np.sum)

df.pivot_table(index=['A', 'B'], values=['D'], columns=['C'], aggfunc=np.sum, margins=True)

### CrossTab

 - Compute a simple cross-tabulation of two (or more) factors.
 - By default computes a frequency table of the factors unless an array of values and an aggregation function are passed.

pd.crosstab(df.A, df.B, values=df.E, aggfunc=np.sum)


In [2]:
# Sample data used throughout the notebook, written one record per row:
# A = person, B and C = categorical factors, D and E = integer measures.
sample_rows = [
    ("Alice", "one", "small", 1, 10),
    ("Alice", "one", "large", 2, 20),
    ("Alice", "one", "large", 2, 30),
    ("Alice", "two", "small", 3, 40),
    ("Alice", "two", "small", 3, 50),
    ("Bob", "one", "large", 4, 60),
    ("Bob", "one", "small", 5, 70),
    ("Bob", "two", "small", 6, 80),
    ("Bob", "two", "large", 7, 90),
]
df = pd.DataFrame(sample_rows, columns=["A", "B", "C", "D", "E"])

df


Unnamed: 0,A,B,C,D,E
0,Alice,one,small,1,10
1,Alice,one,large,2,20
2,Alice,one,large,2,30
3,Alice,two,small,3,40
4,Alice,two,small,3,50
5,Bob,one,large,4,60
6,Bob,one,small,5,70
7,Bob,two,small,6,80
8,Bob,two,large,7,90


- index - Keys to group by on the pivot table index
- default aggregation function - group means

In [3]:
# Default aggregation is the group mean. The numeric columns are named explicitly
# because B and C hold strings: newer pandas no longer drops such columns silently
# and fails when asked to average them.
df.pivot_table(index=['A'], values=['D', 'E'])

Unnamed: 0_level_0,D,E
A,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,2.2,30
Bob,5.5,75


In [4]:
# Group totals of the numeric columns per value of A.
# - values is explicit so the string columns B and C are not aggregated
#   (newer pandas would otherwise concatenate them instead of dropping them).
# - 'sum' replaces np.sum: passing NumPy callables as aggfunc is deprecated.
df.pivot_table(index=['A'], values=['D', 'E'], aggfunc='sum')

Unnamed: 0_level_0,D,E
A,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,11,150
Bob,22,300


In [5]:
# Number of rows in each A group, reported for every non-key column
# (len works on any dtype, so B and C are included).
df.pivot_table(
    aggfunc=len,
    index='A',
)

Unnamed: 0_level_0,B,C,D,E
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alice,5,5,5,5
Bob,4,4,4,4


- values - the column(s) to aggregate

In [6]:
# Total of the single column D per value of A.
# The string alias 'sum' replaces np.sum (NumPy callables as aggfunc are deprecated).
df.pivot_table(index=['A'], values='D',
               aggfunc='sum')

Unnamed: 0_level_0,D
A,Unnamed: 1_level_1
Alice,11
Bob,22


In [7]:
# Totals of several columns at once: pass a list to values.
# The string alias 'sum' replaces np.sum (NumPy callables as aggfunc are deprecated).
df.pivot_table(index=['A'], values=['D', 'E'],
               aggfunc='sum')

Unnamed: 0_level_0,D,E
A,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,11,150
Bob,22,300


In [8]:
# Column dtypes: A, B and C hold strings, D and E hold integers.
df.dtypes

A    object
B    object
C    object
D     int64
E     int64
dtype: object

In [9]:
# Show the sample frame again for reference before the `columns` examples.
df

Unnamed: 0,A,B,C,D,E
0,Alice,one,small,1,10
1,Alice,one,large,2,20
2,Alice,one,large,2,30
3,Alice,two,small,3,40
4,Alice,two,small,3,50
5,Bob,one,large,4,60
6,Bob,one,small,5,70
7,Bob,two,small,6,80
8,Bob,two,large,7,90


- columns - Specify keys to group by on the pivot table column

In [10]:
# Rows are grouped by A and the values of C are spread across the columns.
# - values is explicit so the string column B is not aggregated.
# - 'sum' replaces np.sum: passing NumPy callables as aggfunc is deprecated.
df.pivot_table(index=['A'], columns=['C'],
               values=['D', 'E'], aggfunc='sum')

Unnamed: 0_level_0,D,D,E,E
C,large,small,large,small
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Alice,4,7,50,100
Bob,11,11,150,150


df.pivot_table(index=['A'], values='D',
               columns=['C'], aggfunc='sum')  # sum of D per (A, C); 'sum' replaces the deprecated np.sum callable

- If a dict is passed for aggfunc, each key is a column to aggregate and each value is the function (or list of functions) to apply to that column

In [11]:
# Per-column aggregation: total of D and average of E for each (A, C) pair.
agg_by_column = {'D': 'sum', 'E': 'mean'}
df.pivot_table(index='A', columns='C', aggfunc=agg_by_column)

Unnamed: 0_level_0,D,D,E,E
C,large,small,large,small
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Alice,4,7,25.0,33.3333
Bob,11,11,75.0,75.0


In [12]:
# Several aggregations per column: the dict values are lists of functions.
# 'mean' replaces np.mean (NumPy callables as aggfunc are deprecated); the
# resulting column label is still 'mean'. len is kept to contrast it with 'count'.
df.pivot_table(
    index=['A'],
    columns=['C'],
    aggfunc={'D': ['sum', 'mean'], 'E': ['count', len]},
)

Unnamed: 0_level_0,D,D,D,D,E,E,E,E
Unnamed: 0_level_1,mean,mean,sum,sum,count,count,len,len
C,large,small,large,small,large,small,large,small
A,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Alice,2.0,2.3333,4.0,7.0,2,3,2,3
Bob,5.5,5.5,11.0,11.0,2,2,2,2


### multi-index

In [13]:
# Show the sample frame again for reference before the multi-index examples.
df

Unnamed: 0,A,B,C,D,E
0,Alice,one,small,1,10
1,Alice,one,large,2,20
2,Alice,one,large,2,30
3,Alice,two,small,3,40
4,Alice,two,small,3,50
5,Bob,one,large,4,60
6,Bob,one,small,5,70
7,Bob,two,small,6,80
8,Bob,two,large,7,90


- index can be a list of columns, which produces a multi-level (hierarchical) row index

In [14]:
# Group means for every (A, B) pair. The numeric columns are named explicitly
# because C holds strings: newer pandas no longer drops such columns silently
# and fails when asked to average them.
df.pivot_table(index=['A', 'B'], values=['D', 'E'])

Unnamed: 0_level_0,Unnamed: 1_level_0,D,E
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
Alice,one,1.6667,20
Alice,two,3.0,45
Bob,one,4.5,65
Bob,two,6.5,85


In [15]:
# Group totals for every (A, B) pair.
# - values is explicit so the string column C is not aggregated.
# - 'sum' replaces np.sum: passing NumPy callables as aggfunc is deprecated.
df.pivot_table(index=['A', 'B'], values=['D', 'E'], aggfunc='sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,D,E
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
Alice,one,5,60
Alice,two,6,90
Bob,one,9,130
Bob,two,13,170


In [16]:
# Show the sample frame again for reference.
df

Unnamed: 0,A,B,C,D,E
0,Alice,one,small,1,10
1,Alice,one,large,2,20
2,Alice,one,large,2,30
3,Alice,two,small,3,40
4,Alice,two,small,3,50
5,Bob,one,large,4,60
6,Bob,one,small,5,70
7,Bob,two,small,6,80
8,Bob,two,large,7,90


In [17]:
# Multi-level rows (A, B) with C spread across the columns; only D and E remain
# to aggregate. Combinations that never occur (Alice/two/large) show as NaN.
# 'sum' replaces np.sum (NumPy callables as aggfunc are deprecated).
df.pivot_table(index=['A', 'B'], columns=['C'],
               aggfunc='sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D,E,E
Unnamed: 0_level_1,C,large,small,large,small
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Alice,one,4.0,1.0,50.0,10.0
Alice,two,,6.0,,90.0
Bob,one,4.0,5.0,60.0,70.0
Bob,two,7.0,6.0,90.0,80.0


In [18]:
# The row labels of the pivoted frame form a MultiIndex over (A, B).
# 'sum' replaces np.sum (NumPy callables as aggfunc are deprecated).
df.pivot_table(index=['A', 'B'], columns=['C'],
               aggfunc='sum').index

MultiIndex([('Alice', 'one'),
            ('Alice', 'two'),
            (  'Bob', 'one'),
            (  'Bob', 'two')],
           names=['A', 'B'])

In [19]:
# The column labels form a MultiIndex too: (value column, level of C).
# 'sum' replaces np.sum (NumPy callables as aggfunc are deprecated).
df.pivot_table(index=['A', 'B'], columns=['C'],
               aggfunc='sum').columns

MultiIndex([('D', 'large'),
            ('D', 'small'),
            ('E', 'large'),
            ('E', 'small')],
           names=[None, 'C'])

In [20]:
# Restrict the aggregation to D; passing values as a list keeps 'D' as the
# top level of the column MultiIndex.
# 'sum' replaces np.sum (NumPy callables as aggfunc are deprecated).
df.pivot_table(index=['A', 'B'], values=['D'],
               columns=['C'], aggfunc='sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D
Unnamed: 0_level_1,C,large,small
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2
Alice,one,4.0,1.0
Alice,two,,6.0
Bob,one,4.0,5.0
Bob,two,7.0,6.0


### Margins

In [21]:
# margins=True appends an 'All' row and column holding the totals.
# 'sum' replaces np.sum (NumPy callables as aggfunc are deprecated).
df.pivot_table(index=['A', 'B'], values=['D'],
               columns=['C'],
               aggfunc='sum', margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D,D
Unnamed: 0_level_1,C,large,small,All
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Alice,one,4.0,1.0,5
Alice,two,,6.0,6
Bob,one,4.0,5.0,9
Bob,two,7.0,6.0,13
All,,15.0,18.0,33


In [22]:
# Row counts per (A, B, C) with 'All' totals; fill_value=0 replaces the
# combinations that never occur with 0.
df.pivot_table(
    values=['D'],
    index=['A', 'B'],
    columns=['C'],
    aggfunc=len,
    fill_value=0,
    margins=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D,D
Unnamed: 0_level_1,C,large,small,All
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Alice,one,2,1,3
Alice,two,0,2,2
Bob,one,1,1,2
Bob,two,1,1,2
All,,4,5,9


## Tips dataset

In [23]:
# Load seaborn's "tips" example dataset and preview its first five rows.
tips = sns.load_dataset(name="tips")
tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [24]:
# Mean party size computed on a float64 NumPy array; computed once, then the
# result type is printed and the value is displayed as the cell output.
size_mean = np.asarray(tips['size'], dtype=np.float64).mean()
print(type(size_mean))
size_mean

<class 'numpy.float64'>


2.569672131147541

In [25]:
# group means - default pivot_table aggregation type
# - values lists the numeric columns: the other non-key columns (smoker, time)
#   are not numeric, and newer pandas fails instead of silently dropping them.
# - observed=False states the grouping behaviour for the categorical keys
#   explicitly (seaborn loads day and sex as categoricals), which avoids the
#   pandas FutureWarning about the changing default.

tips.pivot_table(index=['day', 'sex'],
                 values=['size', 'tip', 'total_bill'],
                 observed=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,total_bill
day,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Thur,Male,2.4333,2.9803,18.7147
Thur,Female,2.4688,2.5756,16.7153
Fri,Male,2.1,2.693,19.857
Fri,Female,2.1111,2.7811,14.1456
Sat,Male,2.6441,3.0839,20.8025
Sat,Female,2.25,2.8018,19.6804
Sun,Male,2.8103,3.2203,21.8872
Sun,Female,2.9444,3.3672,19.8722


In [26]:
# Group means plus an 'All' row with the overall mean of each column.
# - values lists the numeric columns: smoker and time are not numeric, and
#   newer pandas fails instead of silently dropping them.
# - observed=False states the behaviour for the categorical keys explicitly
#   and avoids the pandas FutureWarning about the changing default.
tips.pivot_table(index=['day', 'sex'],
                 values=['size', 'tip', 'total_bill'],
                 margins=True, observed=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,total_bill
day,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Thur,Male,2.4333,2.9803,18.7147
Thur,Female,2.4688,2.5756,16.7153
Fri,Male,2.1,2.693,19.857
Fri,Female,2.1111,2.7811,14.1456
Sat,Male,2.6441,3.0839,20.8025
Sat,Female,2.25,2.8018,19.6804
Sun,Male,2.8103,3.2203,21.8872
Sun,Female,2.9444,3.3672,19.8722
All,,2.5697,2.9983,19.7859


In [27]:
# Mean total_bill per (day, sex) plus the overall mean in the 'All' row.
# observed=False states the behaviour for the categorical keys explicitly and
# avoids the pandas FutureWarning about the changing default.
tips.pivot_table(index=['day', 'sex'], values='total_bill',
                 margins=True, observed=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill
day,sex,Unnamed: 2_level_1
Thur,Male,18.7147
Thur,Female,16.7153
Fri,Male,19.857
Fri,Female,14.1456
Sat,Male,20.8025
Sat,Female,19.6804
Sun,Male,21.8872
Sun,Female,19.8722
All,,19.7859


In [28]:
# Overall mean of total_bill, for comparison with the 'All' margin of the pivot table.
tips.loc[:, 'total_bill'].mean()

19.78594262295082

In [29]:
# Group totals per (day, sex).
# - values lists the numeric columns: smoker and time cannot be summed.
# - 'sum' replaces np.sum: passing NumPy callables as aggfunc is deprecated.
# - observed=False states the behaviour for the categorical keys explicitly.
tips.pivot_table(index=['day', 'sex'],
                 values=['size', 'tip', 'total_bill'],
                 aggfunc='sum', observed=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,total_bill
day,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Thur,Male,73,89.41,561.44
Thur,Female,79,82.42,534.89
Fri,Male,21,26.93,198.57
Fri,Female,19,25.03,127.31
Sat,Male,156,181.95,1227.35
Sat,Female,63,78.45,551.05
Sun,Male,163,186.78,1269.46
Sun,Female,53,60.61,357.7


In [30]:
# Group totals per (day, sex) plus the grand totals in the 'All' row.
# - values lists the numeric columns: smoker and time cannot be summed.
# - 'sum' replaces np.sum: passing NumPy callables as aggfunc is deprecated.
# - observed=False states the behaviour for the categorical keys explicitly.
tips.pivot_table(index=['day', 'sex'],
                 values=['size', 'tip', 'total_bill'],
                 aggfunc='sum', margins=True, observed=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,total_bill
day,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Thur,Male,73,89.41,561.44
Thur,Female,79,82.42,534.89
Fri,Male,21,26.93,198.57
Fri,Female,19,25.03,127.31
Sat,Male,156,181.95,1227.35
Sat,Female,63,78.45,551.05
Sun,Male,163,186.78,1269.46
Sun,Female,53,60.61,357.7
All,,627,731.58,4827.77


### crosstab(...)
 - Cross tabulation of two or more factors
 - Default - frequency table  i.e. count

In [31]:
# Show the sample frame again for reference before the crosstab examples.
df

Unnamed: 0,A,B,C,D,E
0,Alice,one,small,1,10
1,Alice,one,large,2,20
2,Alice,one,large,2,30
3,Alice,two,small,3,40
4,Alice,two,small,3,50
5,Bob,one,large,4,60
6,Bob,one,small,5,70
7,Bob,two,small,6,80
8,Bob,two,large,7,90


In [32]:
# Frequency table: number of rows for each (A, C) combination, returned as a DataFrame.
pd.crosstab(index=df['A'], columns=df['C'])

C,large,small
A,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,2,3
Bob,2,2


- normalize - for percentages rather than counts
- if passed ‘all’ or True, will normalize over all values

In [33]:
# Each cell as a fraction of the grand total (all cells sum to 1).
pd.crosstab(
    index=df['A'],
    columns=df['C'],
    normalize=True,
)

C,large,small
A,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,0.2222,0.3333
Bob,0.2222,0.2222


In [34]:
# Row-wise fractions: every row sums to 1.

pd.crosstab(
    index=df['A'],
    columns=df['C'],
    normalize='index',
)

C,large,small
A,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,0.4,0.6
Bob,0.5,0.5


In [35]:
# Column-wise fractions: every column sums to 1.

pd.crosstab(
    index=df['A'],
    columns=df['C'],
    normalize='columns',
)

C,large,small
A,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,0.5,0.6
Bob,0.5,0.4


In [36]:
# Frequency table of A against B.
pd.crosstab(index=df['A'], columns=df['B'])

B,one,two
A,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,3,2
Bob,2,2


In [37]:
# With a third series (values) and an aggregation function, each cell holds the
# aggregate of that series instead of a count - here the sum of E per (A, B).
# 'sum' replaces np.sum (NumPy callables as aggfunc are deprecated).

pd.crosstab(df.A, df.B, values=df.E, aggfunc='sum')

B,one,two
A,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,60,90
Bob,130,170


## Tips dataset

In [38]:
# First rows of the tips dataset, shown again for reference.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [39]:
# Number of bills for each (day, time) combination.
pd.crosstab(index=tips['day'], columns=tips['time'])

time,Lunch,Dinner
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Thur,61,1
Fri,7,12
Sat,0,87
Sun,0,76


In [40]:
# Equivalent groupby: count rows per (day, time), move `time` into the columns,
# then replace the missing combinations with 0 (which makes the counts floats).
# observed=True keeps only the combinations present in the data - seaborn loads
# day and time as categoricals - and stating it explicitly avoids the pandas
# FutureWarning about the changing default.

tips.groupby(['day', 'time'], observed=True)['day'].count().unstack().fillna(0)

time,Lunch,Dinner
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Thur,61.0,1.0
Fri,7.0,12.0
Sat,0.0,87.0
Sun,0.0,76.0


In [41]:
# Without unstack the result is a Series indexed by (day, time).
# observed=True keeps only the combinations present in the data (day and time
# are loaded as categoricals by seaborn) and avoids the pandas FutureWarning
# about the changing default.

tips.groupby(['day', 'time'], observed=True)['day'].count().fillna(0)

day   time  
Thur  Lunch     61
      Dinner     1
Fri   Lunch      7
      Dinner    12
Sat   Dinner    87
Sun   Dinner    76
Name: day, dtype: int64

In [42]:
# Same result when counting the 'time' column instead; unstack returns a DataFrame.
# observed=True keeps only the combinations present in the data (day and time
# are loaded as categoricals by seaborn) and avoids the pandas FutureWarning
# about the changing default.

tips.groupby(['day', 'time'], observed=True)['time'].count().unstack().fillna(0)

time,Lunch,Dinner
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Thur,61.0,1.0
Fri,7.0,12.0
Sat,0.0,87.0
Sun,0.0,76.0


In [43]:
# Equivalent pivot_table: rows are grouped by day, `time` is spread across the
# columns, and len is applied to the 'time' column to count the rows per group.
# fill_value=0 replaces (day, time) combinations with no rows by 0.

tips.pivot_table(index='day', columns='time', 
                 aggfunc={'time':len}, fill_value=0)

Unnamed: 0_level_0,time,time
time,Lunch,Dinner
day,Unnamed: 1_level_2,Unnamed: 2_level_2
Thur,61,1
Fri,7,12
Sat,0,87
Sun,0,76


In [44]:
# Counts per (day, time) with row and column totals labelled "Total".

pd.crosstab(
    index=tips['day'],
    columns=tips['time'],
    margins_name="Total",
    margins=True,
)

time,Lunch,Dinner,Total
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Thur,61,1,62
Fri,7,12,19
Sat,0,87,87
Sun,0,76,76
Total,68,176,244


In [45]:
# Summarization with crosstab: total tip per (day, time) instead of a count.
# Combinations with no rows stay NaN.
# 'sum' replaces np.sum (NumPy callables as aggfunc are deprecated).

pd.crosstab(tips.day, tips.time,
            values=tips.tip,
            aggfunc='sum')

time,Lunch,Dinner
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Thur,168.83,3.0
Fri,16.68,35.28
Sat,,260.4
Sun,,247.39


In [46]:
# Margin totals with values and aggfunc: total bill per (day, time), plus
# "Total" row and column holding the sums across each axis.
# 'sum' replaces np.sum (NumPy callables as aggfunc are deprecated).

pd.crosstab(tips.day, tips.time,
            values=tips.total_bill,
            aggfunc='sum',
            margins=True, margins_name="Total")

time,Lunch,Dinner,Total
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Thur,1077.55,18.78,1096.33
Fri,89.92,235.96,325.88
Sat,,1778.4,1778.4
Sun,,1627.16,1627.16
Total,1167.47,3660.3,4827.77
