# Python Pandas 13: Crosstab
    
https://www.youtube.com/watch?v=Fo0IMzfcnQE&list=PLeo1K3hjS3uuASpe-1LjfG5f14Bnozjwy&index=16
   
Contingency Table or Cross Tabulation (crosstab)

A type of table in a matrix format that displays the (multivariate) frequency distribution of the variables.

## Functions Covered


### index=Nationality, columns=Handedness, with 'All' column and row
pd.crosstab(df['Nationality'], df['Handedness'], margins=True)


### index=Sex, columns=Handedness, with 'All' column and row
pd.crosstab(df['Sex'], df['Handedness'], margins=True)

### index=Sex, Age, columns=Handedness, Nationality, with 'All' column and row
pd.crosstab( [df['Sex'], df['Age']], [df['Handedness'], df['Nationality']], margins=True)
    
        
### index=Sex, columns=Handedness, generate results as percentages of rows
pd.crosstab( df['Sex'], df['Handedness'], normalize='index')

### index=Sex, columns=Handedness, generate results as percentages of columns
pd.crosstab( df['Sex'], df['Handedness'], normalize='columns')

### index=Sex, columns=Handedness, generate results as percentages of all values
pd.crosstab( df['Sex'], df['Handedness'], normalize='all')

### Calculate the average age for each Sex/Handedness combination (e.g. left-handed females, right-handed males)
pd.crosstab( df['Sex'], df['Handedness'], values=df['Age'], aggfunc = np.mean)

### Using Crosstab with np.array objects
pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])<br/> 
foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])<br/> 
bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])<br/> 
pd.crosstab(foo, bar)<br/> 
pd.crosstab(foo, bar, dropna=False)<br/> 

In [23]:
import pandas as pd

# Sample data set: one record per person, in the column order
# Name, Nationality, Sex, Age, Handedness.
# NOTE: the 'Bangadesh' spelling is kept exactly as-is because the recorded
# outputs in this notebook use it as a label.
df = pd.DataFrame(
    [
        ('Kathy', 'USA', 'Female', 23, 'Right'),
        ('Linda', 'USA', 'Female', 18, 'Right'),
        ('Peter', 'USA', 'Male', 19, 'Right'),
        ('John', 'USA', 'Male', 22, 'Left'),
        ('Fatima', 'Bangadesh', 'Female', 31, 'Left'),
        ('Kadir', 'Bangadesh', 'Male', 25, 'Left'),
        ('Dhaval', 'India', 'Male', 35, 'Left'),
        ('Sudhir', 'India', 'Male', 31, 'Left'),
        ('Parvir', 'India', 'Male', 37, 'Right'),
        ('Yan', 'China', 'Female', 52, 'Right'),
        ('Juan', 'China', 'Female', 58, 'Left'),
        ('Liang', 'China', 'Male', 43, 'Left'),
    ],
    columns=['Name', 'Nationality', 'Sex', 'Age', 'Handedness'],
)

# Last expression of the cell: display the frame.
df

Unnamed: 0,Name,Nationality,Sex,Age,Handedness
0,Kathy,USA,Female,23,Right
1,Linda,USA,Female,18,Right
2,Peter,USA,Male,19,Right
3,John,USA,Male,22,Left
4,Fatima,Bangadesh,Female,31,Left
5,Kadir,Bangadesh,Male,25,Left
6,Dhaval,India,Male,35,Left
7,Sudhir,India,Male,31,Left
8,Parvir,India,Male,37,Right
9,Yan,China,Female,52,Right


Desired Contingency Table (or Crosstabs)


| Handedness  | Left | Right |
|-------------|------|-------|
| Nationality |      |       |
| USA         | 1    | 3     |
| Bangladesh  | 2    | 0     |
| India       | 2    | 1     |
| China       | 2    | 1     |

| Handedness | Left | Right |
|------------|------|-------|
| Sex        |      |       |
| Female     | 2    | 3     |
| Male       | 5    | 2     |


### index=Nationality, columns=Handedness, with 'All' column and row
'All' column and row are added because:  margins=True

In [6]:
# Count rows per Nationality (rows) and Handedness (columns);
# margins=True appends the 'All' totals row and column.
pd.crosstab(
    index=df['Nationality'],
    columns=df['Handedness'],
    margins=True,
)

Handedness,Left,Right,All
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bangadesh,2,0,2
China,2,1,3
India,2,1,3
USA,1,3,4
All,7,5,12


### index=Sex, columns=Handedness, with 'All' column and row
'All' column and row are added because:  margins=True

In [5]:
# Count rows per Sex (rows) and Handedness (columns);
# margins=True appends the 'All' totals row and column.
pd.crosstab(
    index=df['Sex'],
    columns=df['Handedness'],
    margins=True,
)

Handedness,Left,Right,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,2,3,5
Male,5,2,7
All,7,5,12


### index=Sex, Age, columns=Handedness, Nationality, with 'All' column and row
'All' column and row are added because:  margins=True

In [35]:
# Two-level row index (Sex, Age) against two-level columns
# (Handedness, Nationality); margins=True appends the 'All' totals.
pd.crosstab(
    index=[df['Sex'], df['Age']],
    columns=[df['Handedness'], df['Nationality']],
    margins=True,
)

Unnamed: 0_level_0,Handedness,Left,Left,Left,Left,Right,Right,Right,All
Unnamed: 0_level_1,Nationality,Bangadesh,China,India,USA,China,India,USA,Unnamed: 9_level_1
Sex,Age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Female,18.0,0,0,0,0,0,0,1,1
Female,23.0,0,0,0,0,0,0,1,1
Female,31.0,1,0,0,0,0,0,0,1
Female,52.0,0,0,0,0,1,0,0,1
Female,58.0,0,1,0,0,0,0,0,1
Male,19.0,0,0,0,0,0,0,1,1
Male,22.0,0,0,0,1,0,0,0,1
Male,25.0,1,0,0,0,0,0,0,1
Male,31.0,0,0,1,0,0,0,0,1
Male,35.0,0,0,1,0,0,0,0,1


### index=Sex, columns=Handedness, generate results as percentages of rows
Proportions (fractions between 0 and 1, not 0–100 percentages) are calculated because normalize='index' divides each cell by its row total, so every row sums to 1.

In [30]:
# Sex (rows) by Handedness (columns), normalized over each row.
pd.crosstab(
    index=df['Sex'],
    columns=df['Handedness'],
    normalize='index',
)

Handedness,Left,Right
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.4,0.6
Male,0.714286,0.285714


### index=Sex, columns=Handedness, generate results as percentages of columns
Proportions (fractions between 0 and 1, not 0–100 percentages) are calculated because normalize='columns' divides each cell by its column total, so every column sums to 1.

In [32]:
# Sex (rows) by Handedness (columns), normalized over each column.
pd.crosstab(
    index=df['Sex'],
    columns=df['Handedness'],
    normalize='columns',
)

Handedness,Left,Right
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.285714,0.6
Male,0.714286,0.4


### index=Sex, columns=Handedness, generate results as percentages of all values
Proportions (fractions between 0 and 1, not 0–100 percentages) are calculated because normalize='all' divides each cell by the grand total, so all cells together sum to 1.

In [33]:
# Sex (rows) by Handedness (columns), normalized over all values.
pd.crosstab(
    index=df['Sex'],
    columns=df['Handedness'],
    normalize='all',
)

Handedness,Left,Right
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.166667,0.25
Male,0.416667,0.166667


### Calculate the average age for each Sex/Handedness combination (e.g. left-handed females, right-handed males)


In [34]:
import numpy as np

# Mean Age for every Sex (rows) x Handedness (columns) cell.
# The aggregation is given as the string 'mean' instead of np.mean: recent
# pandas releases (2.1+) emit a FutureWarning when a NumPy reducer is passed
# as aggfunc, and the string form is the documented way to keep the same
# result.
pd.crosstab(
    df['Sex'],
    df['Handedness'],
    values=df['Age'],
    aggfunc='mean',
)

Handedness,Left,Right
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,44.5,31.0
Male,31.2,28.0


### Using Crosstab with np.array objects

In [38]:
# Three parallel label arrays with 11 entries each; they are cross-tabulated
# position by position below.
a = np.array(["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3, dtype=object)

b = np.array(
    [
        "one", "one", "one", "two",
        "one", "one", "one", "two",
        "two", "two", "one",
    ],
    dtype=object,
)

c = np.array(
    [
        "dull", "dull", "shiny", "dull",
        "dull", "shiny", "shiny", "dull",
        "shiny", "shiny", "shiny",
    ],
    dtype=object,
)

# Plain arrays carry no names, so rownames/colnames label the axes.
pd.crosstab(
    index=a,
    columns=[b, c],
    rownames=['a'],
    colnames=['b', 'c'],
)

b,one,one,two,two
c,dull,shiny,dull,shiny
a,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,1,2,1,0
foo,2,2,1,2


In [40]:
# Categories 'c' and 'f' are declared but never occur in the data. With the
# default dropna=True they are left out of the result; pass dropna=False to
# keep categories that have no data.

foo = pd.Categorical(
    ['a', 'b'],
    categories=['a', 'b', 'c'],
)
bar = pd.Categorical(
    ['d', 'e'],
    categories=['d', 'e', 'f'],
)
pd.crosstab(index=foo, columns=bar)

col_0,d,e
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,0
b,0,1


In [44]:
# Same cross-tabulation, but dropna=False keeps categories with no data.
pd.crosstab(
    index=foo,
    columns=bar,
    dropna=False,
)

col_0,d,e,f
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,0
b,0,1,0
c,0,0,0
