In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
# Load seaborn's example "titanic" dataset into a DataFrame; every later cell reads `df`.
df = sns.load_dataset("titanic")

# Preview the first three rows to see the available columns.
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


### Filtering

In [None]:
# Two alternative passenger profiles combined with `or`; the adjacent string
# literals are concatenated into a single query expression.
df.query(
    '(pclass == 3 and embarked == "S" and sex=="male")'
    ' or '
    '(pclass==2 and embarked == "C" and sex=="female")'
).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
12,0,3,male,20.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# NOTE: `in [0,10]` is list membership (age equal to 0 or to 10), not a range test;
# a range would be written as '0 <= age <= 10'.
df.query('age in [0,10]')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
419,0,3,female,10.0,0,2,24.15,S,Third,child,False,,Southampton,no,False
819,0,3,male,10.0,3,2,27.9,S,Third,child,False,,Southampton,no,False


In [None]:
# Number of passengers whose deck is known (non-missing).
df['deck'].notna().sum()

203

In [None]:
# `deck == deck` keeps only rows where deck is not missing, because a missing
# value never compares equal to itself; this is the query() spelling of notnull().
df.query('deck == deck').shape

(203, 15)

### Aggregations

In [None]:
# Named aggregation: each keyword becomes one output column. Both accepted
# spellings are shown on purpose -- pd.NamedAgg(column=..., aggfunc=...) and the
# equivalent (column, aggfunc) tuple.
(
    df.groupby('sex')
      .agg(
          min_fare=pd.NamedAgg(column='fare', aggfunc='min'),
          max_age=('age', 'max'),
          # The aggregation is named by string rather than passed as np.mean:
          # newer pandas emits a FutureWarning for NumPy callables in agg and
          # recommends the string name to keep the current behaviour.
          average_fare=pd.NamedAgg(column='fare', aggfunc='mean'),
      )
)

Unnamed: 0_level_0,min_fare,max_age,average_fare
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,6.75,63.0,44.479818
male,0.0,80.0,25.523893


In [None]:
# Several aggregations per column. The lambda receives the group's `fare`
# Series and uses its index to look up another column (embark_town) in `df`.
(
    df.groupby('sex')
      .agg({
          'age': ['mean', 'max', 'min', 'std', 'skew'],
          'fare': [
              'first',
              'last',
              'size',
              lambda fares: df.loc[fares.index, 'embark_town'].nunique(),
          ],
      })
)

Unnamed: 0_level_0,age,age,age,age,age,fare,fare,fare,fare
Unnamed: 0_level_1,mean,max,min,std,skew,first,last,size,<lambda_0>
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
female,27.915709,63.0,0.75,14.110146,0.206097,71.2833,23.45,314,3.0
male,30.726645,80.0,0.42,14.678201,0.475318,7.25,7.75,577,3.0


In [None]:
# Conditional aggregation: for the all-male group return the mean age, else 0.
# The lambda receives the group's `fare` Series; its index is used to look up
# the matching rows of `df`.
(
    df.groupby('sex')
      .agg({
          'age': ['mean', 'max', 'min', 'std', 'skew'],
          'fare': [
              'mean',
              # Compare element-wise first, then reduce with .all().
              # Reducing first (`sex.all() == 'male'`) compares a truthiness
              # reduction with a string, which is False whenever .all()
              # returns a bool.
              lambda fares: df.loc[fares.index, 'age'].mean()
                  if (df.loc[fares.index, 'sex'] == 'male').all()
                  else 0,
          ],
      })
)

Unnamed: 0_level_0,age,age,age,age,age,fare,fare
Unnamed: 0_level_1,mean,max,min,std,skew,mean,<lambda_0>
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,27.915709,63.0,0.75,14.110146,0.206097,44.479818,0.0
male,30.726645,80.0,0.42,14.678201,0.475318,25.523893,30.726645


In [None]:
# https://stackoverflow.com/questions/51631096/python-pandas-aggregation-with-condition

def my_aggregations(group, isex='male', itown='Southampton'):
    """Summarise one group of passengers as a labelled Series of KPIs.

    Intended for ``DataFrame.groupby(...).apply(my_aggregations)``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain the columns ``sex``, ``embark_town``,
        ``fare``, ``deck`` and ``age``.
    isex : str, default 'male'
        Value of ``sex`` that the sex-specific KPIs are computed for.
    itown : str, default 'Southampton'
        Value of ``embark_town`` used for the town-specific average fare.

    Returns
    -------
    pandas.Series
        Object-dtype Series indexed by ``count_obs``, ``count_males``,
        ``male_ratio``, ``avg_male_fare``, ``avg_southampton_male_fare``,
        ``not_null_fare``, ``not_null_deck`` and ``kpi_1``. The labels are
        fixed even when ``isex`` / ``itown`` are overridden.
    """
    n_obs = len(group)
    is_sex = group['sex'] == isex
    n_sex = int(is_sex.sum())
    # An empty group has no meaningful ratio; avoid ZeroDivisionError.
    ratio = n_sex / n_obs if n_obs else np.nan

    avg_fare = group.loc[is_sex, 'fare'].mean()
    avg_fare_town = group.loc[is_sex & (group['embark_town'] == itown), 'fare'].mean()

    n_fare = int(group['fare'].notna().sum())
    n_deck = int(group['deck'].notna().sum())

    # Plain Python conditional instead of np.where: on scalars np.where returns
    # a 0-d array, which turns kpi_1 into an object column that .round() skips.
    # A NaN ratio compares False, so an empty group gets 0.0.
    kpi = float(group['age'].mean()) if ratio > 0.6 else 0.0

    cols = ['count_obs', 'count_males', 'male_ratio',
            'avg_male_fare', 'avg_southampton_male_fare',
            'not_null_fare', 'not_null_deck',
            'kpi_1']

    # dtype=object keeps the counts as ints next to the float KPIs.
    return pd.Series(
        [n_obs, n_sex, ratio, avg_fare, avg_fare_town, n_fare, n_deck, kpi],
        index=cols,
        dtype=object,
    )

In [None]:
# One row of KPIs per passenger class, rounded to two decimals for display.
dfw = (
    df.groupby('pclass')
      .apply(my_aggregations)
      .reset_index()
      .round(2)
)
dfw

Unnamed: 0,pclass,count_obs,count_males,male_ratio,avg_male_fare,avg_southampton_male_fare,not_null_fare,not_null_deck,kpi_1
0,1,216,122,0.56,67.23,52.95,216,175,0.0
1,2,184,108,0.59,19.74,19.23,184,16,0.0
2,3,491,347,0.71,12.66,13.31,491,12,25.14061971830986


### Having operation ; Filter on grouped object

In [None]:
# Row/column count of the full frame, as a baseline before group-level filtering.
df.shape

(891, 15)

In [None]:
# SQL "HAVING" equivalent: keep the rows of every group that has more than
# 20 members and a mean fare above 10.
df.groupby(['sex', 'pclass', 'embark_town', 'alive']).filter(
    lambda grp: len(grp) > 20 and grp['fare'].mean() > 10
)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,0,3,male,25.0,0,0,7.0500,S,Third,man,True,,Southampton,no,True
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False


### Iterate groups

In [None]:
# Group by sex and passenger class; `g` is reused by later cells.
g = df.groupby(['sex','class'])
# The keys of .groups are the (sex, class) tuples identifying each group.
g.groups.keys()

dict_keys([('female', 'First'), ('female', 'Second'), ('female', 'Third'), ('male', 'First'), ('male', 'Second'), ('male', 'Third')])

In [None]:
# Iterating a GroupBy yields (key, sub-frame) pairs; print a banner, the key,
# a rule, and the first row of each group.
g = df.groupby(['sex', 'class'])
for group_id, data in g:
    print('=' * 100, group_id, '_' * 100, data.head(1), sep='\n')

('female', 'First')
____________________________________________________________________________________________________
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
1         1       1  female  38.0      1      0  71.2833        C  First   

     who  adult_male deck embark_town alive  alone  
1  woman       False    C   Cherbourg   yes  False  
('female', 'Second')
____________________________________________________________________________________________________
   survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
9         1       2  female  14.0      1      0  30.0708        C  Second   

     who  adult_male deck embark_town alive  alone  
9  child       False  NaN   Cherbourg   yes  False  
('female', 'Third')
____________________________________________________________________________________________________
   survived  pclass     sex   age  sibsp  parch   fare embarked  class    who  \
2         1       3  female  26.0  

In [None]:
# Pull out a single group by its key tuple (same order as the groupby columns).
g.get_group(('male', 'Second')).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
17,1,2,male,,0,0,13.0,S,Second,man,True,,Southampton,yes,True
20,0,2,male,35.0,0,0,26.0,S,Second,man,True,,Southampton,no,True
21,1,2,male,34.0,0,0,13.0,S,Second,man,True,D,Southampton,yes,True
33,0,2,male,66.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
70,0,2,male,32.0,0,0,10.5,S,Second,man,True,,Southampton,no,True


In [None]:
# Show which row labels belong to each group.
# The loop variables are deliberately not called `g`: that name holds the
# GroupBy object used by other cells (e.g. g.get_group(...)), and rebinding it
# to a string here would break those cells when they are re-run.
for sex, rows in df.groupby('sex'):
    print(sex)
    print(rows.index)

female
Int64Index([  1,   2,   3,   8,   9,  10,  11,  14,  15,  18,
            ...
            866, 871, 874, 875, 879, 880, 882, 885, 887, 888],
           dtype='int64', length=314)
male
Int64Index([  0,   4,   5,   6,   7,  12,  13,  16,  17,  20,
            ...
            873, 876, 877, 878, 881, 883, 884, 886, 889, 890],
           dtype='int64', length=577)


### Pivot Tables

In [None]:
# With aggfunc='size' the pivot table simply counts rows per (sex, pclass) combination.
pv = df.pivot_table(index=['sex','pclass'],aggfunc='size')
pv

sex     pclass
female  1          94
        2          76
        3         144
male    1         122
        2         108
        3         347
dtype: int64

In [None]:
# Different aggregations per value column (fare: sum and mean, age: std),
# with one column block per sex.
# Aggregations are given by name rather than as np.sum / np.mean: newer pandas
# emits a FutureWarning for NumPy callables and recommends the string names.
pv = df.pivot_table(
    index=['pclass'],
    columns=['sex'],
    values=['fare', 'age'],
    aggfunc={'fare': ['sum', 'mean'],
             'age': ['std']},
)
pv

Unnamed: 0_level_0,age,age,fare,fare,fare,fare
Unnamed: 0_level_1,std,std,mean,mean,sum,sum
sex,female,male,female,male,female,male
pclass,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
1,13.612052,15.13957,106.125798,67.226127,9975.825,8201.5875
2,12.872702,14.793894,21.970121,19.741782,1669.7292,2132.1125
3,12.729964,12.159514,16.11881,12.661633,2321.1086,4393.5865


In [None]:
# Count, mean and standard deviation of fare per (sex, pclass), plus a TOTAL row.
# 'mean' / 'std' are passed by name instead of np.mean / np.std. With the NumPy
# callable the grouped rows get pandas' sample std (ddof=1), while the TOTAL
# margin is computed by np.std itself (population std, ddof=0), so the two are
# not comparable. The string name uses the sample std everywhere.
pv = (
    df[['fare', 'sex', 'pclass']]
    .pivot_table(
        index=['sex', 'pclass'],
        aggfunc=[len, 'mean', 'std'],
        margins=True,
        margins_name="TOTAL",
    )
)
pv

Unnamed: 0_level_0,Unnamed: 1_level_0,len,mean,std
Unnamed: 0_level_1,Unnamed: 1_level_1,fare,fare,fare
sex,pclass,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
female,1.0,94.0,106.125798,74.259988
female,2.0,76.0,21.970121,10.891796
female,3.0,144.0,16.11881,11.690314
male,1.0,122.0,67.226127,77.548021
male,2.0,108.0,19.741782,14.922235
male,3.0,347.0,12.661633,11.681696
TOTAL,,891.0,32.204208,49.665534


In [None]:
# Mean fare for each (sex, pclass) row against each (embark_town, survived) column.
pv = (
    df[['sex', 'pclass', 'embark_town', 'fare', 'survived']]
    .pivot_table(
        values='fare',
        index=['sex', 'pclass'],
        columns=['embark_town', 'survived'],
        aggfunc='mean',
    )
)
pv

Unnamed: 0_level_0,embark_town,Cherbourg,Cherbourg,Queenstown,Queenstown,Southampton,Southampton
Unnamed: 0_level_1,survived,0,1,0,1,0,1
sex,pclass,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,1,28.7125,117.710019,,90.0,151.55,96.743298
female,2,,25.268457,,12.35,18.25,22.272951
female,3,14.653125,14.71722,10.904633,10.084033,21.969018,13.171842
male,1,80.84716,112.197806,90.0,,53.563316,51.832739
male,2,25.418225,25.43335,12.35,,18.997561,20.516667
male,3,8.720842,11.43584,11.84155,12.916667,12.758689,17.03345


### Reshaping

In [None]:
# Survival share within each sex: normalize='index' makes every row sum to 1.
pd.crosstab(
    index=df['sex'],
    columns=df['survived'],
    normalize='index',
).round(2)

survived,0,1
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.26,0.74
male,0.81,0.19


> * Unstack -> decreases the size of the row MultiIndex by moving one of its levels into the columns (operates from "left to right")
* Stack -> increases the size of the row MultiIndex by moving a column level into the rows (operates "top down")
* https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html

In [None]:
# Move rows to columns: level 0 of the row index is `sex`.
pv.unstack(level=0)

embark_town,Cherbourg,Cherbourg,Cherbourg,Cherbourg,Queenstown,Queenstown,Queenstown,Queenstown,Southampton,Southampton,Southampton,Southampton
survived,0,0,1,1,0,0,1,1,0,0,1,1
sex,female,male,female,male,female,male,female,male,female,male,female,male
pclass,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
1,28.7125,80.84716,117.710019,112.197806,,90.0,90.0,,151.55,53.563316,96.743298,51.832739
2,,25.418225,25.268457,25.43335,,12.35,12.35,,18.25,18.997561,22.272951,20.516667
3,14.653125,8.720842,14.71722,11.43584,10.904633,11.84155,10.084033,12.916667,21.969018,12.758689,13.171842,17.03345


In [None]:
# Move columns to rows: level 0 of the column index is `embark_town`.
pv.stack(level=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,survived,0,1
sex,pclass,embark_town,Unnamed: 3_level_1,Unnamed: 4_level_1
female,1,Cherbourg,28.7125,117.710019
female,1,Queenstown,,90.0
female,1,Southampton,151.55,96.743298
female,2,Cherbourg,,25.268457
female,2,Queenstown,,12.35
female,2,Southampton,18.25,22.272951
female,3,Cherbourg,14.653125,14.71722
female,3,Queenstown,10.904633,10.084033
female,3,Southampton,21.969018,13.171842
male,1,Cherbourg,80.84716,112.197806
