# Group by / Aggregation

* [pandas.DataFrame.groupby](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html)

```
DataFrame.groupby(
    by=None, 
    axis=0, 
    level=None, 
    as_index=True, 
    sort=True, 
    group_keys=True, 
    squeeze=NoDefault.no_default,
    observed=False, 
    dropna=True
)

Returns: DataFrameGroupBy (a groupby object that contains information about the groups).
```

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Show floats with two decimals in rendered frames. The fully qualified
# option name is used because the bare 'precision' pattern also matches
# 'styler.format.precision' on newer pandas and raises OptionError.
pd.set_option('display.precision', 2)

In [19]:
# Read the Titanic passenger CSV into `df` and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="./data/titanic.csv")
df.head()

Unnamed: 0,survived,pclass,name,sex,age,siblings_spouses,parents_children,fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.28
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.92
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [15]:
# Print the column names, non-null counts, dtypes and memory usage of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   survived          887 non-null    int64  
 1   pclass            887 non-null    int64  
 2   name              887 non-null    object 
 3   sex               887 non-null    object 
 4   age               887 non-null    float64
 5   siblings_spouses  887 non-null    int64  
 6   parents_children  887 non-null    int64  
 7   fare              887 non-null    float64
dtypes: float64(2), int64(4), object(2)
memory usage: 55.6+ KB


In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,survived,pclass,age,siblings_spouses,parents_children,fare
count,887.0,887.0,887.0,887.0,887.0,887.0
mean,0.39,2.31,29.47,0.53,0.38,32.31
std,0.49,0.84,14.12,1.1,0.81,49.78
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.25,0.0,0.0,7.92
50%,0.0,3.0,28.0,0.0,0.0,14.45
75%,1.0,3.0,38.0,1.0,0.0,31.14
max,1.0,3.0,80.0,8.0,6.0,512.33


In [34]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(887, 8)

In [37]:
# Label the two entries of the shape tuple so the output is self-describing.
pd.Series(dict(zip(('nrow', 'ncol'), df.shape)))

nrow    887
ncol      8
dtype: int64

---

# Group By passenger class

## Single Aggregation

In [11]:
# Median of every numeric column per passenger class.
# numeric_only=True skips the string columns (name, sex); without it
# pandas >= 2.0 raises TypeError instead of silently dropping them.
df.groupby(['pclass']).median(numeric_only=True)

Unnamed: 0_level_0,survived,age,siblings_spouses,parents_children,fare
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,38.5,0.0,0.0,60.29
2,0.0,29.0,0.0,0.0,14.25
3,0.0,24.0,0.0,0.0,8.05


In [18]:
# Line plot of one column against the row index.
# NOTE(review): the original cell left `y` unspecified; 'age' is assumed
# here — confirm the intended column.
fig, ax = plt.subplots(figsize=(10, 6))
# Axes.plot takes x and y positionally; it has no `x=`/`y=` keywords.
ax.plot(df.index, df['age'])
ax.set(xlabel='row index', ylabel='age', title='Age by row index')
plt.show()

RangeIndex(start=0, stop=887, step=1)

In [12]:
# Mean of every numeric column per passenger class.
# numeric_only=True skips the string columns (name, sex); without it
# pandas >= 2.0 raises TypeError instead of silently dropping them.
df.groupby(['pclass']).mean(numeric_only=True)

Unnamed: 0_level_0,survived,age,siblings_spouses,parents_children,fare
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.63,38.79,0.42,0.36,84.15
2,0.47,29.87,0.4,0.38,20.66
3,0.24,25.19,0.62,0.4,13.71


In [13]:
# Sample standard deviation of every numeric column per passenger class.
# numeric_only=True skips the string columns (name, sex); without it
# pandas >= 2.0 raises TypeError instead of silently dropping them.
df.groupby(['pclass']).std(numeric_only=True)

Unnamed: 0_level_0,survived,age,siblings_spouses,parents_children,fare
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.48,14.18,0.61,0.69,78.38
2,0.5,13.76,0.6,0.69,13.42
3,0.43,12.1,1.38,0.89,11.82


## Single aggregation on a specific column

In [25]:
# Mean age per passenger class; selecting with a list (['age']) keeps the
# result a one-column DataFrame rather than a Series.
df.groupby(['pclass'])[['age']].mean()

Unnamed: 0_level_0,age
pclass,Unnamed: 1_level_1
1,38.79
2,29.87
3,25.19


## Multiple Aggregations

In [28]:
# Mean, standard deviation and median of every numeric column per class.
# The string columns (name, sex) are dropped up front because these
# statistics are undefined for them and agg() has no numeric_only switch.
# String aliases replace the NumPy callables, which pandas deprecates as
# agg() arguments; the resulting column labels are unchanged.
df.select_dtypes(include='number').groupby(['pclass']).agg(
    ['mean', 'std', 'median']
)

Unnamed: 0_level_0,survived,survived,survived,age,age,age,siblings_spouses,siblings_spouses,siblings_spouses,parents_children,parents_children,parents_children,fare,fare,fare
Unnamed: 0_level_1,mean,std,median,mean,std,median,mean,std,median,mean,std,median,mean,std,median
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
1,0.63,0.48,1.0,38.79,14.18,38.5,0.42,0.61,0.0,0.36,0.69,0.0,84.15,78.38,60.29
2,0.47,0.5,0.0,29.87,13.76,29.0,0.4,0.6,0.0,0.38,0.69,0.0,20.66,13.42,14.25
3,0.24,0.43,0.0,25.19,12.1,24.0,0.62,1.38,0.0,0.4,0.89,0.0,13.71,11.82,8.05


In [29]:
# Mean, standard deviation and median of age per passenger class.
# String aliases replace the NumPy callables, which pandas deprecates as
# agg() arguments; the resulting column labels are unchanged.
df.groupby(['pclass'])[
    ['age']
].agg(['mean', 'std', 'median'])

Unnamed: 0_level_0,age,age,age
Unnamed: 0_level_1,mean,std,median
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,38.79,14.18,38.5
2,29.87,13.76,29.0
3,25.19,12.1,24.0


In [33]:
# Named aggregation: each keyword is an output column, defined as a
# (source column, aggregation) pair. The 'mean' alias replaces the
# redundant `lambda x: np.mean(x)` wrappers and gives the same values.
df.groupby(['pclass']).agg(
    age_mean=('age', 'mean'),
    fare_mean=('fare', 'mean'),
)

Unnamed: 0_level_0,age_mean,fare_mean
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.79,84.15
2,29.87,20.66
3,25.19,13.71
