# Aggregations in Groups of Rows
**Problem statement:** Given a dataset with multiple columns, you'd like to group by one (or more) columns and apply an aggregation function to these groups. 

In [20]:
import pandas as pd
import seaborn as sns
df = (sns.load_dataset('flights'))

In [4]:
df.shape

(144, 3)

In [5]:
df.head()

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121


In [6]:
def groupby_in_forloop(d):
    grouped = d.groupby("month")
    result = []
    for name, group in grouped :
        result += [[name, sum(group.passengers)]]
    
    result = pd.DataFrame(result)
    result.columns = ["month", "sum_passengers"]
    return result

In [7]:
def groupby_aggregation(d):
    grouped = d.groupby("month")
    
    result = grouped.agg({"passengers":"sum"})
    
    result = result.reset_index()
    result.columns = ["month", "sum_passengers"]
    return result

In [8]:
def groupby_aggregation_2(d):
    grouped = d.groupby("month")
    
    result = grouped.sum()
    
    result = result.reset_index().drop(columns=["year"])
    result.columns = ["month", "sum_passengers"]
    return result

In [9]:
def iterate_df_in_forloop(d):
    result = []
    for m in d.month.unique():
        current = d.loc[lambda d: (d.month == m)]
        result += [[m, sum(current.passengers)]]
    
    result = pd.DataFrame(result)
    result.columns = ["month", "sum_passengers"]
   
    return result

In [10]:
def groupby_apply(d):
    grouped = d.groupby("month")
    
    result = grouped.apply(lambda d: sum(d.passengers))
    
    result = pd.DataFrame(result).reset_index()
    result.columns = ["month", "sum_passengers"]

    return result

In [11]:
%%timeit
iterate_df_in_forloop(df)

6.26 ms ± 67.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
%%timeit
groupby_in_forloop(df)

2.55 ms ± 71.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
%%timeit
groupby_aggregation(df)

2.7 ms ± 385 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
%%timeit
groupby_apply(df)

3.26 ms ± 71.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
%%timeit
groupby_aggregation_2

28.8 ns ± 0.401 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [16]:
pd.testing.assert_frame_equal(
    groupby_in_forloop(df), 
    iterate_df_in_forloop(df))

In [17]:
# grouping and resetting index results in the index column being converted to Categorical
pd.testing.assert_frame_equal(
    groupby_in_forloop(df), 
    groupby_aggregation(df), 
    check_dtype=False,
    check_categorical=False
)

In [18]:
pd.testing.assert_frame_equal(
    groupby_in_forloop(df),
    groupby_apply(df),
    check_dtype=False,
    check_categorical=False
)

In [19]:
pd.testing.assert_frame_equal(
    groupby_in_forloop(df),
    groupby_aggregation_2(df),
    check_dtype=False,
    check_categorical=False
)