# Fast Pandas

In [2]:
import numpy as np
import logging
import rich
import os
import sys
import pandas as pd
import seaborn as sns
import pytest

In [73]:
df = (sns.load_dataset('flights'))

In [74]:
df.shape

(144, 3)

In [75]:
df.head()

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121


## Iterating groups

In [10]:
def groupby_in_forloop(d):
    grouped = d.groupby("month")
    result = []
    for name, group in grouped :
        result += [[name, sum(group.passengers)]]
    
    result = pd.DataFrame(result)
    result.columns = ["month", "sum_passengers"]
    return result

In [43]:
def groupby_aggregation(d):
    grouped = d.groupby("month")
    
    result = grouped.agg({"passengers":"sum"})
    
    result = result.reset_index()
    result.columns = ["month", "sum_passengers"]
    return result

In [65]:
def groupby_aggregation_2(d):
    grouped = d.groupby("month")
    
    result = grouped.sum()
    
    result = result.reset_index().drop(columns=["year"])
    result.columns = ["month", "sum_passengers"]
    return result

In [76]:
def iterate_df_in_forloop(d):
    result = []
    for m in d.month.unique():
        current = d.loc[lambda d: (d.month == m)]
        result += [[m, sum(current.passengers)]]
    
    result = pd.DataFrame(result)
    result.columns = ["month", "sum_passengers"]
   
    return result

In [64]:
def groupby_apply(d):
    grouped = d.groupby("month")
    
    result = grouped.apply(lambda d: sum(d.passengers))
    
    result = pd.DataFrame(result).reset_index()
    result.columns = ["month", "sum_passengers"]

    return result

In [9]:
%%timeit
iterate_df_in_forloop(df)

3.56 ms ± 34.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
%%timeit
groupby_in_forloop(df)

1.27 ms ± 9.11 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [49]:
%%timeit
groupby_aggregation(df)

1.41 ms ± 1.96 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [55]:
%%timeit
groupby_apply(df)

2.03 ms ± 122 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [67]:
%%timeit
groupby_aggregation_2

20.5 ns ± 0.0283 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [None]:
pd.testing.assert_frame_equal(
    groupby_in_forloop(df), 
    iterate_df_in_forloop(df))

In [48]:
# grouping and resetting index results in the index column being converted to Categorical
pd.testing.assert_frame_equal(
    groupby_in_forloop(df), 
    groupby_aggregation(df), 
    check_dtype=False,
    check_categorical=False
)

In [58]:
pd.testing.assert_frame_equal(
    groupby_in_forloop(df),
    groupby_apply(df),
    check_dtype=False,
    check_categorical=False
)

In [72]:
pd.testing.assert_frame_equal(
    groupby_in_forloop(df),
    groupby_aggregation_2(df),
    check_dtype=False,
    check_categorical=False
)