# Efficient Memory Use in Pandas

In [1]:
import pandas as pd
import numpy as np

## Create Our Data

In [2]:
def get_dataset(size, seed=None):
    """Build a synthetic DataFrame of ``size`` random player records.

    Columns:
        position: one of 'left', 'middle', 'right'.
        age:      integer drawn from 1 to 49 inclusive.
        team:     one of 'red', 'blue', 'yellow', 'green'.
        win:      'yes' or 'no'.
        prob:     float drawn uniformly from [0, 1).

    Args:
        size: Number of rows to generate.
        seed: Optional seed. When given, a private ``np.random.RandomState``
            is used, so the same seed always yields the same frame and the
            global NumPy random state is left untouched. When ``None`` (the
            default) the global NumPy random state is used.

    Returns:
        A new ``pd.DataFrame`` with ``size`` rows and the five columns above.
    """
    # RandomState exposes the same choice/randint/uniform API as the
    # module-level functions, so both branches share the code below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Dict entries are evaluated top to bottom, which keeps the draw order
    # position -> age -> team -> win -> prob.
    return pd.DataFrame({
        'position': rng.choice(['left', 'middle', 'right'], size),
        'age': rng.randint(1, 50, size),
        'team': rng.choice(['red', 'blue', 'yellow', 'green'], size),
        'win': rng.choice(['yes', 'no'], size),
        'prob': rng.uniform(0, 1, size),
    })

In [None]:
df = get_dataset(1_000_000)

In [None]:
df.info()

In [None]:
%timeit df.groupby(['team', 'position'])['age'].rank()
%timeit df.groupby(['team', 'position'])['prob'].rank()

In [None]:
df['position'] = df['position'].astype('category')
df['team'] = df['team'].astype('category')
df['win'] = df['win'].map({'yes': True, 'no': False})

In [None]:
df.info()

In [None]:
%timeit df.groupby(['team', 'position'])['age'].rank()
%timeit df.groupby(['team', 'position'])['prob'].rank()

In [None]:
df['age'].max()

In [None]:
df['age'] = df['age'].astype('int8')

In [None]:
df.info()

In [None]:
%timeit df.groupby(['team', 'position'])['age'].rank()
%timeit df.groupby(['team', 'position'])['prob'].rank()

In [None]:
df['prob'] = df['prob'].astype('float32')

In [None]:
df.info()

In [None]:
%timeit df.groupby(['team', 'position'])['age'].rank()
%timeit df.groupby(['team', 'position'])['prob'].rank()

In [3]:
def set_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Narrow the column dtypes of a ``get_dataset``-style frame in place.

    Conversions applied:
        position, team -> ``category``
        win            -> boolean via ``{'yes': True, 'no': False}``
        age            -> ``int8``
        prob           -> ``float32``

    The ``win`` mapping is skipped when the column is already boolean, so
    calling this function again on its own output leaves ``win`` intact
    instead of replacing every value with NaN.

    Args:
        df: Frame with ``position``, ``team``, ``win``, ``age`` and ``prob``
            columns. It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    df['position'] = df['position'].astype('category')
    df['team'] = df['team'].astype('category')

    # Series.map yields NaN for keys missing from the dict; True/False are
    # not keys, so an already-converted column must not be mapped again.
    if not pd.api.types.is_bool_dtype(df['win']):
        df['win'] = df['win'].map({'yes': True, 'no': False})

    # int8 spans -128..127; callers must make sure the ages fit.
    df['age'] = df['age'].astype('int8')
    df['prob'] = df['prob'].astype('float32')

    return df

In [4]:
# Benchmark A: 1M rows with the dtypes get_dataset produces.
# Each timed statement computes a grouped rank and assigns it to a column of df.
df = get_dataset(1_000_000)
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank()

801 ms ± 173 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
950 ms ± 50.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.19 s ± 163 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# Benchmark B: same 1M-row workload after set_dtypes has narrowed the columns.
# NOTE(review): team/position are categorical here; newer pandas releases may
# warn unless groupby is given an explicit observed= — confirm on the
# installed version.
df = get_dataset(1_000_000)
df = set_dtypes(df)
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank()

743 ms ± 154 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
849 ms ± 263 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
777 ms ± 55.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Benchmark C: the default-dtype workload again, scaled up to 10M rows.
df = get_dataset(10_000_000)
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank()

9.29 s ± 1.71 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
8.81 s ± 1.85 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
9.16 s ± 1.03 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Benchmark D: 10M rows with the dtypes narrowed by set_dtypes before timing.
df = get_dataset(10_000_000)
df = set_dtypes(df)
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank()

2.55 s ± 212 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
4.89 s ± 240 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
9.27 s ± 3.03 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
