# SpeedUp Pandas

- Speed Up Your Pandas Dataframes
https://www.youtube.com/watch?v=u4_c2LDi4b8

In [22]:
import pandas as pd
import numpy as np

# Creating our Data

In [23]:
def get_dataset(size):
    df = pd.DataFrame()
    df['position'] = np.random.choice(['left','middle','right'], size)
    df['age'] = np.random.randint(1, 50, size)
    df['team'] = np.random.choice(['red','blue','yellow','green', size])
    df['win'] = np.random.choice(['yes','no'], size)
    df['prob'] = np.random.uniform(0, 1, size)
    return df

In [24]:
df = get_dataset(1_000_000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   position  1000000 non-null  object 
 1   age       1000000 non-null  int64  
 2   team      1000000 non-null  object 
 3   win       1000000 non-null  object 
 4   prob      1000000 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 38.1+ MB


# Type Casting

In [25]:
df['position'] = df['position'].astype('category')
df['team'] = df['team'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int64   
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float64 
dtypes: category(2), float64(1), int64(1), object(1)
memory usage: 24.8+ MB


## Int Downcasting Value Range

- int8 can store integers from -128 to 127.
- int16 can store integers from -32768 to 32767.
- int64 can store integers from -9223372036854775808 to 9223372036854775807.

In [26]:
df['age']

0         27
1         21
2         47
3         31
4         30
          ..
999995    40
999996    17
999997     4
999998     2
999999    45
Name: age, Length: 1000000, dtype: int64

In [27]:
df['age'].describe()

count    1000000.000000
mean          25.010694
std           14.141164
min            1.000000
25%           13.000000
50%           25.000000
75%           37.000000
max           49.000000
Name: age, dtype: float64

In [28]:
df['age'] = df['age'].astype('int8')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float64 
dtypes: category(2), float64(1), int8(1), object(1)
memory usage: 18.1+ MB


## Downcasting Floats

In [29]:
df['prob'] = df['prob'].astype('float32')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  object  
 4   prob      1000000 non-null  float32 
dtypes: category(2), float32(1), int8(1), object(1)
memory usage: 14.3+ MB


## Casting bool (true/false)

In [30]:
df['win'] = df['win'].map({'yes':True, 'no':False})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   position  1000000 non-null  category
 1   age       1000000 non-null  int8    
 2   team      1000000 non-null  category
 3   win       1000000 non-null  bool    
 4   prob      1000000 non-null  float32 
dtypes: bool(1), category(2), float32(1), int8(1)
memory usage: 7.6 MB


In [31]:
def set_dtypes(df):
    df['position'] = df['position'].astype('category')
    df['team'] = df['team'].astype('category')
    df['age'] = df['age'].astype('int8')
    df['prob'] = df['prob'].astype('float32')
    df['win'] = df['win'].map({'yes':True, 'no':False})
    return df

# Testing

資料量愈大，是否有做 dtype downcasting 的影響就愈大

In [32]:
df = get_dataset(5_000_000)
%timeit df['age_rank'] = df.groupby(['team','position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team','position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team','position','win'])['prob'].rank()

1.7 s ± 53.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.52 s ± 246 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.88 s ± 176 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [33]:
df.head()

Unnamed: 0,position,age,team,win,prob,age_rank,prob_rank,win_prob_rank
0,middle,43,yellow,yes,0.183881,1445485.5,306750.0,153481.0
1,right,32,yellow,yes,0.559759,1070950.0,932018.0,467068.0
2,right,13,yellow,no,0.140224,425089.0,233270.0,116209.0
3,left,39,yellow,no,0.578957,1310174.5,965348.0,483180.0
4,left,32,yellow,no,0.03248,1071946.5,53755.0,26727.0


In [34]:
df = get_dataset(5_000_000)
df = set_dtypes(df)
%timeit df['age_rank'] = df.groupby(['team','position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team','position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team','position','win'])['prob'].rank()

684 ms ± 9.32 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.47 s ± 87.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.77 s ± 155 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
df.head()

Unnamed: 0,position,age,team,win,prob,age_rank,prob_rank,win_prob_rank
0,middle,33,yellow,True,0.203457,1106318.0,338880.0,169956.0
1,middle,46,yellow,False,0.133363,1548311.0,222342.0,110812.0
2,right,39,yellow,True,0.696892,1311768.5,1163697.0,581448.0
3,middle,31,yellow,False,0.541664,1038214.0,904101.0,451296.0
4,left,12,yellow,True,0.629097,390767.0,1046315.0,523474.0


: 