# Store data efficiently
Decrease Pandas DataFrames size.
---

In [4]:
import pandas as pd
import random

### 1. Cast Categorical Variables

Size goes down from 60kb to 1.6kb

In [17]:
colors = ['WHITE', 'BLACK','RED','YELLOW','BLUE','GREEN']
colors_column = [random.choice(colors) for x in range(1000)]
df = pd.DataFrame({'color':colors_column})
print(df.info(memory_usage="deep"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   color   1000 non-null   object
dtypes: object(1)
memory usage: 60.4 KB
None


In [19]:
df['color'] =  df['color'].astype('category')
print(df.info(memory_usage="deep"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   color   1000 non-null   category
dtypes: category(1)
memory usage: 1.6 KB
None


### 2. Cast Boolean Variables

Size goes down from 60kb (strings) and 7.9kb (integers 1 and 0) to 1.1kb

In [45]:
toss = ['Heads', 'Tails']
toss_col = [random.choice(toss) for x in range(1000)]
df = pd.DataFrame({'toss':toss_col})
print(df.info(memory_usage="deep"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   toss    1000 non-null   object
dtypes: object(1)
memory usage: 60.7 KB
None


In [46]:
df['toss'] =  df['toss'].map({'Heads': True, 'Tails': False})
print(df.info(memory_usage="deep"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   toss    1000 non-null   bool 
dtypes: bool(1)
memory usage: 1.1 KB
None


In [48]:
toss = [1, 0]
toss_col = [random.choice(toss) for x in range(1000)]
df = pd.DataFrame({'toss':toss_col})
print(df.info(memory_usage="deep"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   toss    1000 non-null   int64
dtypes: int64(1)
memory usage: 7.9 KB
None


In [49]:
df['toss'] =  df['toss'].astype('bool')
print(df.info(memory_usage="deep"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   toss    1000 non-null   bool 
dtypes: bool(1)
memory usage: 1.1 KB
None


### 3. Downcast Integers

By default, Pandas creates int64 integer objects, consider downcasting it to int8 when not dealing with huge numbers.

Size goes down from 7.9kb to 1.1kb

In [100]:
years = [2, 3, 4, 5, 6, 7, 8, 9, 1, 2]
year_col = [random.choice(years) for x in range(1000)]
df = pd.DataFrame({'years':year_col})
print(df.info(memory_usage="deep"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   years   1000 non-null   int64
dtypes: int64(1)
memory usage: 7.9 KB
None


In [101]:
df['years'] = df['years'].astype('int8')
print(df.info(memory_usage="deep"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   years   1000 non-null   int8 
dtypes: int8(1)
memory usage: 1.1 KB
None


### 4. Putting it all together
Would the size decrease improve the cpu processing time? No.

In [102]:
def super_conditional(row):
    if row[0] in ['RED', 'YELLOW'] and row[1] == False and row[2] == True and row[3] <= 2005:
        return 1
    else:
        return 0

In [103]:
colors = ['WHITE', 'BLACK','RED','YELLOW','BLUE','GREEN']
colors_col = [random.choice(colors) for x in range(1000000)]

toss = ['Heads', 'Tails']
toss_col = [random.choice(toss) for x in range(1000000)]

toss_int = [1, 0]
toss_col_int = [random.choice(toss_int) for x in range(1000000)]

years = [2, 3, 4, 5, 6, 7, 8, 9, 1, 2]
year_col = [random.choice(years) for x in range(1000000)]

df = pd.DataFrame({'colors': colors_col, 'toss': toss_col, 'toss_int': toss_col_int, 'years': year_col})
print(df.info(memory_usage="deep"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 4 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   colors    1000000 non-null  object
 1   toss      1000000 non-null  object
 2   toss_int  1000000 non-null  int64 
 3   years     1000000 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 133.2 MB
None


In [104]:
df.head()

Unnamed: 0,colors,toss,toss_int,years
0,YELLOW,Tails,1,2
1,GREEN,Heads,0,8
2,BLACK,Tails,0,1
3,GREEN,Tails,1,6
4,BLUE,Tails,1,6


In [105]:
%%time
df.apply(super_conditional, axis=1)

CPU times: user 2.62 s, sys: 31.2 ms, total: 2.65 s
Wall time: 2.65 s


0         0
1         0
2         0
3         0
4         0
         ..
999995    0
999996    0
999997    0
999998    0
999999    0
Length: 1000000, dtype: int64

In [106]:
df['colors'] = df['colors'].astype('category')
df['toss'] = df['toss'].map({'Heads': True, 'Tails': False})
df['toss_int'] = df['toss_int'].astype('bool')
df['years'] = df['years'].astype('int8')
print(df.info(memory_usage="deep"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 4 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   colors    1000000 non-null  category
 1   toss      1000000 non-null  bool    
 2   toss_int  1000000 non-null  bool    
 3   years     1000000 non-null  int8    
dtypes: bool(2), category(1), int8(1)
memory usage: 3.8 MB
None


In [107]:
df.head()

Unnamed: 0,colors,toss,toss_int,years
0,YELLOW,False,True,2
1,GREEN,True,False,8
2,BLACK,False,False,1
3,GREEN,False,True,6
4,BLUE,False,True,6


In [108]:
%%time
df.apply(super_conditional, axis=1)

CPU times: user 2.81 s, sys: 31 ms, total: 2.84 s
Wall time: 2.84 s


0         1
1         0
2         0
3         0
4         0
         ..
999995    1
999996    0
999997    0
999998    0
999999    0
Length: 1000000, dtype: int64