In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np

In [2]:
# Read the pickled weekly frame from disk, then wrap the same data in a dask
# DataFrame split into 8 partitions.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
raw_weekly_path = "./raw_weekly_df.pkl"
pandas_df = pd.read_pickle(raw_weekly_path)
dask_df = dd.from_pandas(pandas_df, npartitions=8)

In [3]:
# memory_usage is case-sensitive: only the lowercase string "deep" turns on deep
# introspection. Any other truthy value (such as "Deep") silently falls back to
# the shallow estimate, which leaves out the strings stored in the categories.
pandas_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6841121 entries, 0 to 6841120
Data columns (total 7 columns):
 #   Column    Dtype         
---  ------    -----         
 0   dept_id   category      
 1   cat_id    category      
 2   item_id   category      
 3   state_id  category      
 4   store_id  category      
 5   datetime  datetime64[ns]
 6   sales     float32       
dtypes: category(5), datetime64[ns](1), float32(1)
memory usage: 117.5 MB


In [4]:
'''
The only column where we could potentially save memory is the float32 `sales` column, by converting it to a
smaller float type. Every other column is already a category or a datetime.
'''

In [4]:
# Downcast in order to save memory
def downcast(df):
    """Shrink the dtypes of ``df``'s columns to reduce memory use.

    Each column is handled according to its current dtype:

    * integer columns -> ``pd.to_numeric(..., downcast='integer')``, i.e. the
      smallest signed integer dtype that can hold the values
    * float columns   -> ``pd.to_numeric(..., downcast='float')``; pandas never
      goes below float32, so float32 columns come back unchanged
    * object columns  -> parsed as ``%Y-%m-%d`` datetimes when the column is
      named ``'date'``, otherwise converted to ``category``
    * everything else (category, datetime64, bool, ...) is left untouched

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, returned for chaining.
    """
    # Use the pandas dtype predicates instead of substring matching on
    # str(dtype) and comparing against ``np.object``: that alias was removed in
    # NumPy 1.24, so reaching the comparison (which happens for every column
    # that is neither int nor float) raised AttributeError.
    for col, dtype in df.dtypes.items():
        if pd.api.types.is_integer_dtype(dtype):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif pd.api.types.is_float_dtype(dtype):
            df[col] = pd.to_numeric(df[col], downcast='float')
        elif pd.api.types.is_object_dtype(dtype):
            if col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')
    return df


In [5]:
'''
The above downcast won't actually help, since the only place where our data can save any memory is float, and 
the above method just downcasts it to a float32, which is already the case. But this could be useful for future 
datasets. 
'''

"\nThe above downcast won't actually help, since the only place where our data can save any memory is float, and \nthe above method just downcasts it to a float32, which is already the case. But this could be useful for future \ndatasets. \n"

In [6]:
# downcast() reassigns pandas_df's columns in place and returns the same frame.
# memory_usage is case-sensitive: only lowercase "deep" enables deep
# introspection; "Deep" silently falls back to the shallow estimate.
downcast(pandas_df).info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6841121 entries, 0 to 6841120
Data columns (total 7 columns):
 #   Column    Dtype         
---  ------    -----         
 0   dept_id   category      
 1   cat_id    category      
 2   item_id   category      
 3   state_id  category      
 4   store_id  category      
 5   datetime  datetime64[ns]
 6   sales     float32       
dtypes: category(5), datetime64[ns](1), float32(1)
memory usage: 117.5 MB
