In [8]:
import pandas as pd
import time
import warnings
# Silence every warning category for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation notices from pandas - confirm that is intended.
warnings.filterwarnings('ignore')

In [14]:
filename = 'logs.csv' # path of the CSV file to load (relative to the notebook's working directory)

In [10]:
def read_file(filename, **read_csv_kwargs):
    """Load a CSV file into a DataFrame and print how long the read took.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    **read_csv_kwargs
        Optional keyword arguments forwarded unchanged to ``pd.read_csv``
        (for example ``usecols`` or ``dtype``). Omitting them reproduces a
        plain ``pd.read_csv(filename)`` call.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()
    data = pd.read_csv(filename, **read_csv_kwargs)
    print('File read in {:03.2f} seconds'.format(time.perf_counter() - start_time))
    return data

def show_mem_usage_for_pandas_obj(pandas_obj):
    """Print the ``info()`` summary of a pandas object with deep memory accounting.

    Parameters
    ----------
    pandas_obj : pandas.DataFrame or pandas.Series
        Object to describe.

    Returns
    -------
    None
        ``info()`` writes its report to standard output and returns nothing;
        that return value is passed through unchanged.
    """
    # Describe the argument itself rather than whatever global named `data`
    # happens to exist in the kernel.
    return pandas_obj.info(memory_usage = 'deep')

def show_mem_usage_for_dtype(pandas_obj):
    """Print one memory line per dtype family ('float', 'int', 'object').

    For each family the matching columns are selected and the *mean* of
    ``memory_usage(deep=True)`` is printed in megabytes. That mean is taken
    over the selected columns plus the Index entry that ``memory_usage``
    reports, so the figure is an average, not a total.

    Anything that is not a DataFrame is ignored and nothing is printed.
    Always returns None.
    """
    if not isinstance(pandas_obj, pd.DataFrame):
        return

    bytes_per_mb = 1024 ** 2
    for family in ('float', 'int', 'object'):
        per_entry_bytes = pandas_obj.select_dtypes(include=[family]).memory_usage(deep=True)
        average_mb = per_entry_bytes.mean() / bytes_per_mb
        print('Memory usage for {} columns: {:03.2f} MB'.format(family, average_mb))

def get_mem_usage_for_pandas_obj(pandas_obj):
    """Return the deep memory footprint of a pandas object as text, e.g. '1.00 MB'.

    ``memory_usage(deep=True)`` yields one value per column (plus the Index)
    for a DataFrame, which is summed here; for any other object its result is
    used as the byte count directly. Bytes are converted to megabytes by
    dividing by 1024 ** 2.
    """
    total_bytes = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # DataFrame: collapse the per-column breakdown into a single number.
        total_bytes = total_bytes.sum()
    return '{:03.2f} MB'.format(total_bytes / 1024 ** 2)

def compare_dtypes_optimization(dtype_1, dtype_2):
    """Build a side-by-side table of column dtypes before and after optimization.

    Parameters
    ----------
    dtype_1 : pandas.DataFrame
        Original data; only its ``.dtypes`` attribute is read.
    dtype_2 : pandas.DataFrame
        Optimized data; only its ``.dtypes`` attribute is read.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with a 'before' column holding the dtypes of
        ``dtype_1`` and an 'after' column holding the dtypes of ``dtype_2``.
    """
    # The previous version also computed per-dtype value counts here but threw
    # the result away; that dead work is removed and the return is unchanged.
    comparison = pd.concat([dtype_1.dtypes, dtype_2.dtypes], axis = 1)
    comparison.columns = ['before', 'after']
    return comparison

def optimize_numeric_int_columns(pandas_obj, compare = False, inplace = False):
    """Downcast the integer columns of a DataFrame and report the memory saved.

    Prints a 'Before / After' line with the footprint of the integer columns
    ahead of and following the downcast. The input object is never modified.

    Parameters
    ----------
    pandas_obj : pandas.DataFrame
        Source data; columns are picked with ``select_dtypes(include=['int'])``.
    compare : bool, default False
        When true, also print the before/after dtype table produced by
        ``compare_dtypes_optimization``.
    inplace : bool, default False
        Despite the name, nothing is changed in place. When true the
        downcast copy of the integer columns is returned; otherwise the
        function returns None.

    Returns
    -------
    pandas.DataFrame or None
    """
    def _downcast(column):
        # An unsigned target cannot hold negatives, and pd.to_numeric then
        # leaves the column at its original width. Fall back to the smallest
        # signed type so such columns still shrink.
        target = 'unsigned' if (column >= 0).all() else 'integer'
        return pd.to_numeric(column, downcast = target)

    data_int = pandas_obj.select_dtypes(include = ['int'])
    optimized_int = data_int.apply(_downcast)
    print('Before: ', get_mem_usage_for_pandas_obj(data_int), 'After: ', get_mem_usage_for_pandas_obj(optimized_int))
    if compare:
        # The table used to be built and discarded; show it to the caller.
        print(compare_dtypes_optimization(data_int, optimized_int))
    if inplace:
        return optimized_int
    return None
    
def optimize_numeric_float_columns(pandas_obj, compare = False, inplace = False):
    """Downcast the float columns of a DataFrame and report the memory saved.

    Prints a 'Before / After' line with the footprint of the float columns
    ahead of and following ``pd.to_numeric(..., downcast='float')``. The input
    object is never modified.

    Parameters
    ----------
    pandas_obj : pandas.DataFrame
        Source data; columns are picked with ``select_dtypes(include=['float'])``.
    compare : bool, default False
        When true, also print the before/after dtype table produced by
        ``compare_dtypes_optimization``.
    inplace : bool, default False
        Despite the name, nothing is changed in place. When true the
        downcast copy of the float columns is returned; otherwise the
        function returns None.

    Returns
    -------
    pandas.DataFrame or None
    """
    data_float = pandas_obj.select_dtypes(include = ['float'])
    # NOTE(review): a narrower float type stores fewer significant digits -
    # confirm that the reduced precision is acceptable for this data.
    optimized_float = data_float.apply(pd.to_numeric, downcast = 'float')
    print('Before: ', get_mem_usage_for_pandas_obj(data_float), 'After: ', get_mem_usage_for_pandas_obj(optimized_float))
    if compare:
        # The table used to be built and discarded; show it to the caller.
        print(compare_dtypes_optimization(data_float, optimized_float))
    if inplace:
        return optimized_float
    return None

def categorize_object_columns(pandas_obj, compare = False, inplace = False, max_unique_ratio = 0.5):
    """Convert low-cardinality object columns to 'category' and report the memory saved.

    A column is converted when the share of distinct values (missing values
    count as one distinct value) among all rows is below ``max_unique_ratio``;
    other object columns are carried over unchanged. Prints a
    'Before / After' line with the footprint of the object columns. The input
    object is never modified.

    Parameters
    ----------
    pandas_obj : pandas.DataFrame
        Source data; columns are picked with ``select_dtypes(include=['object'])``.
    compare : bool, default False
        When true, also print the before/after dtype table produced by
        ``compare_dtypes_optimization``.
    inplace : bool, default False
        Despite the name, nothing is changed in place. When true the
        converted copy of the object columns is returned; otherwise the
        function returns None.
    max_unique_ratio : float, default 0.5
        Exclusive upper bound on ``unique / total`` for a column to be
        converted to 'category'.

    Returns
    -------
    pandas.DataFrame or None
    """
    data_obj = pandas_obj.select_dtypes(include = ['object'])
    # Start from a copy so columns that are not converted keep their data and
    # the result shares the original index.
    optimized_obj = data_obj.copy()
    total_values = len(data_obj)
    for column in data_obj.columns:
        if total_values == 0:
            # No rows: the ratio is undefined, so leave the column as it is.
            continue
        unique_values = data_obj[column].nunique(dropna = False)
        if unique_values / total_values < max_unique_ratio: # estimate necessity of classification
            optimized_obj[column] = data_obj[column].astype('category')
    print('Before: ', get_mem_usage_for_pandas_obj(data_obj), 'After: ', get_mem_usage_for_pandas_obj(optimized_obj))
    if compare:
        # The table used to be built and discarded; show it to the caller.
        print(compare_dtypes_optimization(data_obj, optimized_obj))
    if inplace:
        return optimized_obj
    return None

In [16]:
# Load the CSV into memory; read_file prints how long the read took.
data = read_file(filename)
# Last expression of the cell: the frame's total deep memory footprint as an 'N MB' string.
get_mem_usage_for_pandas_obj(data)

File read in 3.09 seconds


'861.57 MB'

In [11]:
# Report only: with the default arguments the call prints the before/after sizes,
# returns None, and leaves `data` untouched.
optimize_numeric_int_columns(data)

Before:  7.87 MB After:  1.48 MB


In [12]:
# Report only: with the default arguments the call prints the before/after sizes,
# returns None, and leaves `data` untouched.
optimize_numeric_float_columns(data)

Before:  100.99 MB After:  50.49 MB


In [13]:
# Report only: with the default arguments the call prints the before/after sizes,
# returns None, and leaves `data` untouched.
categorize_object_columns(data)

Before:  752.72 MB After:  51.67 MB
