Pandas IO test
Pickle has the best performance

In [2]:
import os
import pandas as pd
import sqlite3
from numpy.random import randn
from pandas.io import sql

sz = 1000000
df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz})

def test_sql_write(df):
    if os.path.exists('test.sql'):
        os.remove('test.sql')
    sql_db = sqlite3.connect('test.sql')
    df.to_sql(name='test_table', con=sql_db)
    sql_db.close()

def test_sql_read():
    sql_db = sqlite3.connect('test.sql')
    pd.read_sql_query("select * from test_table", sql_db)
    sql_db.close()

def test_hdf_fixed_write(df):
    df.to_hdf('test_fixed.hdf','test',mode='w')

def test_hdf_fixed_read():
    pd.read_hdf('test_fixed.hdf','test')

def test_hdf_fixed_write_compress(df):
    df.to_hdf('test_fixed_compress.hdf','test',mode='w',complib='blosc')

def test_hdf_fixed_read_compress():
    pd.read_hdf('test_fixed_compress.hdf','test')

def test_hdf_table_write(df):
    df.to_hdf('test_table.hdf','test',mode='w',format='table')

def test_hdf_table_read():
    pd.read_hdf('test_table.hdf','test')

def test_hdf_table_write_compress(df):
    df.to_hdf('test_table_compress.hdf','test',mode='w',complib='blosc',format='table')

def test_hdf_table_read_compress():
    pd.read_hdf('test_table_compress.hdf','test')

def test_csv_write(df):
    df.to_csv('test.csv',mode='w')

def test_csv_read():
    pd.read_csv('test.csv',index_col=0)

def test_feather_write(df):
    df.to_feather('test.feather')

def test_feather_read():
    pd.read_feather('test.feather')

def test_pickle_write(df):
    df.to_pickle('test.pkl')

def test_pickle_read():
    pd.read_pickle('test.pkl')

def test_pickle_write_compress(df):
    df.to_pickle('test.pkl.compress', compression='xz')

def test_pickle_read_compress():
    pd.read_pickle('test.pkl.compress', compression='xz')

In [5]:
# write test
%timeit test_sql_write(df)
%timeit test_hdf_fixed_write(df)
%timeit test_hdf_fixed_write_compress(df)
%timeit test_hdf_table_write(df)
%timeit test_hdf_table_write_compress(df)
%timeit test_csv_write(df)
%timeit test_feather_write(df)
%timeit test_pickle_write(df)
%timeit test_pickle_write_compress(df)



2.56 s ± 9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
22.1 ms ± 726 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
83 ms ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
493 ms ± 5.42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
551 ms ± 10.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
3.82 s ± 3.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
14.8 ms ± 163 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
31.8 ms ± 640 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5.17 s ± 28.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
%timeit test_sql_read()
%timeit test_hdf_fixed_read()
%timeit test_hdf_fixed_read_compress()
%timeit test_hdf_table_read()
%timeit test_hdf_table_read_compress()
%timeit test_csv_read()
%timeit test_feather_read()
%timeit test_pickle_read()
%timeit test_pickle_read_compress()

# clean up
if os.path.exists('test.csv'):
    os.remove('test.csv')
if os.path.exists('test.feather'):
    os.remove('test.feather')
if os.path.exists('test.pkl'):
    os.remove('test.pkl')
if os.path.exists('test.pkl.compress'):
    os.remove('test.pkl.compress')
if os.path.exists('test.sql'):
    os.remove('test.sql')
if os.path.exists('test_fixed.hdf'):
    os.remove('test_fixed.hdf')
if os.path.exists('test_fixed_compress.hdf'):
    os.remove('test_fixed_compress.hdf')
if os.path.exists('test.sql'):
    os.remove('test_remote_debug.py')
if os.path.exists('test_table.hdf'):
    os.remove('test_table.hdf')
if os.path.exists('test_table_compress.hdf'):
    os.remove('test_table_compress.hdf')

4.91 s ± 114 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
43 ms ± 4.41 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
81.6 ms ± 2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
83.1 ms ± 1.82 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
104 ms ± 2.86 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
1.46 s ± 34.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
22 ms ± 1.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
30.8 ms ± 1.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
2.8 s ± 29.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
