### Test fast ways to save/load pandas DataFrame 
#### Result: pickle and feather write ~40 times faster than CSV (and read roughly 10-15 times faster)
I prefer pickle, because it allows me to save a "bag" object (basically a dictionary) containing several DataFrames and parameters, which can be arranged as trees.
<br>--------------------
<br>Follow the link to see an excellent analysis by Ilia Zaitsev, in which he compares more formats (CSV, Pickle, MessagePack, HDF5, Feather, Parquet):
 - <a href="https://towardsdatascience.com/the-best-format-to-save-pandas-data-414dca023e0d" target="_blank">https://towardsdatascience.com/the-best-format-to-save-pandas-data-414dca023e0d</a>

In [36]:
import json
import os
import pickle
import sys
import time

# conda install feather-format -c conda-forge
import numpy as np
import pandas as pd
# Wall-clock timestamp taken at notebook start; the last cell subtracts it
# from the end time to report the total notebook run time.
nb_time_start = time.time()

In [37]:
# procedure to create a Pandas DataFrame
def ddd(nrows=10):
    """
    Return a simple pandas DataFrame - useful for quick tests.

    The frame is a fixed 10-row pattern repeated as many times as needed, so
    the number of rows is always a multiple of 10: ``nrows`` is rounded down
    to the nearest multiple of 10, with a minimum of 10 rows.

    Columns (13, in this order):
        ii, i1, i2 : integer-like values ('ii' contains one NaN per pattern)
        ff, f1, f2 : floats ('ff' contains one NaN per pattern)
        ss, s1, s2 : strings ('ss' has non-ASCII text and a NaN;
                     's1' holds the literal string 'nan' instead of a NaN)
        bb, b1     : booleans ('bb' contains NaNs)
        xx         : 0..9 repeated
        yy         : xx*50 + 60 plus Gaussian noise; the 10 noisy values are
                     drawn once per call and then repeated

    Parameters:
        nrows : requested number of rows (default 10).

    Returns:
        pandas.DataFrame with a default 0..N-1 index.

    Examples:
        df = ddd()
        df = ddd(100)
        df = ddd(10**6)   # million rows
    """
    n_aa = 10  # length of the base pattern that gets repeated
    # Floor division avoids the float round-trip of int(nrows / n_aa).
    nn = max(int(nrows // n_aa), 1)

    # Builtin `str` replaces the alias `np.str` (removed in NumPy 1.24).
    # Casting to a string dtype turns np.nan into the text 'nan' on purpose.
    s1_pattern = list(np.array(
        ['s0', 's1', 's2', 's2', np.nan, 's5', 's6', 's7', 's8', 's9'],
        dtype=str))

    column_order = ['ii', 'i1', 'i2', 'ff', 'f1', 'f2',
                    'ss', 's1', 's2', 'bb', 'b1', 'xx', 'yy']
    data = {
        'ii': nn * [0, 1, 2, 3, 4, 5, np.nan, 7, 8, 9],
        'i1': nn * [6, 5, 4, 3, 2, 1, 0, -1, -2, -3],
        'i2': nn * [6, 5, 4, 4, 1, 1, 0, -1, -2, -3],
        # np.nan replaces the alias np.NaN (removed in NumPy 2.0).
        'ff': nn * [0.0, 1.0, 2.0, np.nan, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0],
        'f1': nn * [0.0, 1.01, 2.002, 3.0003, 4.00004, 5.000005, 6.0000006, 7.0, 8.0, 9.0],
        'f2': nn * [1.11, 2.22, 3.33, 4.44, 5.55, 7.77, 9.99, 0.01, -0.01, -1.11],
        'ss': nn * ['s0', 's1', '狗', '汽车', np.nan, 's5', 's6', 's7', 's8', 's9'],
        's1': nn * s1_pattern,
        's2': nn * ['1.11', '2.22', '3.33', '4.44', '5.55', '7.77', '9.99', '0.01', '-0.01', '-1.11'],
        'bb': nn * [True, False, True, False, np.nan, False, True, np.nan, False, True],
        'b1': nn * [True, False, True, False, True, False, True, True, False, True],
        'xx': nn * list(range(n_aa)),
        'yy': nn * [x * 50 + 60 + np.random.randn() for x in range(n_aa)],
    }

    # Explicit column order; the default index is already 0..len-1.
    return pd.DataFrame(data, columns=column_order)

In [38]:
# Build the benchmark DataFrame: one million rows.
df = ddd(nrows=1_000_000)

In [39]:
%%time
# CSV round-trip benchmark: write df to disk, then read it back.
# Each %timeit line runs its statement exactly once (-n 1 -r 1) because CSV is slow.
# NOTE(review): to_csv is called without index=False, so the index is presumably
# written as an extra column and read back by read_csv - confirm whether that matters
# for a like-for-like comparison with the other formats.
fname_csv = "/tmp/ff.csv"
%timeit -n 1 -r 1 df.to_csv(fname_csv)            # 10 sec
%timeit -n 1 -r 1 bb = pd.read_csv(fname_csv)       # 1.2 sec

9.66 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
1.25 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
CPU times: user 11.2 s, sys: 280 ms, total: 11.4 s
Wall time: 10.9 s


In [40]:
%%time
# Feather round-trip benchmark: write df to disk, then read it back.
# Each statement is timed over 5 runs of a single loop (-n 1 -r 5).
fname_fe = "/tmp/ff.feather"
%timeit -n 1 -r 5 df.to_feather(fname_fe)    #  0.3 sec
%timeit -n 1 -r 5 bb=pd.read_feather(fname_fe) # 0.1 sec

237 ms ± 24.8 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
79.4 ms ± 1.08 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
CPU times: user 1.73 s, sys: 375 ms, total: 2.1 s
Wall time: 1.63 s


In [41]:
%%time
fname_pk = "/tmp/ff.pk"
%timeit -n 1 -r 5 pickle.dump(df, open(fname_pk, "wb")) # 0.21 sec
%timeit -n 1 -r 5 bb = pickle.load(open(fname_pk, "rb")) # 0.16 sec

235 ms ± 33 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
129 ms ± 10.7 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
CPU times: user 1.02 s, sys: 415 ms, total: 1.44 s
Wall time: 1.88 s


In [42]:
# Demo that pickle saves deeply nested "trees":
# a 4-level dictionary is written to disk, read back, and pretty-printed as JSON.

mydict = {
    "level1_k1" : "v1",
    "level1_k2" : {
        "level2_k1" : "vv1",
        "level2_k2" : {
            "level3_k1" : "vvv1",
            "level3_k2" : {
                "level4_k1" : "vvvv1",
                "level4_k2" : [1,2,3,4,5]
            }
        }
    }
}

print("writing dict to pickle file")
fname_pk = "/tmp/ff.pk"
# `with` guarantees the file is flushed and closed before it is read back.
with open(fname_pk, "wb") as fh:
    pickle.dump(mydict, fh)

print("reading back from file")
# Safe here because the file was written by this notebook a moment ago;
# never unpickle files from untrusted sources.
with open(fname_pk, "rb") as fh:
    dict_from_file = pickle.load(fh)

print("showing what we have read from file:")
json_str = json.dumps(dict_from_file,indent=4)
print(json_str)

writing dict to pickle file
reading back from file
showing what we have read from file:
{
    "level1_k1": "v1",
    "level1_k2": {
        "level2_k1": "vv1",
        "level2_k2": {
            "level3_k1": "vvv1",
            "level3_k2": {
                "level4_k1": "vvvv1",
                "level4_k2": [
                    1,
                    2,
                    3,
                    4,
                    5
                ]
            }
        }
    }
}


In [43]:
# Stop the notebook stopwatch and report the elapsed wall-clock time.
nb_time_end = time.time()
total_seconds = nb_time_end - nb_time_start
elapsed_msg = f"Total notebook run time = {total_seconds:.2f} sec"
print(elapsed_msg)

Total notebook run time = 15.83 sec
