# Storing Financial Data Efficiently

### Storing DataFrame Objects

In [41]:
from pathlib import Path

import pandas as pd
from sample_data import generate_sample_data

In [42]:
# previewing a small sample financial data set (5 rows, 4 columns)
print(generate_sample_data(rows=5, cols=4)) # printing sample financial data set

                            No0         No1         No2         No3
2021-01-01 00:00:00  100.000000  100.000000  100.000000  100.000000
2021-01-01 00:01:00   99.946060   99.923605  100.014555  100.041542
2021-01-01 00:02:00   99.936160   99.811525   99.989511  100.020116
2021-01-01 00:03:00   99.943931   99.743754  100.014933  100.052796
2021-01-01 00:04:00   99.933024   99.679133   99.933961  100.051076


In [43]:
# creating sample data set for storing example
%time
data = generate_sample_data(rows=5e6, cols=10).round(4) # 5.000.000 rows, 10 cols

CPU times: user 5 μs, sys: 1 μs, total: 6 μs
Wall time: 16 μs


In [44]:
# summarizing index range, column dtypes and memory usage of the generated frame
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5000000 entries, 2021-01-01 00:00:00 to 2030-07-05 05:19:00
Freq: min
Data columns (total 10 columns):
 #   Column  Dtype  
---  ------  -----  
 0   No0     float64
 1   No1     float64
 2   No2     float64
 3   No3     float64
 4   No4     float64
 5   No5     float64
 6   No6     float64
 7   No7     float64
 8   No8     float64
 9   No9     float64
dtypes: float64(10)
memory usage: 419.6 MB


In [45]:
# create (or overwrite) the HDF5 store on disk and keep the handle for writing
h5 = pd.HDFStore('../../assets/hdf_store/data.h5', mode='w')

In [46]:
%time h5['data'] = data # writing data frame object to disc

CPU times: user 5.98 ms, sys: 59.2 ms, total: 65.2 ms
Wall time: 168 ms


In [47]:
# displaying the store's repr (class and file path)
h5 # printing out meta information of database file

<class 'pandas.io.pytables.HDFStore'>
File path: ../../assets/hdf_store/data.h5

In [48]:
ls -n ../../assets/hdf_store/data.* # listing the store file to check its size on disk

-rw-r--r--  1 501  20  440007240 20 Sep 10:19 ../../assets/hdf_store/data.h5


In [49]:
# the write handle is released here; the store is reopened read-only afterwards
h5.close() # closing database file

In [50]:
# reopen the file-based HDFStore in read-only mode
h5 = pd.HDFStore('../../assets/hdf_store/data.h5', mode='r')

In [51]:
%time data_copy = h5['data'] # reading from stored file

CPU times: user 155 ms, sys: 320 ms, total: 475 ms
Wall time: 873 ms


In [52]:
# inspecting the frame read back from the store (index range, dtypes, memory usage)
data_copy.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5000000 entries, 2021-01-01 00:00:00 to 2030-07-05 05:19:00
Freq: min
Data columns (total 10 columns):
 #   Column  Dtype  
---  ------  -----  
 0   No0     float64
 1   No1     float64
 2   No2     float64
 3   No3     float64
 4   No4     float64
 5   No5     float64
 6   No6     float64
 7   No7     float64
 8   No8     float64
 9   No9     float64
dtypes: float64(10)
memory usage: 419.6 MB


In [53]:
# closing the read-only store handle before the file is deleted
h5.close()

In [54]:
# deleting file with pathlib instead of the shell `rm` alias, so the cell also
# works on Windows, with automagic disabled, or when exported to a script;
# missing_ok=True keeps a re-run from failing once the file is already gone
Path('../../assets/hdf_store/data.h5').unlink(missing_ok=True)

In [55]:
# alternative way of storing the frame: writing it in 'table' format, which
# allows appending new data to the stored table and querying it on disk
# pandas.DataFrame.to_hdf
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_hdf.html
%time data.to_hdf('../../assets/hdf_store/data_q.h5', key='data', mode='w', format='table')

CPU times: user 6.58 s, sys: 2.84 s, total: 9.42 s
Wall time: 17.8 s


In [56]:
ls -n ../../assets/hdf_store/data_q.* # listing the table-format file to check its size on disk

-rw-r--r--  1 501  20  446953369 20 Sep 10:19 ../../assets/hdf_store/data_q.h5


In [57]:
%time data_copy = pd.read_hdf('../../assets/hdf_store/data_q.h5', 'data')

CPU times: user 121 ms, sys: 299 ms, total: 420 ms
Wall time: 696 ms


In [58]:
# inspecting the frame read back from the table-format file
data_copy.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5000000 entries, 2021-01-01 00:00:00 to 2030-07-05 05:19:00
Freq: min
Data columns (total 10 columns):
 #   Column  Dtype  
---  ------  -----  
 0   No0     float64
 1   No1     float64
 2   No2     float64
 3   No3     float64
 4   No4     float64
 5   No5     float64
 6   No6     float64
 7   No7     float64
 8   No8     float64
 9   No9     float64
dtypes: float64(10)
memory usage: 419.6 MB


In [59]:
# Using the PyTables package directly to work with the stored table
# PyTables documentation: http://www.pytables.org/
import tables as tb

In [60]:
# open the table-format HDF5 file read-only through PyTables
h5 = tb.open_file('../../assets/hdf_store/data_q.h5', mode='r')

In [61]:
# displaying the PyTables file object: its node tree and the table description
h5

File(filename=../../assets/hdf_store/data_q.h5, title=np.str_(''), mode='r', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) np.str_('')
/data (Group) np.str_('')
/data/table (Table(np.int64(5000000),)) np.str_('')
  description := {
  "index": Int64Col(shape=(), dflt=np.int64(0), pos=0),
  "values_block_0": Float64Col(shape=(np.int64(10),), dflt=np.float64(0.0), pos=1)}
  byteorder := 'little'
  chunkshape := (np.int64(2978),)
  autoindex := True
  colindexes := {
    "index": Index(6, mediumshuffle, zlib(1)).is_csi=False}

In [62]:
# each row holds the int64 'index' value and one block of the 10 float64 column values
h5.root.data.table[:3] # printing the first three rows of the table

array([(1609459200000000000, [100.    , 100.    , 100.    , 100.    , 100.    , 100.    , 100.    , 100.    , 100.    , 100.    ]),
       (1609459260000000000, [ 99.9705, 100.0921,  99.935 ,  99.9768, 100.0023,  99.9592,  99.9284, 100.    ,  99.999 ,  99.9573]),
       (1609459320000000000, [ 99.9965, 100.1603, 100.1046,  99.9502,  99.9782,  99.9672, 100.0414, 100.0053,  99.9949, 100.0174])],
      dtype=[('index', '<i8'), ('values_block_0', '<f8', (10,))])

In [63]:
# closing the PyTables file handle before the file is deleted
h5.close()

In [65]:
# deleting file with pathlib instead of the shell `rm` alias, so the cell also
# works on Windows, with automagic disabled, or when exported to a script;
# missing_ok=True keeps a re-run from failing once the file is already gone
Path('../../assets/hdf_store/data_q.h5').unlink(missing_ok=True)