# Storing Financial Data Efficiently

### Storing DataFrame Objects

In [7]:
import pandas as pd
from sample_data import generate_sample_data

In [3]:
# Preview a small sample of the generated financial data set (5 rows x 4 columns).
sample_preview = generate_sample_data(rows=5, cols=4)
print(sample_preview)

                            No0         No1         No2         No3
2021-01-01 00:00:00  100.000000  100.000000  100.000000  100.000000
2021-01-01 00:01:00   99.959239  100.115270   99.959731  100.063988
2021-01-01 00:02:00   99.984024  100.076891   99.908000  100.143433
2021-01-01 00:03:00   99.974529   99.996956   99.866239  100.133294
2021-01-01 00:04:00   99.937704  100.062026   99.962441  100.209680


In [4]:
# creating sample data set for storing example
%time
data = generate_sample_data(rows=5e6, cols=10).round(4) # 5.000.000 rows, 10 cols

CPU times: user 2 μs, sys: 1e+03 ns, total: 3 μs
Wall time: 14.1 μs


In [5]:
# Print a summary of the generated frame: index range, column dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5000000 entries, 2021-01-01 00:00:00 to 2030-07-05 05:19:00
Freq: min
Data columns (total 10 columns):
 #   Column  Dtype  
---  ------  -----  
 0   No0     float64
 1   No1     float64
 2   No2     float64
 3   No3     float64
 4   No4     float64
 5   No5     float64
 6   No6     float64
 7   No7     float64
 8   No8     float64
 9   No9     float64
dtypes: float64(10)
memory usage: 419.6 MB


In [8]:
# Open an HDFStore backed by a file on disk, in write mode.
h5 = pd.HDFStore('../../assets/hdf_store/data.h5', mode='w')

In [9]:
%time h5['data'] = data # writing data frame object to disc

CPU times: user 11.9 ms, sys: 170 ms, total: 182 ms
Wall time: 404 ms


In [10]:
# Display the store's meta information (class and file path) as the cell output.
h5

<class 'pandas.io.pytables.HDFStore'>
File path: ../../assets/hdf_store/data.h5

In [11]:
ls -n ../../assets/hdf_store/data.*

-rw-r--r--  1 501  20  440007240 20 Sep 09:43 ../../assets/hdf_store/data.h5


In [20]:
# Close the database file so it can be reopened in read mode.
h5.close()

In [21]:
# Reopen the file-based HDFStore, this time read-only.
h5 = pd.HDFStore('../../assets/hdf_store/data.h5', mode='r')

In [22]:
%time data_copy = h5['data'] # reading from stored file

CPU times: user 168 ms, sys: 332 ms, total: 500 ms
Wall time: 1.27 s


In [23]:
# Summarize the frame read back from disk; compare with the summary of `data`.
data_copy.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5000000 entries, 2021-01-01 00:00:00 to 2030-07-05 05:19:00
Freq: min
Data columns (total 10 columns):
 #   Column  Dtype  
---  ------  -----  
 0   No0     float64
 1   No1     float64
 2   No2     float64
 3   No3     float64
 4   No4     float64
 5   No5     float64
 6   No6     float64
 7   No7     float64
 8   No8     float64
 9   No9     float64
dtypes: float64(10)
memory usage: 419.6 MB


In [24]:
# Close the read-only store before the file is deleted.
h5.close()

In [26]:
rm ../../assets/hdf_store/data.h5 # deleting file

In [32]:
# alternative way of storing a HDFStore that allows to append new data to the table
# and for searching over the table
# pandas.DataFrame.to_hdf
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_hdf.html
%time data.to_hdf('../../assets/hdf_store/data_q.h5', key='data', mode='w', format='table')

CPU times: user 7.04 s, sys: 3.41 s, total: 10.4 s
Wall time: 19.7 s


In [33]:
ls -n ../../assets/hdf_store/data_q.*

-rw-r--r--  1 501  20  446953369 20 Sep 10:08 ../../assets/hdf_store/data_q.h5


In [30]:
%time data_copy = pd.read_hdf('../../assets/hdf_store/data_q.h5', 'data')

CPU times: user 130 ms, sys: 293 ms, total: 423 ms
Wall time: 836 ms


In [31]:
# Summarize the frame read from the table-format file.
data_copy.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5000000 entries, 2021-01-01 00:00:00 to 2030-07-05 05:19:00
Freq: min
Data columns (total 10 columns):
 #   Column  Dtype  
---  ------  -----  
 0   No0     float64
 1   No1     float64
 2   No2     float64
 3   No3     float64
 4   No4     float64
 5   No5     float64
 6   No6     float64
 7   No7     float64
 8   No8     float64
 9   No9     float64
dtypes: float64(10)
memory usage: 419.6 MB


In [34]:
# Using the PyTables package directly to work with the table stored in the HDF5 file
# PyTables’ documentation @ http://www.pytables.org/
import tables as tb

In [35]:
# Open the table-format file read-only through PyTables.
# Note that `h5` is now a PyTables file handle, not a pandas HDFStore.
h5 = tb.open_file('../../assets/hdf_store/data_q.h5', mode='r')

In [36]:
# Display the file's structure: groups, the table's column description, chunk shape and column indexes.
h5

File(filename=../../assets/hdf_store/data_q.h5, title=np.str_(''), mode='r', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) np.str_('')
/data (Group) np.str_('')
/data/table (Table(np.int64(5000000),)) np.str_('')
  description := {
  "index": Int64Col(shape=(), dflt=np.int64(0), pos=0),
  "values_block_0": Float64Col(shape=(np.int64(10),), dflt=np.float64(0.0), pos=1)}
  byteorder := 'little'
  chunkshape := (np.int64(2978),)
  autoindex := True
  colindexes := {
    "index": Index(6, mediumshuffle, zlib(1)).is_csi=False}

In [38]:
# Show the first three rows of the underlying table as a NumPy structured array.
# Each row holds an int64 'index' value and a 'values_block_0' field with the ten floats;
# the index values look like nanosecond epoch timestamps (TODO confirm).
h5.root.data.table[:3]

array([(1609459200000000000, [100.    , 100.    , 100.    , 100.    , 100.    , 100.    , 100.    , 100.    , 100.    , 100.    ]),
       (1609459260000000000, [ 99.866 ,  99.9108,  99.9865, 100.0023,  99.9719,  99.9268, 100.0617, 100.0508,  99.9689,  99.9298]),
       (1609459320000000000, [ 99.8433,  99.8654, 100.0505,  99.9748,  99.9662,  99.8505, 100.0031,  99.9697,  99.9422,  99.8595])],
      dtype=[('index', '<i8'), ('values_block_0', '<f8', (10,))])

In [39]:
# Close the PyTables file handle before the file is deleted.
h5.close()

In [40]:
rm ../../assets/hdf_store/data_q.h5 # deleting file