# Storing Financial Data Efficiently

### Storing DataFrame Objects

In [7]:
from pathlib import Path

import pandas as pd
from sample_data import generate_sample_data

In [3]:
# printing a small sample financial data set (5 rows, 4 columns)
sample_preview = generate_sample_data(rows=5, cols=4)
print(sample_preview)

                            No0         No1         No2         No3
2021-01-01 00:00:00  100.000000  100.000000  100.000000  100.000000
2021-01-01 00:01:00   99.959239  100.115270   99.959731  100.063988
2021-01-01 00:02:00   99.984024  100.076891   99.908000  100.143433
2021-01-01 00:03:00   99.974529   99.996956   99.866239  100.133294
2021-01-01 00:04:00   99.937704  100.062026   99.962441  100.209680


In [4]:
# creating sample data set for storing example
%time
data = generate_sample_data(rows=5e6, cols=10).round(4) # 5.000.000 rows, 10 cols

CPU times: user 2 μs, sys: 1e+03 ns, total: 3 μs
Wall time: 14.1 μs


In [5]:
# summary of the generated frame: index range, column dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5000000 entries, 2021-01-01 00:00:00 to 2030-07-05 05:19:00
Freq: min
Data columns (total 10 columns):
 #   Column  Dtype  
---  ------  -----  
 0   No0     float64
 1   No1     float64
 2   No2     float64
 3   No3     float64
 4   No4     float64
 5   No5     float64
 6   No6     float64
 7   No7     float64
 8   No8     float64
 9   No9     float64
dtypes: float64(10)
memory usage: 419.6 MB


In [8]:
# open an HDFStore object on disk in write mode
h5 = pd.HDFStore(path='../../assets/hdf_store/data.h5', mode='w')

In [9]:
%time h5['data'] = data # writing data frame object to disc

CPU times: user 11.9 ms, sys: 170 ms, total: 182 ms
Wall time: 404 ms


In [10]:
h5  # bare expression: displays the store's repr (class and file path)

<class 'pandas.io.pytables.HDFStore'>
File path: ../../assets/hdf_store/data.h5

In [11]:
# long listing (numeric owner/group IDs) of the stored file to check its size on disk
ls -n ../../assets/hdf_store/data.*

-rw-r--r--  1 501  20  440007240 20 Sep 09:43 ../../assets/hdf_store/data.h5


In [20]:
h5.close()  # closing the database file that was opened for writing

In [21]:
# reopen the file-based HDFStore object, this time in read-only mode
h5 = pd.HDFStore(path='../../assets/hdf_store/data.h5', mode='r')

In [22]:
%time data_copy = h5['data'] # reading from stored file

CPU times: user 168 ms, sys: 332 ms, total: 500 ms
Wall time: 1.27 s


In [23]:
# summary of the frame read back from the store, for comparison with the original
data_copy.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5000000 entries, 2021-01-01 00:00:00 to 2030-07-05 05:19:00
Freq: min
Data columns (total 10 columns):
 #   Column  Dtype  
---  ------  -----  
 0   No0     float64
 1   No1     float64
 2   No2     float64
 3   No3     float64
 4   No4     float64
 5   No5     float64
 6   No6     float64
 7   No7     float64
 8   No8     float64
 9   No9     float64
dtypes: float64(10)
memory usage: 419.6 MB


In [24]:
h5.close()  # closing the store that was opened for reading

In [26]:
# deleting the HDF5 file; pure Python so the cell does not depend on IPython's
# automagic `rm` alias or a POSIX shell, and missing_ok=True keeps the cell
# from failing when it is re-run after the file has already been removed
Path('../../assets/hdf_store/data.h5').unlink(missing_ok=True)