In [1]:
import pandas as pd
from pymongo import MongoClient
import numpy as np
import pymongoarrow
from pymongoarrow.monkey import patch_all
import time

In [2]:
import sys

In [3]:
patch_all()

In [4]:
# Initialize mongo connector object with ip adress
client = MongoClient('localhost')
# Get reference to existing database testDB
db = client.testDB
# Authentication within database
# db.authenticate('', 'zbtMongo!', source='admin')
collection = db.test_time_series

In [5]:
rows = int(1e5)
cols = 100
# Create test data
array = np.random.rand(rows, cols)

In [6]:
df = pd.DataFrame(array, index=pd.date_range('2000-01-01', freq='s', periods=rows), columns=['col_' + str(i) for i in range(cols)])
df['time'] = df.index

In [8]:
time_start = time.time()
pymongoarrow.api.write(collection, df)
time_end = time.time()
print(time_end - time_start)

7.790128707885742


In [9]:
time_start = time.time()
df_retrieved = collection.find_pandas_all({})
time_end = time.time()
print(time_end - time_start)

3.7498583793640137


In [10]:
# Retrieve data again
time_start = time.time()
df_retrieved_subset = collection.find_pandas_all({'col_0': {'$gt': 0.5}})
time_end = time.time()
print(time_end - time_start)

2.401934862136841


In [11]:
df_retrieved.size

20200000

In [12]:
df.size

10100000

In [13]:
time_start = time.time()
df_retrieved_normal_way = list(collection.find({}))
time_end = time.time()
print(time_end - time_start)

3.1924328804016113


In [15]:
hdf_path = 'hdf.h5'

In [29]:
time_start = time.time()
df.to_hdf(hdf_path, key='a', complevel=0, mode='w')
time_end = time.time()
print(time_end - time_start)

0.13539433479309082


In [19]:
time_start = time.time()
df_from_hdf5 = pd.read_hdf(hdf_path, key='a', mode='r+')
time_end = time.time()
print(time_end - time_start)

0.20710015296936035


In [147]:
df_from_hdf5.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_91,col_92,col_93,col_94,col_95,col_96,col_97,col_98,col_99,time
2000-01-01 00:00:00,0.880185,0.341446,0.530834,0.927703,0.913664,0.435048,0.910705,0.326699,0.071028,0.102509,...,0.473896,0.730774,0.61705,0.992128,0.163981,0.472986,0.225467,0.636388,0.034135,2000-01-01 00:00:00
2000-01-01 00:00:01,0.605338,0.308805,0.527641,0.910448,0.622999,0.061641,0.701104,0.741101,0.654573,0.423729,...,0.664549,0.973825,0.388033,0.36777,0.169702,0.75839,0.500432,0.423932,0.910362,2000-01-01 00:00:01
2000-01-01 00:00:02,0.79294,0.788402,0.529808,0.493923,0.030398,0.003776,0.236131,0.79075,0.383832,0.992554,...,0.371189,0.292284,0.893308,0.045706,0.759019,0.877136,0.452222,0.058064,0.646174,2000-01-01 00:00:02
2000-01-01 00:00:03,0.371363,0.012019,0.456096,0.884868,0.994874,0.231378,0.133658,0.72262,0.497263,0.909484,...,0.415123,0.391233,0.889884,0.075738,0.807016,0.566469,0.980149,0.647851,0.662446,2000-01-01 00:00:03
2000-01-01 00:00:04,0.244592,0.98451,0.111222,0.73648,0.590262,0.667714,0.13139,0.043353,0.679059,0.193599,...,0.050987,0.898388,0.174978,0.244764,0.635364,0.60434,0.087393,0.575411,0.411274,2000-01-01 00:00:04


In [148]:
time_start = time.time()
store = pd.HDFStore(hdf_path, key='a')
time_end = time.time()
print(time_end - time_start)

0.00989532470703125


In [149]:
time_start = time.time()
df_from_hdf5 = store.get(key='a')
time_end = time.time()
print(time_end - time_start)

0.045256614685058594


In [150]:
hdf_path_table = 'hdf_table.h5'

In [151]:
time_start = time.time()
df.to_hdf(hdf_path_table, key='a', mode='w', data_columns=True,
          format='table')
time_end = time.time()
print(time_end - time_start)

3.8115391731262207


In [152]:
time_start = time.time()
df_from_hdf5_subset = pd.read_hdf(hdf_path_table,'a',where='col_0 > 0.5')
time_end = time.time()
print(time_end - time_start)

0.22289276123046875


In [153]:
df_from_hdf5_subset.size

5044546

In [154]:
df_from_hdf5_subset.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_91,col_92,col_93,col_94,col_95,col_96,col_97,col_98,col_99,time
2000-01-01 00:00:00,0.880185,0.341446,0.530834,0.927703,0.913664,0.435048,0.910705,0.326699,0.071028,0.102509,...,0.473896,0.730774,0.61705,0.992128,0.163981,0.472986,0.225467,0.636388,0.034135,2000-01-01 00:00:00
2000-01-01 00:00:01,0.605338,0.308805,0.527641,0.910448,0.622999,0.061641,0.701104,0.741101,0.654573,0.423729,...,0.664549,0.973825,0.388033,0.36777,0.169702,0.75839,0.500432,0.423932,0.910362,2000-01-01 00:00:01
2000-01-01 00:00:02,0.79294,0.788402,0.529808,0.493923,0.030398,0.003776,0.236131,0.79075,0.383832,0.992554,...,0.371189,0.292284,0.893308,0.045706,0.759019,0.877136,0.452222,0.058064,0.646174,2000-01-01 00:00:02
2000-01-01 00:00:05,0.806128,0.777692,0.020415,0.549179,0.912302,0.517999,0.173441,0.926867,0.690737,0.071964,...,0.114294,0.390363,0.608217,0.584343,0.318525,0.865685,0.950093,0.543761,0.040014,2000-01-01 00:00:05
2000-01-01 00:00:08,0.800815,0.881069,0.204519,0.856634,0.342663,0.8778,0.12174,0.169093,0.276134,0.341765,...,0.297871,0.792464,0.759755,0.587354,0.526589,0.656214,0.288808,0.607831,0.527161,2000-01-01 00:00:08


In [155]:
df.size

10100000