In [None]:
# Import required modules
import sys
import pandas as pd
from pymongo import MongoClient
import numpy as np
import pymongoarrow
from pymongoarrow.monkey import patch_all
import time
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as mticker

In [None]:
# Patch the pymongoarrow helpers into the pymongo API so that they can be called
# directly on collection objects (the MongoDB benchmark further down relies on
# this when it calls find_pandas_all on the collection).
patch_all()

In [None]:
# Connect to the MongoDB server on this machine
client = MongoClient('localhost')
# Handle to the database named testDB
db = client['testDB']
# Start from a clean slate: drop any leftover collection, then recreate it as a
# time-series collection whose time field is the 'time' column
db.drop_collection('time_series_collection')
db.create_collection('time_series_collection', timeseries={'timeField': 'time'})
# Handle to the freshly created time-series collection
mongo_collection = db['time_series_collection']

In [None]:
# Generate synthetic benchmark data: one DataFrame for every combination of
# row count (10**0 .. 10**n) and value-column count (10**0 .. 10**m).
# Each frame holds uniform random floats in col_0..col_{k-1}, a 1-second-spaced
# 'time' column starting at 2000-01-01 and a constant object-dtype 'strings' column.
SEED = 0  # fixed seed: every run benchmarks the same data and filter selectivity
rng = np.random.default_rng(SEED)
n = 5  # largest row count is 10**n
m = 2  # largest value-column count is 10**m
res_factor = 1  # number of sizes sampled per decade
row_counts = np.logspace(0, n, res_factor*n+1, dtype=int)
col_counts = np.logspace(0, m, res_factor*m+1, dtype=int)

data_list = []   # data_list[a][b]: frame with row_counts[a] rows and col_counts[b] value columns
shape_list = []  # shape_list[a][b]: (rows, value columns) of that frame
# Cast to built-in int: multiplying a list by a numpy integer would not repeat the list
for n_rows in map(int, row_counts):
    data_sub_list = []
    shape_sub_list = []
    for n_cols in map(int, col_counts):
        # Random values in [0, 1)
        array = rng.random((n_rows, n_cols))
        # Convert to DataFrame and append the time and string columns
        df = pd.DataFrame(array, columns=['col_' + str(k) for k in range(n_cols)])
        df['time'] = pd.date_range('2000-01-01', freq='s', periods=n_rows)
        df['strings'] = pd.Series(['abcd'] * n_rows, dtype=object)
        data_sub_list.append(df)
        shape_sub_list.append(array.shape)
    data_list.append(data_sub_list)
    shape_list.append(shape_sub_list)

# x[a, b] = number of rows, y[a, b] = number of value columns of data_list[a][b]
shapes = np.asarray(shape_list)
x = shapes[..., 0]
y = shapes[..., 1]

In [None]:
# Benchmark MongoDB: for every synthetic DataFrame, time
#   (a) writing it into the emptied time-series collection and
#   (b) reading back, as a DataFrame, the documents with col_0 > 0.5.
# Results are indexed like data_list: [row-count index][column-count index].
# time.perf_counter() is used because it is monotonic and high-resolution, so
# very short operations are not measured as 0 s (which a log axis cannot show).
mongo_write_timing = []
mongo_read_timing = []
for frames in data_list:
    sub_list_write = []
    sub_list_read = []
    for frame in frames:
        # Empty the collection so every write starts from the same state
        mongo_collection.delete_many({})
        # Time writing the DataFrame into the collection
        time_start = time.perf_counter()
        pymongoarrow.api.write(mongo_collection, frame)
        time_stop = time.perf_counter()
        sub_list_write.append(time_stop - time_start)
        # Time the filtered read from the collection into a DataFrame
        time_start = time.perf_counter()
        test_data = mongo_collection.find_pandas_all({'col_0': {'$gt': 0.5}})
        time_stop = time.perf_counter()
        sub_list_read.append(time_stop - time_start)
    mongo_write_timing.append(sub_list_write)
    mongo_read_timing.append(sub_list_read)
mongo_write_timing_array = np.asarray(mongo_write_timing)
mongo_read_timing_array = np.asarray(mongo_read_timing)

In [None]:
# Path of the HDF5 file used by the HDF5 benchmarks and the key under which each
# test DataFrame is stored in it (this cell only defines the names; no file is
# created or opened here)
hdf_path = 'hdf.h5'
key = 'test'

In [None]:
# Measure write and read times of the DataFrames with HDF5
# Method 1: one HDFStore object that stays open for the whole benchmark, so the
# timings exclude the cost of opening/closing the file.
# NOTE: the Method 2 cell assigns the same hdf_*_timing_array names; whichever
# of the two cells ran last is what the comparison plots show.
hdf_write_timing = []
hdf_read_timing = []
# The context manager closes the store even if a put/select raises, so a failed
# run does not leave the file open.
with pd.HDFStore(hdf_path) as hdf_store:
    for frames in data_list:
        sub_list_write = []
        sub_list_read = []
        for frame in frames:
            # Remove the previous dataset; guarded so the first iteration on a
            # fresh file does not fail on a missing key
            if key in hdf_store:
                hdf_store.remove(key)
            # Time writing the frame in table format; col_0 is declared a data
            # column so it can be used in the where-clause below
            time_start = time.perf_counter()
            hdf_store.put(key, frame, format='table', data_columns=['col_0'])
            time_stop = time.perf_counter()
            sub_list_write.append(time_stop - time_start)
            # Time the filtered read of the rows with col_0 > 0.5
            time_start = time.perf_counter()
            test_data = hdf_store.select(key, where='col_0 > 0.5')
            time_stop = time.perf_counter()
            sub_list_read.append(time_stop - time_start)
        hdf_write_timing.append(sub_list_write)
        hdf_read_timing.append(sub_list_read)
hdf_write_timing_array = np.asarray(hdf_write_timing)
hdf_read_timing_array = np.asarray(hdf_read_timing)

In [None]:
# Measure write and read times of the DataFrames with HDF5
# Method 2: the pandas convenience wrappers DataFrame.to_hdf / pd.read_hdf, which
# take the file path on every call, so each call is timed as a whole.
# Results are indexed like data_list: [row-count index][column-count index].
hdf_write_timing = []
hdf_read_timing = []
for frames in data_list:
    sub_list_write = []
    sub_list_read = []
    for frame in frames:
        # Time saving the frame; mode='w' overwrites the file, so every write
        # starts from an empty file
        time_start = time.perf_counter()
        frame.to_hdf(hdf_path, key=key, mode='w', data_columns=['col_0'], format='table')
        time_stop = time.perf_counter()
        sub_list_write.append(time_stop - time_start)
        # Time the filtered read of the rows with col_0 > 0.5 from the file
        time_start = time.perf_counter()
        test_data = pd.read_hdf(hdf_path, key=key, where='col_0 > 0.5')
        time_stop = time.perf_counter()
        sub_list_read.append(time_stop - time_start)
    hdf_write_timing.append(sub_list_write)
    hdf_read_timing.append(sub_list_read)
hdf_write_timing_array = np.asarray(hdf_write_timing)
hdf_read_timing_array = np.asarray(hdf_read_timing)

In [None]:
# Write-time comparison: duration against number of rows on log-log axes.
# Blue markers are MongoDB, red markers are HDF5; the marker size grows with the
# column-count index, so larger dots belong to wider DataFrames.
fig, ax = plt.subplots()
write_series = (('b', mongo_write_timing_array), ('r', hdf_write_timing_array))
for col_idx in range(x.shape[1]):
    row_counts_for_col = x[:, col_idx]
    dot_size = (col_idx + 1) * 2
    # Mongo is drawn before HDF5 so the legend labels below match the first two lines
    for colour, timings in write_series:
        ax.plot(row_counts_for_col, timings[:, col_idx],
                color=colour, markersize=dot_size, marker='o', linestyle='')
ax.set_title('Write Times')
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Number of rows / -')
ax.set_ylabel('Time / s')
ax.legend(['Mongo', 'HDF5'])
plt.show()

In [None]:
# Read-time comparison: duration against number of rows on log-log axes.
# Blue markers are MongoDB, red markers are HDF5; the marker size grows with the
# column-count index, so larger dots belong to wider DataFrames.
fig, ax = plt.subplots()
read_series = (('b', mongo_read_timing_array), ('r', hdf_read_timing_array))
for col_idx in range(x.shape[1]):
    row_counts_for_col = x[:, col_idx]
    dot_size = (col_idx + 1) * 2
    # Mongo is drawn before HDF5 so the legend labels below match the first two lines
    for colour, timings in read_series:
        ax.plot(row_counts_for_col, timings[:, col_idx],
                color=colour, markersize=dot_size, marker='o', linestyle='')
ax.set_title('Read Times')
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Number of rows / -')
ax.set_ylabel('Time / s')
ax.legend(['Mongo', 'HDF5'])
plt.show()