In [1]:
from zfstools import connection
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import fastparquet
import tqdm
import os

In [4]:
%%bash
# Report the total on-disk size of the raw forex data directory.
du -sh /mnt/sdb/forex

2.2G	/mnt/sdb/forex


### RESET

In [78]:
%%bash
# Reset the experiment: destroy the snapshots of the `tss` dataset and empty /tss.
# -H prints bare snapshot names without a header row (no `tail -n +2` needed);
# -r tss limits the listing to `tss` and its children, so snapshots that belong
# to unrelated datasets on this machine are never destroyed.
zfs list -H -t snapshot -o name -r tss | xargs -I {} zfs destroy {}
# -f keeps the cell from failing when /tss is already empty.
rm -f /tss/*

### Helper Functions

In [2]:
def read_test():
    """Run three read benchmarks over every file in snapshot 1 of the store.

    The file list is taken from '/tss/.zfs/snapshot/1' when the function is
    called. Each benchmark walks all files under a tqdm progress bar and
    discards what it reads.
    """
    snapshot_dir = '/tss/.zfs/snapshot/1'
    paths = [os.path.join(snapshot_dir, name) for name in os.listdir(snapshot_dir)]

    def full_series():
        # Load each file completely into a pandas DataFrame.
        for path in tqdm.tqdm(paths):
            pq.read_table(path).to_pandas()

    def last_value():
        # Read only the final row group of each file and take its last row.
        for path in tqdm.tqdm(paths):
            parquet_file = pq.ParquetFile(path)
            final_group = parquet_file.num_row_groups - 1
            parquet_file.read_row_group(final_group).to_pandas()[-1:]

    def all_values_one_day(day):
        # Load each file completely, then keep rows whose seriesdate starts with `day`.
        for path in tqdm.tqdm(paths):
            frame = pq.read_table(path).to_pandas()
            frame[frame.seriesdate.str.startswith(day)]

    full_series()
    last_value()
    all_values_one_day('20090701')
    
def split_df(df, n):
    """Split ``df`` into ``n + 1`` parts: a head followed by its last ``n`` rows.

    Args:
        df: A sliceable, sized object (a pandas DataFrame in this notebook).
        n: Number of trailing rows to peel off as individual one-row parts.

    Returns:
        A list of ``n + 1`` slices. The first holds the first ``len(df) - n``
        rows; each of the remaining ``n`` holds exactly one of the last ``n``
        rows, in order. Concatenating all parts reproduces ``df``.
    """
    # Offset where the one-row tail pieces begin. Computing it explicitly (rather
    # than slicing with ``-n``) keeps ``n == 0`` from yielding an empty head, and
    # clamping at 0 keeps ``n > len(df)`` from wrapping around.
    head_len = max(len(df) - n, 0)
    tail_parts = [df[head_len + i:head_len + i + 1] for i in range(n)]
    return [df[:head_len]] + tail_parts

### Globals

In [3]:
# Directory holding the raw forex CSV files.
forex_loc = '/mnt/sdb/forex'
# os.listdir returns entries in arbitrary order; sort them so that anything
# derived from a file's position in this list is the same on every run.
paths = [os.path.join(forex_loc, f) for f in sorted(os.listdir(forex_loc))]
# Connection the stores use to take ZFS snapshots.
zfs = connection.ZFSConnection()
# Column names applied to the CSV files when they are read.
names = ['symbol', 'seriesdate', 'low', 'high']

# Number of trailing rows peeled off each chunk to be replayed as appends.
n = 100

In [4]:
# Read every CSV and cut each one into 400 consecutive chunks.
dfs = []
for p in paths:
    for df in np.array_split(pd.read_csv(p, names=names), 400):
        dfs.append(df)
# Overwrite each chunk's symbol column with the chunk's position in `dfs`.
for i, df in enumerate(dfs):
    df.symbol = i
# Split every chunk with split_df(chunk, n).
valueses = [split_df(df, n) for df in dfs]
# Total row count over all chunks (displayed as the cell result).
sum(len(chunk) for chunk in dfs)

50541046

In [20]:
class TimeSeriesStoreSingleThread(object):
    """Store that keeps one parquet file per series under /tss, written sequentially.

    Every write/append pass finishes by calling
    ``zfs.snapshot_recursively('tss', pid)``.
    """

    def __init__(self):
        # A file holding more row groups than this is rewritten from its full
        # contents before the next append.
        self.max_row_groups = 50

    def write(self, dfs, pid):
        """Write ``dfs[k]`` to '/tss/k.parquet' for every k, then snapshot as ``pid``."""
        for position, frame in tqdm.tqdm(enumerate(dfs), total=len(dfs)):
            target = '/tss/{}.parquet'.format(position)
            fastparquet.write(target, frame, file_scheme='simple')
        zfs.snapshot_recursively('tss', pid)

    def append(self, dfs, pid):
        """Append ``dfs[k]`` to '/tss/k.parquet' for every k, then snapshot as ``pid``."""
        for position, frame in enumerate(dfs):
            path = '/tss/{}.parquet'.format(position)
            row_group_count = len(fastparquet.api.ParquetFile(path).row_groups)
            if row_group_count > self.max_row_groups:
                # Read the whole file back and rewrite it without append.
                contents = fastparquet.api.ParquetFile(path).to_pandas()
                fastparquet.write(path, contents, file_scheme='simple')
            fastparquet.write(path, frame, file_scheme='simple', append=True)
        zfs.snapshot_recursively('tss', pid)

In [16]:
# Instantiate the single-threaded store.
store = TimeSeriesStoreSingleThread()

In [17]:
# Initial write: part 0 of every split chunk, snapshotted as 0.
store.write([v[0] for v in valueses], 0)

100%|██████████| 9600/9600 [00:50<00:00, 190.32it/s]


In [18]:
# Append pass for i = 1: part 1 of every chunk, snapshotted as 1.
_ = [store.append([v[i] for v in valueses], i) for i in tqdm.tqdm(range(1, 2))]

100%|██████████| 1/1 [00:40<00:00, 40.82s/it]


In [None]:
# Append passes for i = 2, 3, 4; each pass is snapshotted under its own i.
_ = [store.append([v[i] for v in valueses], i) for i in tqdm.tqdm(range(2, 5))]

In [67]:
# Run the read benchmarks; each line of output is one benchmark's progress bar.
read_test()

100%|██████████| 9600/9600 [00:17<00:00, 556.37it/s]
100%|██████████| 9600/9600 [00:10<00:00, 874.68it/s]
100%|██████████| 9600/9600 [00:38<00:00, 246.48it/s]


In [68]:
from threading import Thread

def chunker(seq, size):
    """Yield consecutive slices of ``seq`` holding at most ``size`` items each.

    Args:
        seq: A sized, sliceable sequence (list, tuple, str, ...).
        size: Maximum length of each slice; must be a positive integer.

    Returns:
        A generator of slices of ``seq`` in order. The last slice may be
        shorter; an empty ``seq`` yields nothing.
    """
    # `range` instead of the Python-2-only `xrange`: same result here, and the
    # helper keeps working on Python 3.
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

class TimeSeriesStoreMultithread(object):
    """Store that keeps one parquet file per series under /tss, written by threads.

    The (index, frame) pairs are cut into batches of ``batch_size``; one thread
    is started per batch and all threads are joined before
    ``zfs.snapshot_recursively('tss', pid)`` is called.
    """

    def __init__(self):
        # A file holding more row groups than this is rewritten from its full
        # contents before the next append.
        self.max_row_groups = 5
        # Number of (index, frame) pairs handed to each worker thread.
        self.batch_size = 1000

    def __write__(self, idx_dfs):
        """Thread worker: write each (index, frame) pair to '/tss/<index>.parquet'."""
        for position, frame in idx_dfs:
            target = '/tss/{}.parquet'.format(position)
            fastparquet.write(target, frame, file_scheme='simple')

    def __append__(self, idx_dfs):
        """Thread worker: append each (index, frame) pair to '/tss/<index>.parquet'."""
        for position, frame in idx_dfs:
            path = '/tss/{}.parquet'.format(position)
            row_group_count = len(fastparquet.api.ParquetFile(path).row_groups)
            if row_group_count > self.max_row_groups:
                # Read the whole file back and rewrite it without append.
                contents = fastparquet.api.ParquetFile(path).to_pandas()
                fastparquet.write(path, contents, file_scheme='simple')
            fastparquet.write(path, frame, file_scheme='simple', append=True)

    def _run_batches(self, worker, dfs, pid):
        """Run ``worker`` over batches of indexed frames in threads, then snapshot."""
        indexed = list(enumerate(dfs))
        threads = [
            Thread(target=worker, args=[batch])
            for batch in chunker(indexed, self.batch_size)
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        zfs.snapshot_recursively('tss', pid)

    def write(self, dfs, pid):
        """Write ``dfs[k]`` to '/tss/k.parquet' for every k, then snapshot as ``pid``."""
        self._run_batches(self.__write__, dfs, pid)

    def append(self, dfs, pid):
        """Append ``dfs[k]`` to '/tss/k.parquet' for every k, then snapshot as ``pid``."""
        self._run_batches(self.__append__, dfs, pid)

In [69]:
# Rebind `store` to the multithreaded implementation.
store = TimeSeriesStoreMultithread()

In [71]:
%%timeit -n1 -r1
# Timed initial write: part 0 of every split chunk, snapshotted as 0.
store.write([v[0] for v in valueses], 0)

1 loop, best of 1: 1min 41s per loop


In [72]:
%%timeit -n1 -r1
# Timed append pass for i = 1: part 1 of every chunk, snapshotted as 1.
_ = [store.append([v[i] for v in valueses], i) for i in tqdm.tqdm(range(1, 2))]

100%|██████████| 1/1 [01:33<00:00, 93.41s/it]

1 loop, best of 1: 1min 33s per loop





In [73]:
# Re-run the read benchmarks after the multithreaded write/append.
read_test()

100%|██████████| 9600/9600 [00:24<00:00, 384.25it/s]
100%|██████████| 9600/9600 [00:10<00:00, 955.37it/s]
100%|██████████| 9600/9600 [00:38<00:00, 248.48it/s]


In [None]:
import pyspark
# Build (or reuse) a Spark session with 16g configured for executor and driver.
spark = (
    pyspark.sql.session.SparkSession.Builder()
    .config('spark.executor.memory', '16g')
    .config('spark.driver.memory', '16g')
    .getOrCreate()
)
# SparkContext belonging to that session.
sc = spark.sparkContext

In [None]:
def __write__(df, df_i):
    """Write ``df`` to '/tss/<df_i>.parquet' as a single parquet file; return True."""
    target = '/tss/{}.parquet'.format(df_i)
    fastparquet.write(target, df, file_scheme='simple')
    return True
    
def __append__(idx_dfs):
    """Placeholder for the Spark append worker; not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    # The original spelled the exception `NotIplementedError`, which raised
    # NameError instead of the intended NotImplementedError.
    raise NotImplementedError()
        
class TimeSeriesStoreSpark(object):
    """Store that writes one parquet file per RDD element via Spark."""

    def write(self, dfs, pid):
        """Write every element of the RDD ``dfs`` to '/tss/<index>.parquet'.

        Args:
            dfs: An RDD of DataFrames; each element is paired with its index
                through ``zipWithIndex`` and passed to ``__write__``.
            pid: Name given to the ZFS snapshot taken after the write.

        Returns:
            The collected list of ``__write__`` results, one per element.
        """
        # Use the `dfs` argument (the original referenced an undefined `rdd`)
        # and index into the pair instead of a tuple-unpacking lambda, which
        # is a syntax error on Python 3.
        # NOTE(review): the workers write to the local path /tss — presumably
        # this only makes sense with Spark running on this machine; confirm.
        r = dfs.zipWithIndex().map(lambda pair: __write__(pair[0], pair[1])).collect()
        zfs.snapshot_recursively('tss', pid)
        return r

    def append(self, dfs, pid):
        """Append is not implemented for the Spark store.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()

In [None]:
# Rebind `store` to the Spark implementation.
store = TimeSeriesStoreSpark()

In [None]:
# Rebuild the list of raw CSV paths.
forex_loc = '/mnt/sdb/forex'
paths = [os.path.join(forex_loc, f) for f in os.listdir(forex_loc)]

# One Spark partition per CSV: read the file, cut it into 250 chunks and split
# every chunk with split_df(chunk, n). Rebinds `valueses` to an RDD.
valueses = (
    sc.parallelize(paths, len(paths))
    .map(lambda p: pd.read_csv(p, names=['symbol', 'seriesdate', 'low', 'high']))
    .flatMap(lambda df: np.array_split(df, 250))
    .map(lambda x: split_df(x, n))
)

In [None]:
# Keep only part 0 of every split chunk and mark the resulting RDD as cached.
# (The original lambda was missing its colon and did not parse.)
dfs = valueses.map(lambda parts: parts[0]).cache()
_ = dfs.count()     # cache results
store.write(dfs, 0) # 2 minutes ???

In [12]:
class TimeSeriesBatched(object):
    """Store that keeps all series together in the single file '/tss/df.parquet'.

    Every write/append finishes by calling
    ``zfs.snapshot_recursively('tss', pid)``.

    NOTE(review): the fastparquet docs describe ``partition_on`` as ignored when
    ``file_scheme`` is 'simple', so the ``partition_on=['symbol']`` arguments
    below may have no effect — confirm.
    """

    def __init__(self):
        # Once the file holds more row groups than this it is rewritten from
        # its full contents before the next append.
        self.max_row_groups = 100

    def write(self, big_df, pid):
        """Write ``big_df`` to '/tss/df.parquet', then snapshot as ``pid``."""
        target = '/tss/df.parquet'
        fastparquet.write(target, big_df, file_scheme='simple', partition_on=['symbol'])
        zfs.snapshot_recursively('tss', pid)

    def append(self, big_df, pid):
        """Append ``big_df`` to '/tss/df.parquet', then snapshot as ``pid``."""
        path = '/tss/df.parquet'
        row_group_count = len(fastparquet.api.ParquetFile(path).row_groups)
        if row_group_count > self.max_row_groups:
            # Read the whole file back and rewrite it without append.
            existing = fastparquet.api.ParquetFile(path).to_pandas()
            fastparquet.write(path, existing, file_scheme='simple')
        fastparquet.write(path, big_df, file_scheme='simple', partition_on=['symbol'], append=True)
        zfs.snapshot_recursively('tss', pid)


In [13]:
# Rebind `store` to the batched implementation.
store = TimeSeriesBatched()
# Part 0 of every split chunk, concatenated into one frame for the initial write.
big_df = pd.concat([v[0] for v in valueses])
# Part 1 of every split chunk, concatenated into one frame for the first append.
first_append = pd.concat([v[1] for v in valueses])

In [79]:
%%timeit -n1 -r1
# Timed initial write of the concatenated frame, snapshotted as 0.
store.write(big_df, 0)

1 loop, best of 1: 26.3 s per loop


In [80]:
%%timeit -n1 -r1
# Timed first append of the concatenated part-1 frame, snapshotted as 1.
store.append(first_append, 1)

1 loop, best of 1: 347 ms per loop


In [81]:
def run():
    """Append parts 2 .. n-1 of every chunk to the batched store.

    For each i the i-th part of every entry in ``valueses`` is concatenated
    into one frame, appended, and snapshotted under i.
    """
    for i in tqdm.tqdm(range(2, n)):
        df = pd.concat([v[i] for v in valueses])
        # Append the frame built for this iteration. The original passed
        # `first_append` here, so every pass re-appended part 1 and `df`
        # was never used.
        store.append(df, i)

run()

100%|██████████| 98/98 [03:55<00:00,  2.40s/it]


In [82]:
%%bash
# List the ZFS snapshots that exist after the append run.
zfs list -t snapshot

NAME     USED  AVAIL  REFER  MOUNTPOINT
tss@0   41.5K      -   748M  -
tss@1   97.5K      -   748M  -
tss@2    116K      -   749M  -
tss@3     36K      -   749M  -
tss@4     55K      -   749M  -
tss@5   72.5K      -   750M  -
tss@6   90.5K      -   750M  -
tss@7    109K      -   750M  -
tss@8     28K      -   750M  -
tss@9     47K      -   751M  -
tss@10  64.5K      -   751M  -
tss@11    82K      -   751M  -
tss@12   101K      -   752M  -
tss@13   118K      -   752M  -
tss@14  39.5K      -   752M  -
tss@15  57.5K      -   753M  -
tss@16    76K      -   753M  -
tss@17    82K      -   753M  -
tss@18    99K      -   754M  -
tss@19   118K      -   754M  -
tss@20    38K      -   754M  -
tss@21  55.5K      -   754M  -
tss@22  73.5K      -   755M  -
tss@23  91.5K      -   755M  -
tss@24   110K      -   755M  -
tss@25  30.5K      -   756M  -
tss@26  48.5K      -   756M  -
tss@27  66.5K      -   756M  -
tss@28  84.5K      -   757M  -
tss@29   102K      -   757M  -
tss@30   122K      -   757M  -

In [None]:
def read_test():
    """Run the three read benchmarks against the batched store's snapshot file.

    This redefines the per-file ``read_test`` from earlier in the notebook for
    the single-file layout written by TimeSeriesBatched.
    """
    path = '/tss/.zfs/snapshot/1/df.parquet'
    # Iterate over the snapshot file defined above. The original loops used the
    # module-level `paths` (the raw CSV locations) and never touched `path`.
    paths = [path]

    def full_series():
        # Load the file completely into a pandas DataFrame.
        for p in tqdm.tqdm(paths):
            df = pq.read_table(p).to_pandas()

    def last_value():
        # Read only the final row group and take its last row.
        for p in tqdm.tqdm(paths):
            pq_file = pq.ParquetFile(p)
            last = pq_file.read_row_group(pq_file.num_row_groups - 1).to_pandas()[-1:]

    def all_values_one_day(day):
        # Load the file completely, then keep rows whose seriesdate starts with `day`.
        for p in tqdm.tqdm(paths):
            df = pq.read_table(p).to_pandas()
            df_one_day = df[df.seriesdate.str.startswith(day)]

    full_series()
    last_value()
    all_values_one_day('20090701')

In [5]:
# Open the batched store's file as it exists in snapshot 1, using pyarrow.
path = '/tss/.zfs/snapshot/1/df.parquet'
pq_file = pq.ParquetFile(path)

In [9]:
%%timeit -n1 -r1
# Timed full read of the file at `path` into pandas; the frame is dropped right away.
tmp = pq.read_pandas(path).to_pandas()
del tmp

1 loop, best of 1: 5.99 s per loop


In [29]:
# Point `path` at the second test file and open it as a pyarrow ParquetDataset.
# NOTE(review): this file is written by a cell that appears later in the notebook
# (execution count 31) — run that cell first on a fresh kernel.
path = '/tss/df2.parquet'

ds = pq.ParquetDataset(path)

In [33]:
# Re-open the same file with fastparquet (rebinds `pq_file`).
pq_file = fastparquet.ParquetFile(path)

In [34]:
# Display the file's `cats` attribute (presumably its partition categories — confirm).
pq_file.cats

{}

In [31]:
%%timeit -n1 -r1
# Timed write of big_df to a second file, /tss/df2.parquet.
fastparquet.write('/tss/df2.parquet', big_df, file_scheme='simple', partition_on=['symbol'])

1 loop, best of 1: 20 s per loop


In [35]:
# Display the pyarrow.parquet module object to see which installation is imported.
pq

<module 'pyarrow.parquet' from '/home/mikeokslonger/anaconda2/lib/python2.7/site-packages/pyarrow/parquet.pyc'>