In [103]:
import pyarrow
import pyarrow.plasma as plasma
import numpy as np
import timeit
import time
import pickle
import sys

In [37]:
NUM_STRINGS = 2*10**5
STRING_LENGTH = 20
short_numpy_array = np.array([1,2,3,4,5])
large_numpy_array = np.array([i for i in range(NUM_STRINGS)])
very_large_numpy_array = np.array([i for i in range(NUM_STRINGS * 10)])

### Run plasma store with `plasma_store -m 10000000 -s /tmp/plasma` (allocated 10MB of storage)

In [258]:
plasma_client = plasma.connect("/tmp/plasma", "", 0)

In [123]:
serialization_context = pyarrow._default_serialization_context
#set pickle and pytorch statements here?

In [74]:
t_start = time.time()
plasma_client.put(short_numpy_array)
t_end = time.time()
print t_end - t_start

0.00124001502991


In [75]:
t_start = time.time()
plasma_client.put(large_numpy_array)
t_end = time.time()
print t_end - t_start

0.00404405593872


### testing the plasma put function line by line

In [80]:
def line_testing_pyarrow_put(arr):
    t_total_start = time.time()
    
    t_start = time.time()
    serialized = pyarrow.serialize(arr, serialization_context)
    t_end = time.time()
    print "serialization time: " + '\t' + '\t' + str(t_end - t_start)

    t_start = time.time()
    target_id = plasma.ObjectID(str(np.random.randint(100000)))
    t_end = time.time()
    print "oid selection time: " + '\t' + '\t' + str(t_end - t_start)

    t_start = time.time()
    buffer = plasma_client.create(target_id, serialized.total_bytes)
    t_end = time.time()
    print "buffer creation time: " + '\t' + '\t' + str(t_end - t_start)

    t_start = time.time()
    stream = pyarrow.FixedSizeBufferWriter(buffer)
    t_end = time.time()
    print "initialize buffer writer time: " + '\t' + str(t_end - t_start)

    t_start = time.time()
    serialized.write_to(stream)
    t_end = time.time()
    print "write to stream time: " + '\t' + '\t' + str(t_end - t_start)
    
    t_total_end = time.time()
    print "total pyarrow put time: " + '\t' + str(t_total_end - t_total_start)

In [81]:
line_testing_pyarrow_put(short_numpy_array)

serialization time: 		7.39097595215e-05
oid selection time: 		2.19345092773e-05
buffer creation time: 		0.000393867492676
initialize buffer writer time: 	4.05311584473e-06
write to stream time: 		4.98294830322e-05
total pyarrow put time: 	0.00103282928467


In [82]:
line_testing_pyarrow_put(large_numpy_array)

serialization time: 		0.000170946121216
oid selection time: 		2.59876251221e-05
buffer creation time: 		0.00133895874023
initialize buffer writer time: 	5.00679016113e-06
write to stream time: 		0.00197410583496
total pyarrow put time: 	0.0040020942688


In [67]:
#why is initializing the buffer the most expensive part?

## Memory Overhead Experiments

In [254]:
NUM_STRINGS = 131060 # to get exactly 1 MB array
one_meg_arr = np.array([i for i in range(NUM_STRINGS)])
float(sys.getsizeof(one_meg_arr))/1024.0/1024.0

1.0

In [159]:
for i in range(6):
    plasma_client.put(one_meg_arr)

In [160]:
plasma_client.put(one_meg_arr)

ObjectID(c2d2d33241ad32b654ca09d4420044835a573d5c)

In [None]:
#6 1MB arrays fit in 10MB store -> 6MB fits but 7MB doesn't

In [204]:
NUM_STRINGS = 12788 # to get exactly 100KB array
hundred_kilo_arr = np.array([i for i in range(NUM_STRINGS)])
float(sys.getsizeof(hundred_kilo_arr))/1024.0

100.0

In [205]:
for i in range(75):
    plasma_client.putr (hundred_kilo_arr)

In [209]:
plasma_client.put(hundred_kilo_arr)

ObjectID(321095db89644bbc46b57ad33beef51131bbf2be)

In [None]:
#79 100KB arrays fit in 10MB store -> 7.9MB fits but 8MB doesn't

In [216]:
NUM_STRINGS = 116 # to get exactly 100KB array
one_kilo_arr = np.array([i for i in range(NUM_STRINGS)])
float(sys.getsizeof(one_kilo_arr))/1024.0

1.0

In [253]:
for i in range(10):
    plasma_client.put(one_kilo_arr)

In [222]:
plasma_client.put(one_kilo_arr)

ObjectID(c9f998ae4b74903963cc14dc3971a90df0f7e68e)

In [None]:
#4770 1KB arrays fit in 10MB store -> 4.77MB data fits in 10MB but 4.78MB doesn't

## Huge Page Support Experiments

### run `plasma_store -s /tmp/plasma -m 10000000000 -d /tmp/hugepages -h` to start plasma store with huge page support - still working on getting this to work on Mac

In [256]:
NUM_STRINGS = 12788 # to get exactly 100KB array
hundred_kilo_arr = np.array([i for i in range(NUM_STRINGS)])
float(sys.getsizeof(hundred_kilo_arr))/1024.0

100.0