In [1]:
%matplotlib inline
# standard library
import itertools
import sys, os
import re
import glob

# pandas
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import dask


# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# tensorflow
import tensorflow as tf

# local imports
sys.path.append(os.path.join(os.getcwd(), "../src"))

import tf_utils

# this styling is purely my preference
# less chartjunk
# Everything is applied through a single sns.set() call: calling sns.set()
# after sns.set_context() resets the context and font scale to their defaults,
# and the matplotlib rc key is 'lines.linewidth' (not 'line.linewidth').
sns.set(
    context='notebook',
    style='ticks',
    palette='Set2',
    font_scale=1.5,
    rc={'lines.linewidth': 2.5},
)

In [2]:
# Smoke-test the numpy -> tensor helper on a small nested structure:
# a tuple holding a dict of arrays followed by a plain array.
struct = (
    {
        'X': np.array([[1, 2, 3], [4, 1, 2], [10., 1, 1]]),
        'Y': np.array([1, 1, 1]),
    },
    np.array([1, 2, 3]),
)
# Last expression of the cell, so the helper's return value is displayed.
tf_utils.get_tensors_and_feed_from_nested(struct)

(({'X': <tf.Tensor 'Placeholder_1:0' shape=(3, 3) dtype=float64>,
   'Y': <tf.Tensor 'Placeholder:0' shape=(3,) dtype=int64>},
  <tf.Tensor 'Placeholder_2:0' shape=(3,) dtype=int64>),
 {<tf.Tensor 'Placeholder:0' shape=(3,) dtype=int64>: array([1, 1, 1]),
  <tf.Tensor 'Placeholder_1:0' shape=(3, 3) dtype=float64>: array([[  1.,   2.,   3.],
         [  4.,   1.,   2.],
         [ 10.,   1.,   1.]]),
  <tf.Tensor 'Placeholder_2:0' shape=(3,) dtype=int64>: array([1, 2, 3])})

In [3]:
# Wrap the nested numpy structure in a dataset and fetch its elements one by one.
dataset, feed_dict = tf_utils.get_dataset_from_tensors(struct)
iterator = dataset.make_initializable_iterator()
elems = iterator.get_next()  # op yielding the next element on every run

# Drain the iterator once and print each element for a visual check.
with tf.Session() as sess:
    # the initializer is run with the feed dict returned alongside the dataset
    sess.run(iterator.initializer, feed_dict=feed_dict)
    try:
        while True:
            print(sess.run(elems))
    except tf.errors.OutOfRangeError:
        pass  # raised once the dataset is exhausted
            

({'Y': 1, 'X': array([ 1.,  2.,  3.])}, 1)
({'Y': 1, 'X': array([ 4.,  1.,  2.])}, 2)
({'Y': 1, 'X': array([ 10.,   1.,   1.])}, 3)


In [4]:
%%time
# now let's test with the csv dataset
uncached_dataset = tf_utils.build_dataset('../data/final/cleaneval/dom-full-test-0*.csv')

# iterate over it normally
elems = tf_utils.get_input_fn_from_dataset(uncached_dataset)( num_epochs=1, batch_size=2000, shuffle_buffer=0)

with tf.Session() as sess:
    # get each element of the training dataset until the end is reached
    while True:
        try:
            print(sess.run(elems))
        except tf.errors.OutOfRangeError:
            break
            

({'X': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32), 'weights': array([ 3.46447953,  3.46447953,  0.58433184, ...,  0.58433184,
        0.58433184,  0.58433184])}, array([ 1.,  1.,  0., ...,  0.,  0.,  0.], dtype=float32))
({'X': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32), 'weights': array([ 0.58433184,  0.58433184,  0.58433184, ...,  0.58433184,
        0.58433184,  0.58433184])}, array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32))
({'X': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       

In [5]:
%%time
cached_dataset, feed_dict = tf_utils.np_precache_dataset(uncached_dataset, len(dd.read_csv('../data/final/cleaneval/dom-full-test-0*.csv')))

# iterate to see the same results
input_fn, init_hook = tf_utils.get_input_fn_from_dataset(cached_dataset, feed_dict=feed_dict)
elems = input_fn(num_epochs=1, batch_size=2000, shuffle_buffer=0)

with tf.Session() as sess:
    # run the hook manually
    init_hook.after_create_session(sess, None)  # the coord is irrelevant here
    while True:
        try:
            print(sess.run(elems))
        except tf.errors.OutOfRangeError:
            break

({'X': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32), 'weights': array([ 3.46447953,  3.46447953,  0.58433184, ...,  0.58433184,
        0.58433184,  0.58433184])}, array([ 1.,  1.,  0., ...,  0.,  0.,  0.], dtype=float32))
({'X': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32), 'weights': array([ 0.58433184,  0.58433184,  0.58433184, ...,  0.58433184,
        0.58433184,  0.58433184])}, array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32))
({'X': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       

In [6]:
%%time
# this should now be instant
elems = input_fn(num_epochs=1, batch_size=2000, shuffle_buffer=0)

with tf.Session() as sess:
    # run the hook manually
    init_hook.after_create_session(sess, None)  # the coord is irrelevant here
    while True:
        try:
            print(sess.run(elems))
        except tf.errors.OutOfRangeError:
            break

({'X': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32), 'weights': array([ 3.46447953,  3.46447953,  0.58433184, ...,  0.58433184,
        0.58433184,  0.58433184])}, array([ 1.,  1.,  0., ...,  0.,  0.,  0.], dtype=float32))
({'X': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32), 'weights': array([ 0.58433184,  0.58433184,  0.58433184, ...,  0.58433184,
        0.58433184,  0.58433184])}, array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32))
({'X': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       

In [7]:
# Exercise the one-call helper: CSV glob in, (input_fn, init_hook) out, precached.
input_fn, init_hook = tf_utils.input_fn_from_csv(
    '../data/final/cleaneval/dom-full-test-*.csv',
    precache=True,
)
elems = input_fn(num_epochs=1, batch_size=2000, shuffle_buffer=1000)


with tf.Session() as sess:
    # trigger the hook by hand (the coordinator argument is irrelevant here)
    init_hook.after_create_session(sess, None)
    try:
        while True:
            print(sess.run(elems))
    except tf.errors.OutOfRangeError:
        pass  # all batches consumed

({'X': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32), 'weights': array([ 0.58713381,  0.58713381,  0.58713381, ...,  0.58713381,
        0.58713381,  0.58713381])}, array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32))
({'X': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32), 'weights': array([ 0.58713381,  0.58713381,  0.58713381, ...,  0.58713381,
        3.36915021,  0.58713381])}, array([ 0.,  0.,  0., ...,  0.,  1.,  0.], dtype=float32))
({'X': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       

In [10]:
%%time
input_fn_a, init_hook = tf_utils.input_fn_from_csv('../data/final/cleaneval/dom-full-test-*.csv', precache=True)
input_fn_b, init_hook = tf_utils.input_fn_from_csv('../data/final/cleaneval/dom-full-train-*.csv', precache=True)
input_fn_c, init_hook = tf_utils.input_fn_from_csv('../data/final/cleaneval/dom-full-validation-*.csv', precache=True)

 

The above code was just to test the memory usage. The result is that it uses **`4GB`** of memory for a dataset with a size of `1.1`. This is not ideal, but negligible on the current system.

In [9]:
%%time
input_fn_a, init_hook = tf_utils.input_fn_from_csv('../data/final/dragnet/dom-full-test-*.csv', precache=True)
input_fn_b, init_hook = tf_utils.input_fn_from_csv('../data/final/dragnet/dom-full-train-*.csv', precache=True)
input_fn_c, init_hook = tf_utils.input_fn_from_csv('../data/final/dragnet/dom-full-validation-*.csv', precache=True)

CPU times: user 4h 46min 35s, sys: 6min 30s, total: 4h 53min 5s
Wall time: 30min 24s
