# CSV to TFREcords convertor
This notebook tries to implement a method to convert from a fragmented csv dataset to a large TFRecrds one.

In [105]:
%matplotlib inline
# standard library
import itertools
import sys, os
import re
import glob

# pandas
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import dask


# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# tesnsorflow
import tensorflow as tf


# local imports
sys.path.append(os.path.join(os.getcwd(), "../src"))

from tf_utils import make_csv_pipeline, make_csv_col_tensors, make_csv_decoder

# this styling is purely my preference
# less chartjunk
sns.set_context('notebook', font_scale=1.5, rc={'line.linewidth': 2.5})
sns.set(style='ticks', palette='Set2')

In [2]:
def get_csv_tensors(csv_pattern):
    """Returns the queued tensors generated from the columns of """
    return make_csv_col_tensors(csv_pattern=csv_pattern, shuffle=False, num_epochs=1, csv_decoder_kwargs={'convert_ints': True})

In [3]:
def iterate_dataset(tensors):
    """Yields every row from the CSV as their corresponding tensors"""
    with tf.Session() as sess:
        # variables must be initialized otherwise it fails 
        
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        
        try:
            while not coord.should_stop():
                print(sess.run(tensors))
        except Exception as e:
            # Report exceptions to the coordinator.
            coord.request_stop(e)
        finally:
            # Terminate as usual. It is safe to call `coord.request_stop()` twice.
            coord.request_stop()
            coord.join(threads)

In [4]:
features, labels = get_feature_and_label_tens('../data/final/cleaneval/dom-full-train.csv', ['content_label'], excluded_cols=['path', 'url'])
features, labels

(<tf.Tensor 'stack:0' shape=(1572,) dtype=float32>,
 <tf.Tensor 'Select:0' shape=() dtype=float32>)

In [8]:
tens_dict = get_csv_tensors('../data/final/cleaneval/dom-full-train.csv')
for tensors in itertools.islice(iterate_dataset(tens_dict), 10):
    print(tensors)

{'depth': 3.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 69.0, 'class_len': 0.0, 'no_children': 67.0, 'text_len': 8594.0, 'descendant1_no_nodes': 67.0, 'descendant1_no_children_avg': 1.0149254, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.29850745, 'descendant1_class_len_avg': 2.3731344, 'descendant1_text_len_avg': 96.223877, 'descendant2_no_nodes': 68.0, 'descendant2_no_children_avg': 0.54411763, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 79.955879, 'descendant3_no_nodes': 37.0, 'descendant3_no_children_avg': 0.32432431, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 69.24324, 'descendant4_no_nodes': 12.0, 'descendant4_no_children_avg': 0.66666669, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 2.0, 'descendant5_no_no

{'depth': 5.0, 'sibling_pos': 2.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 1.0, 'text_len': 5.0, 'descendant1_no_nodes': 1.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 5.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_len

{'depth': 10.0, 'sibling_pos': 0.0, 'no_classes': 1.0, 'id_len': 0.0, 'class_len': 13.0, 'no_children': 0.0, 'text_len': 6.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_l

{'depth': 10.0, 'sibling_pos': 36.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 643.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id

{'depth': 7.0, 'sibling_pos': 0.0, 'no_classes': 1.0, 'id_len': 0.0, 'class_len': 18.0, 'no_children': 0.0, 'text_len': 0.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_le

{'depth': 6.0, 'sibling_pos': 1.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 8.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_len

{'depth': 11.0, 'sibling_pos': 19.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 1.0, 'text_len': 178.0, 'descendant1_no_nodes': 1.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 45.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_i

{'depth': 17.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 53.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_l

{'depth': 4.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 7.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 0.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_len

{'depth': 16.0, 'sibling_pos': 14.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 0.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_l

{'depth': 14.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 6.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_le

{'depth': 8.0, 'sibling_pos': 6.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 1.0, 'text_len': 20.0, 'descendant1_no_nodes': 1.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 20.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_l

{'depth': 4.0, 'sibling_pos': 1.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 101.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_l

{'depth': 20.0, 'sibling_pos': 7.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 1.0, 'text_len': 114.0, 'descendant1_no_nodes': 1.0, 'descendant1_no_children_avg': 2.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 112.0, 'descendant2_no_nodes': 2.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_i

{'depth': 4.0, 'sibling_pos': 12.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 0.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_le

{'depth': 9.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 1.0, 'text_len': 0.0, 'descendant1_no_nodes': 1.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_len

{'depth': 5.0, 'sibling_pos': 120.0, 'no_classes': 1.0, 'id_len': 0.0, 'class_len': 11.0, 'no_children': 0.0, 'text_len': 313.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_i

{'depth': 11.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 12.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_l

{'depth': 12.0, 'sibling_pos': 1.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 10.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_l

{'depth': 19.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 200.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_

{'depth': 15.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 1.0, 'text_len': 54.0, 'descendant1_no_nodes': 1.0, 'descendant1_no_children_avg': 1.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 53.0, 'descendant2_no_nodes': 1.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 51.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id

{'depth': 17.0, 'sibling_pos': 22.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 1.0, 'text_len': 26.0, 'descendant1_no_nodes': 1.0, 'descendant1_no_children_avg': 1.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 26.0, 'descendant2_no_nodes': 1.0, 'descendant2_no_children_avg': 1.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 26.0, 'descendant3_no_nodes': 1.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 24.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_

{'depth': 8.0, 'sibling_pos': 2.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 1.0, 'text_len': 0.0, 'descendant1_no_nodes': 1.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_len

{'depth': 10.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 0.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_le

{'depth': 8.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 1.0, 'text_len': 0.0, 'descendant1_no_nodes': 1.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_len

{'depth': 4.0, 'sibling_pos': 6.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 0.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_len

{'depth': 17.0, 'sibling_pos': 3.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 0.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_le

{'depth': 17.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 40.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_l

{'depth': 5.0, 'sibling_pos': 2.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 0.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_len

{'depth': 19.0, 'sibling_pos': 2.0, 'no_classes': 1.0, 'id_len': 0.0, 'class_len': 20.0, 'no_children': 1.0, 'text_len': 0.0, 'descendant1_no_nodes': 1.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_l

{'depth': 20.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 7.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_le

{'depth': 14.0, 'sibling_pos': 7.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 0.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_le

{'depth': 12.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 0.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_le

{'depth': 4.0, 'sibling_pos': 11.0, 'no_classes': 0.0, 'id_len': 10.0, 'class_len': 0.0, 'no_children': 1.0, 'text_len': 28.0, 'descendant1_no_nodes': 1.0, 'descendant1_no_children_avg': 1.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 16.0, 'descendant2_no_nodes': 1.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id

{'depth': 6.0, 'sibling_pos': 12.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 0.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_le

{'depth': 8.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 1.0, 'text_len': 19.0, 'descendant1_no_nodes': 1.0, 'descendant1_no_children_avg': 1.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 19.0, 'descendant2_no_nodes': 1.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 7.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_l

{'depth': 8.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 1.0, 'text_len': 19.0, 'descendant1_no_nodes': 1.0, 'descendant1_no_children_avg': 1.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 19.0, 'descendant2_no_nodes': 1.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 7.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_l

{'depth': 7.0, 'sibling_pos': 0.0, 'no_classes': 0.0, 'id_len': 0.0, 'class_len': 0.0, 'no_children': 0.0, 'text_len': 7.0, 'descendant1_no_nodes': 0.0, 'descendant1_no_children_avg': 0.0, 'descendant1_id_len_avg': 0.0, 'descendant1_no_classes_avg': 0.0, 'descendant1_class_len_avg': 0.0, 'descendant1_text_len_avg': 0.0, 'descendant2_no_nodes': 0.0, 'descendant2_no_children_avg': 0.0, 'descendant2_id_len_avg': 0.0, 'descendant2_no_classes_avg': 0.0, 'descendant2_class_len_avg': 0.0, 'descendant2_text_len_avg': 0.0, 'descendant3_no_nodes': 0.0, 'descendant3_no_children_avg': 0.0, 'descendant3_id_len_avg': 0.0, 'descendant3_no_classes_avg': 0.0, 'descendant3_class_len_avg': 0.0, 'descendant3_text_len_avg': 0.0, 'descendant4_no_nodes': 0.0, 'descendant4_no_children_avg': 0.0, 'descendant4_id_len_avg': 0.0, 'descendant4_no_classes_avg': 0.0, 'descendant4_class_len_avg': 0.0, 'descendant4_text_len_avg': 0.0, 'descendant5_no_nodes': 0.0, 'descendant5_no_children_avg': 0.0, 'descendant5_id_len

KeyboardInterrupt: 

In [95]:
# conversion functions
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode() if type(value) is str else value]))

def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

In [96]:
def example_constructor_from_tf_types(feature_types):
    """Given a dict of {'feature_name': tf.type} generate an example-from-dict
    function that takes a dict, like the one returned
    from `sess.run()` and returns an `Example` object"""
    # map tf.types to functions
    func_mapping = {
        tf.int64: _int64_feature,
        tf.string: _bytes_feature,
        tf.float64: _float_feature
    }
    feature_func_mapping = {
        k: func_mapping[v] for k,v in feature_types.items()
    }
    
    def convert(feature_values):
        return tf.train.Example(features=tf.train.Features(feature={k: feature_func_mapping[k](v) for k,v in feature_values.items()}))
    return convert

conv_func = example_constructor_from_tf_types({'a': tf.int64, 'b': tf.float64})
conv_func({'a': 2, 'b': 3.2})

features {
  feature {
    key: "a"
    value {
      int64_list {
        value: 2
      }
    }
  }
  feature {
    key: "b"
    value {
      float_list {
        value: 3.200000047683716
      }
    }
  }
}

In [98]:
def csv_to_tf_types(csv_pattern, **kws):
    """Given a csv_pattern, analyze the headers
    of those csv files and output a dict of {'feature_name': tf_type}"""
    ddf = dd.read_csv(csv_pattern, **kws)
    
    feature_type = {}
    for name, feat_type in zip(ddf.columns, ddf.dtypes):
        if np.issubdtype(feat_type, np.integer) or feat_type.name == 'bool':
            feat_type = tf.int64
        elif np.issubdtype(feat_type, np.float):
            feat_type = tf.float64
        else:
            feat_type = tf.string
        feature_type[name] = feat_type  # assign the feature type
    
    return feature_type

csv_to_tf_types('../data/final/cleaneval/dom-full-00.csv')['content_label']

tf.int64

In [99]:
func = example_constructor_from_tf_types(csv_to_tf_types('../data/final/cleaneval/dom-full-00.csv'))
ddf = dd.read_csv('../data/final/cleaneval/dom-full-00.csv')
record = {k: v[0] for k,v in ddf.head(1).to_dict().items()}

In [100]:
func(record)

features {
  feature {
    key: "ancestor1_class_len"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "ancestor1_depth"
    value {
      int64_list {
        value: 2
      }
    }
  }
  feature {
    key: "ancestor1_id_len"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "ancestor1_no_children"
    value {
      int64_list {
        value: 1
      }
    }
  }
  feature {
    key: "ancestor1_no_classes"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "ancestor1_sibling_pos"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "ancestor1_tag_"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "ancestor1_tag_a"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "ancestor1_tag_acronym"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    k

In [104]:
def feature_dict_from_tf_types(feature_types):
    """Returns a feature specification to pass
    to parse_single_example when decoding the data"""
    return {k: tf.FixedLenFeature([], v) for k, v in feature_types.items()}

feature_dict_from_tf_types(csv_to_tf_types('../data/final/cleaneval/dom-full-00.csv'))

{'depth': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'sibling_pos': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'no_classes': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'id_len': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'class_len': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'no_children': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'text_len': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'descendant1_no_nodes': FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
 'descendant1_no_children_avg': FixedLenFeature(shape=[], dtype=tf.float64, default_value=None),
 'descendant1_id_len_avg': FixedLenFeature(shape=[], dtype=tf.float64, default_value=None),
 'descendant1_no_classes_avg': FixedLenFeature(shape=[], dtype=tf.float64, default_value=None),
 'descendant1_class_len_avg': FixedLenFeature(shape=[], dtype=tf.float64, default_value=Non

In [118]:
def csv_dataset(csv_pattern, label_name, convert_ints=False):
    """Creates a `Dataset` object from csv patterns.
    The label name is given throught the second argument."""
    def decode_line(line):
        # decode the csvs
        ddf = dd.read_csv(csv_pattern)
        dtypes = ddf.dtypes  # get the dtypes
        
        # decode them as tensors
        decoded_tensors = make_csv_decoder(line, dtypes, convert_ints=convert_ints)
        tens_dict = {k: v for k, v in zip(ddf.columns, decoded_tensors)}
        
        # pop the label and return the tuple
        label = tens_dict.pop(label_name)
        return tens_dict, label
        
    path =  tf.matching_files(csv_pattern)  # use the pattern
    dataset = tf.contrib.data.TextLineDataset(path).skip(1)  # line by line reader(no header)
    dataset = dataset.map(decode_line)

    return dataset    

In [119]:
dataset = csv_dataset('../data/final/cleaneval/dom-full-train-*.csv', 'content_label')

In [120]:
dataset.output_types

({'ancestor1_class_len': tf.int32,
  'ancestor1_depth': tf.int32,
  'ancestor1_id_len': tf.int32,
  'ancestor1_no_children': tf.int32,
  'ancestor1_no_classes': tf.int32,
  'ancestor1_sibling_pos': tf.int32,
  'ancestor1_tag_': tf.int32,
  'ancestor1_tag_a': tf.int32,
  'ancestor1_tag_acronym': tf.int32,
  'ancestor1_tag_address': tf.int32,
  'ancestor1_tag_align': tf.int32,
  'ancestor1_tag_applet': tf.int32,
  'ancestor1_tag_artbody': tf.int32,
  'ancestor1_tag_b': tf.int32,
  'ancestor1_tag_beginlock': tf.int32,
  'ancestor1_tag_big': tf.int32,
  'ancestor1_tag_blink': tf.int32,
  'ancestor1_tag_blockquote': tf.int32,
  'ancestor1_tag_body': tf.int32,
  'ancestor1_tag_caption': tf.int32,
  'ancestor1_tag_center': tf.int32,
  'ancestor1_tag_cite': tf.int32,
  'ancestor1_tag_code': tf.int32,
  'ancestor1_tag_colgroup': tf.int32,
  'ancestor1_tag_companyquotes': tf.int32,
  'ancestor1_tag_container': tf.int32,
  'ancestor1_tag_content': tf.int32,
  'ancestor1_tag_content_area': tf.int3

In [123]:
with tf.Session() as sess:
    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()

    for i in range(10):
        print(sess.run(next_element))

({'ancestor1_class_len': 0, 'ancestor1_depth': 2, 'ancestor1_id_len': 0, 'ancestor1_no_children': 1, 'ancestor1_no_classes': 0, 'ancestor1_sibling_pos': 0, 'ancestor1_tag_': 0, 'ancestor1_tag_a': 0, 'ancestor1_tag_acronym': 0, 'ancestor1_tag_address': 0, 'ancestor1_tag_align': 0, 'ancestor1_tag_applet': 0, 'ancestor1_tag_artbody': 0, 'ancestor1_tag_b': 0, 'ancestor1_tag_beginlock': 0, 'ancestor1_tag_big': 0, 'ancestor1_tag_blink': 0, 'ancestor1_tag_blockquote': 0, 'ancestor1_tag_body': 1, 'ancestor1_tag_caption': 0, 'ancestor1_tag_center': 0, 'ancestor1_tag_cite': 0, 'ancestor1_tag_code': 0, 'ancestor1_tag_colgroup': 0, 'ancestor1_tag_companyquotes': 0, 'ancestor1_tag_container': 0, 'ancestor1_tag_content': 0, 'ancestor1_tag_content_area': 0, 'ancestor1_tag_content_footer': 0, 'ancestor1_tag_country-region': 0, 'ancestor1_tag_csactiondict': 0, 'ancestor1_tag_csactions': 0, 'ancestor1_tag_csobj': 0, 'ancestor1_tag_csscriptdict': 0, 'ancestor1_tag_cut': 0, 'ancestor1_tag_dd': 0, 'ancesto

({'ancestor1_class_len': 0, 'ancestor1_depth': 5, 'ancestor1_id_len': 0, 'ancestor1_no_children': 2, 'ancestor1_no_classes': 0, 'ancestor1_sibling_pos': 0, 'ancestor1_tag_': 0, 'ancestor1_tag_a': 0, 'ancestor1_tag_acronym': 0, 'ancestor1_tag_address': 0, 'ancestor1_tag_align': 0, 'ancestor1_tag_applet': 0, 'ancestor1_tag_artbody': 0, 'ancestor1_tag_b': 0, 'ancestor1_tag_beginlock': 0, 'ancestor1_tag_big': 0, 'ancestor1_tag_blink': 0, 'ancestor1_tag_blockquote': 0, 'ancestor1_tag_body': 0, 'ancestor1_tag_caption': 0, 'ancestor1_tag_center': 0, 'ancestor1_tag_cite': 0, 'ancestor1_tag_code': 0, 'ancestor1_tag_colgroup': 0, 'ancestor1_tag_companyquotes': 0, 'ancestor1_tag_container': 0, 'ancestor1_tag_content': 0, 'ancestor1_tag_content_area': 0, 'ancestor1_tag_content_footer': 0, 'ancestor1_tag_country-region': 0, 'ancestor1_tag_csactiondict': 0, 'ancestor1_tag_csactions': 0, 'ancestor1_tag_csobj': 0, 'ancestor1_tag_csscriptdict': 0, 'ancestor1_tag_cut': 0, 'ancestor1_tag_dd': 0, 'ancesto

In [None]:
def convert_csv_to_tfrecords(csv_pattern, tf_output, num_parallel_calls=4):
    def decode_line(line):
        # decode the csvs
        ddf = dd.read_csv(csv_pattern)
        dtypes = ddf.dtypes  # get the dtypes
        
        # decode them as tensors
        decoded_tensors = make_csv_decoder(line, dtypes, convert_ints=convert_ints)
        tens_dict = {k: v for k, v in zip(ddf.columns, decoded_tensors)}
        
        return dict