In [1]:
import os
# Tell PySpark which Python interpreter to launch; set here, before any SparkContext exists.
os.environ["PYSPARK_PYTHON"] = "/usr/local/anaconda/bin/python"

In [1]:
from pyspark.sql.types import IntegerType, StringType, StructType, StructField, TimestampType, FloatType, ArrayType, MapType
import pyspark.sql.functions as F
from pyspark.ml.linalg import SparseVector, VectorUDT

from pyspark.context import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
import numpy as np
import pandas as pd
import hashlib

import math
import datetime
import time

from features import LABEL_COLUMN, DISPLAY_ID_COLUMN, AD_ID_COLUMN, IS_LEAK_COLUMN, DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN, CATEGORICAL_COLUMNS, DOC_CATEGORICAL_MULTIVALUED_COLUMNS, BOOL_COLUMNS, INT_COLUMNS, FLOAT_COLUMNS, FLOAT_COLUMNS_LOG_BIN_TRANSFORM, FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM


In [2]:
# HDFS folder the exported feature-vector parquet datasets are read from.
OUTPUT_BUCKET_FOLDER = "hdfs:/user/lzhao/data/outbrain/preprocessed/"
DATA_BUCKET_FOLDER = "hdfs:/user/lzhao/data/outbrain/"
SPARK_TEMP_FOLDER = "hdfs:/user/lzhao/data/outbrain/spark-temp/"
# Local directory used for the TFRecord metadata written by make_spec.
# NOTE(review): "tfreccords" looks like a typo for "tfrecords" - confirm against
# whatever reads this directory before renaming it.
LOCAL_DATA_TFRECORDS_DIR = "../data/outbrain/tfreccords"

In [3]:
# Path of the tensorflow-hadoop jar that is handed to Spark through "spark.jars".
TENSORFLOW_HADOOP="../tensorflow-hadoop-1.15.0.jar"

In [4]:
# Spark configuration for a single-machine run that uses every local core.
conf = SparkConf()
conf = conf.setMaster("local[*]")
# conf = conf.set("spark.app.name", "recommend-ctr")
conf = conf.set("spark.executor.memory", "40g")
conf = conf.set("spark.driver.memory", "150g")
# conf = conf.set("spark.driver.maxResultSize", "3g")
# conf = conf.set("spark.executor.instances", "110")
conf.set("spark.jars", TENSORFLOW_HADOOP)
# Pass the configured folder itself; the quoted name "SPARK_TEMP_FOLDER" made Spark
# use a directory literally called SPARK_TEMP_FOLDER under the working directory.
# NOTE(review): spark.local.dir is meant to be a directory on local disk, while
# SPARK_TEMP_FOLDER currently holds an hdfs: URI - point it at a local path.
conf.set("spark.local.dir", SPARK_TEMP_FOLDER)
#conf = conf.set("spark.default.parallelism", "200")

<pyspark.conf.SparkConf at 0x7fe2d2293e10>

In [5]:
# Create the SparkContext from `conf`, then wrap it in a SparkSession for the DataFrame API.
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [6]:
# Number of rows packed into one serialized tf.train.Example; also the leading
# dimension of the fixed-length feature spec written by make_spec.
batch_size = 4096

In [7]:
# Boolean features; they occupy the first slots of feature_vector_labels_integral.
bool_feature_names = ['event_weekend',
                      'user_has_already_viewed_doc']

In [8]:
# Integer-valued features; they follow the boolean features in feature_vector_labels_integral.
int_feature_names = ['user_views',
                    'ad_views',
                    'doc_views',
                    'doc_event_days_since_published',
                    'doc_event_hour',
                    'doc_ad_days_since_published',
                    ]

In [9]:
# Float-valued features; they follow the integer features in feature_vector_labels_integral.
# Every base name appears as a triple: <name>, <name>_conf, <name>_conf_multipl.
float_feature_names = [
                'pop_ad_id',
                'pop_ad_id_conf',
                'pop_ad_id_conf_multipl',
                'pop_document_id',
                'pop_document_id_conf',
                'pop_document_id_conf_multipl',
                'pop_publisher_id',
                'pop_publisher_id_conf',
                'pop_publisher_id_conf_multipl',
                'pop_advertiser_id',
                'pop_advertiser_id_conf',
                'pop_advertiser_id_conf_multipl',
                'pop_campain_id',
                'pop_campain_id_conf',
                'pop_campain_id_conf_multipl',
                'pop_doc_event_doc_ad',
                'pop_doc_event_doc_ad_conf',
                'pop_doc_event_doc_ad_conf_multipl',
                'pop_source_id',
                'pop_source_id_conf',
                'pop_source_id_conf_multipl',
                'pop_source_id_country',
                'pop_source_id_country_conf',
                'pop_source_id_country_conf_multipl',
                'pop_entity_id',
                'pop_entity_id_conf',
                'pop_entity_id_conf_multipl',
                'pop_entity_id_country',
                'pop_entity_id_country_conf',
                'pop_entity_id_country_conf_multipl',
                'pop_topic_id',
                'pop_topic_id_conf',
                'pop_topic_id_conf_multipl',
                'pop_topic_id_country',
                'pop_topic_id_country_conf',
                'pop_topic_id_country_conf_multipl',
                'pop_category_id',
                'pop_category_id_conf',
                'pop_category_id_conf_multipl',
                'pop_category_id_country',
                'pop_category_id_country_conf',
                'pop_category_id_country_conf_multipl',
                'user_doc_ad_sim_categories',
                'user_doc_ad_sim_categories_conf',
                'user_doc_ad_sim_categories_conf_multipl',
                'user_doc_ad_sim_topics',
                'user_doc_ad_sim_topics_conf',
                'user_doc_ad_sim_topics_conf_multipl',
                'user_doc_ad_sim_entities',
                'user_doc_ad_sim_entities_conf',
                'user_doc_ad_sim_entities_conf_multipl',
                'doc_event_doc_ad_sim_categories',
                'doc_event_doc_ad_sim_categories_conf',
                'doc_event_doc_ad_sim_categories_conf_multipl',
                'doc_event_doc_ad_sim_topics',
                'doc_event_doc_ad_sim_topics_conf',
                'doc_event_doc_ad_sim_topics_conf_multipl',
                'doc_event_doc_ad_sim_entities',
                'doc_event_doc_ad_sim_entities_conf',
                'doc_event_doc_ad_sim_entities_conf_multipl'
               ]

In [10]:
# Categorical features; they take the last slots of feature_vector_labels_integral.
category_feature_names_integral = ['ad_advertiser',
 'doc_ad_category_id_1',
 'doc_ad_category_id_2',
 'doc_ad_category_id_3',
 'doc_ad_topic_id_1',
 'doc_ad_topic_id_2',
 'doc_ad_topic_id_3',
 'doc_ad_entity_id_1',
 'doc_ad_entity_id_2',
 'doc_ad_entity_id_3',
 'doc_ad_entity_id_4',
 'doc_ad_entity_id_5',
 'doc_ad_entity_id_6',
 'doc_ad_publisher_id',
 'doc_ad_source_id',
 'doc_event_category_id_1',
 'doc_event_category_id_2',
 'doc_event_category_id_3',
 'doc_event_topic_id_1',
 'doc_event_topic_id_2',
 'doc_event_topic_id_3',
 'doc_event_entity_id_1',
 'doc_event_entity_id_2',
 'doc_event_entity_id_3',
 'doc_event_entity_id_4',
 'doc_event_entity_id_5',
 'doc_event_entity_id_6',
 'doc_event_publisher_id',
 'doc_event_source_id',
 'event_country',
 'event_country_state',
 'event_geo_location',
 'event_hour',
 'event_platform',
 'traffic_source']

In [11]:
# One label per feature-vector position: booleans, then ints, then floats, then categoricals.
feature_vector_labels_integral = bool_feature_names + int_feature_names + float_feature_names + category_feature_names_integral


In [12]:
# NOTE(review): `evaluation` is not referenced in the visible cells - confirm it is still needed.
evaluation = True

# Switches the input folder names and output suffixes chosen in later cells.
submission = False


In [13]:
# Select the training-vector folder name according to the `submission` flag.
train_feature_vector_gcs_folder_name = (
    'train_feature_vectors_integral'
    if submission
    else 'train_feature_vectors_integral_eval'
)

In [14]:
# Load the exported training feature vectors (parquet) from the preprocessed-output folder.
train_feature_vectors_exported_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+train_feature_vector_gcs_folder_name)


In [15]:
# Peek at two rows to check the exported schema and the sparse feature vector.
train_feature_vectors_exported_df.take(2)

[Row(uuid='10005a0add15f6', display_id=5686397, ad_id=174547, document_id_event=2135921, document_id=1439845, label=0, feature_vector=SparseVector(103, {0: 1.0, 3: 87111.0, 4: 106124.0, 5: 1.0, 6: 4.0, 7: 124.0, 8: 0.0913, 9: 0.6848, 10: 0.0625, 11: 0.0831, 12: 0.4983, 13: 0.0414, 14: 0.0534, 15: 0.4446, 16: 0.0237, 17: 0.0455, 18: 0.42, 19: 0.0191, 20: 0.0865, 21: 0.5491, 22: 0.0475, 26: 0.0534, 27: 0.4446, 28: 0.0237, 29: 0.0563, 30: 0.4569, 31: 0.0257, 32: 0.0935, 33: 0.0759, 34: 0.0071, 35: 0.0966, 36: 0.0778, 37: 0.0075, 38: 0.1646, 39: 0.0002, 40: 0.0, 41: 0.1629, 42: 0.0002, 43: 0.0, 44: 0.2065, 45: 0.1339, 46: 0.0277, 47: 0.212, 48: 0.1326, 49: 0.0281, 59: 0.0, 60: 0.0004, 61: 0.0, 62: 0.0, 63: 0.0, 64: 0.0, 68: 2151.0, 69: 1302.0, 70: 1207.0, 72: 2.0, 75: 552.0, 76: 372.0, 81: 2618.0, 82: 9698.0, 83: 1702.0, 84: 1707.0, 86: 137.0, 95: 723.0, 96: 4194.0, 97: 18595447.0, 98: 745661.0, 99: 33260.0, 100: 4.0, 101: 1.0, 102: 0.0})),
 Row(uuid='10005a0add15f6', display_id=5686397, a

In [16]:
# Six id/label column names followed by every feature label.
# NOTE(review): not referenced in the visible cells; CSV_ORDERED_COLUMNS holds the same names.
integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral


In [17]:
# Column names assigned positionally to the rows of each partition when they are
# loaded into pandas (pd.DataFrame(..., columns=CSV_ORDERED_COLUMNS)): the six
# id/label columns followed by one column per feature-vector position.
# Derived from feature_vector_labels_integral instead of repeating the 103 feature
# names by hand, so the two lists cannot drift apart.
CSV_ORDERED_COLUMNS = (
    ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak']
    + feature_vector_labels_integral
)

In [18]:
# Name of the output column for each feature-vector position. The flattening cells
# index this list with range(len(feature_vector_labels_integral)), so it has to match
# that list entry for entry; copy it rather than maintaining a second literal.
FEAT_CSV_ORDERED_COLUMNS = list(feature_vector_labels_integral)

In [19]:
def to_array(col):
    """Convert a column of ML vectors into a column of array<double>.

    Args:
        col: column (or column name) whose values expose ``toArray()``.

    Returns:
        The Column produced by applying a Python UDF that returns
        ``v.toArray().tolist()`` for every value ``v``.
    """
    # DoubleType is not in the explicit import list at the top of the notebook;
    # importing it here keeps the helper independent of the wildcard import
    # that only happens in a later cell.
    from pyspark.sql.types import ArrayType, DoubleType

    def to_array_(v):
        return v.toArray().tolist()
    # Important: asNondeterministic requires Spark 2.3 or later
    # It can be safely removed i.e.
    # return udf(to_array_, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance

    return F.udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)

In [20]:
# Feature names that format_number casts to int in addition to the names in
# BOOL_COLUMNS and CATEGORICAL_COLUMNS.
CONVERT_TO_INT = ['doc_ad_category_id_1',
              'doc_ad_category_id_2','doc_ad_category_id_3','doc_ad_topic_id_1','doc_ad_topic_id_2',
              'doc_ad_topic_id_3','doc_ad_entity_id_1','doc_ad_entity_id_2','doc_ad_entity_id_3',
              'doc_ad_entity_id_4','doc_ad_entity_id_5','doc_ad_entity_id_6',
              'doc_ad_source_id','doc_event_category_id_1','doc_event_category_id_2','doc_event_category_id_3',
              'doc_event_topic_id_1','doc_event_topic_id_2','doc_event_topic_id_3','doc_event_entity_id_1',
              'doc_event_entity_id_2','doc_event_entity_id_3','doc_event_entity_id_4','doc_event_entity_id_5', 'doc_event_entity_id_6']

In [21]:
def format_number(element, name):
    """Cast integral feature columns to int and pass every other column through.

    Args:
        element: column expression for one feature (must offer ``cast``).
        name: feature name used to decide whether the cast applies.

    Returns:
        ``element.cast("int")`` when ``name`` is listed in BOOL_COLUMNS,
        CATEGORICAL_COLUMNS or CONVERT_TO_INT; otherwise ``element`` itself.
    """
    integral_names = BOOL_COLUMNS + CATEGORICAL_COLUMNS
    if name in integral_names or name in CONVERT_TO_INT:
        return element.cast("int")
    return element

In [22]:
def to_array_with_none(col):
    """Convert a column of sparse vectors into array<double>, marking absent entries.

    Positions that are not stored in the sparse vector come back as NaN. Despite
    the function name they are not None: a float64 buffer cannot hold None, and
    the previous ``fill_value=None`` relied on NumPy coercing it to NaN.

    Args:
        col: column (or column name) whose values expose ``size``, ``indices``
            and ``values`` (the attributes read below).

    Returns:
        The Column produced by applying the conversion UDF to ``col``.
    """
    # DoubleType is not in the explicit import list at the top of the notebook;
    # importing it here keeps the helper independent of the wildcard import
    # that only happens in a later cell.
    from pyspark.sql.types import ArrayType, DoubleType

    def to_array_with_none_(v):
        tmp = np.full((v.size,), fill_value=np.nan, dtype=np.float64)
        tmp[v.indices] = v.values
        return tmp.tolist()
    # Important: asNondeterministic requires Spark 2.3 or later
    # It can be safely removed i.e.
    # return udf(to_array_with_none_, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance

    return F.udf(to_array_with_none_, ArrayType(DoubleType())).asNondeterministic()(col)

In [23]:
def replace_with_most_frequent(most_value):
    """Build a UDF that substitutes ``most_value`` for missing inputs.

    An input counts as missing when it is falsy (None, but also 0 / 0.0) or NaN;
    any other input is returned unchanged.

    NOTE(review): no return type is passed to F.udf, so Spark's default UDF
    return type applies - confirm that is what callers expect.
    """
    # Kept as a lambda so the UDF is registered under the same name as before.
    fill_missing = lambda x: x if (x and not np.isnan(x)) else most_value
    return F.udf(fill_missing)

In [24]:
from pyspark.sql.types import *

In [25]:
# Flatten the training feature vectors into one column per feature:
#   1. keep the id/label columns and add a constant is_leak = -1,
#   2. expand the vector into an array column ("featvec"),
#   3. select each array position under its feature name (integral ones cast to int),
#   4. replace NaN values with 0.
train_feature_vectors_integral_csv_rdd_df = (
    train_feature_vectors_exported_df
    .select('label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'feature_vector')
    .withColumn('is_leak', F.lit(-1))
    .withColumn("featvec", to_array("feature_vector"))
    .select(
        ['label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'is_leak']
        + [
            format_number(F.col("featvec")[i], FEAT_CSV_ORDERED_COLUMNS[i]).alias(FEAT_CSV_ORDERED_COLUMNS[i])
            for i in range(len(feature_vector_labels_integral))
        ]
    )
    .replace(float('nan'), 0)
)


In [26]:
# Peek at one flattened row to check the per-feature columns.
train_feature_vectors_integral_csv_rdd_df.take(1)

[Row(label=0, display_id=5686397, ad_id=174547, document_id=1439845, document_id_event=2135921, is_leak=-1, event_weekend=1, user_has_already_viewed_doc=0, user_views=0.0, ad_views=87111.0, doc_views=106124.0, doc_event_days_since_published=1.0, doc_event_hour=4.0, doc_ad_days_since_published=124.0, pop_ad_id=0.09133175015449524, pop_ad_id_conf=0.6848396305407448, pop_ad_id_conf_multipl=0.06254760203244414, pop_document_id=0.08310090005397797, pop_document_id_conf=0.4983114381899986, pop_document_id_conf_multipl=0.041410129020781096, pop_publisher_id=0.05335606634616852, pop_publisher_id_conf=0.4445772362756268, pop_publisher_id_conf_multipl=0.02372089251471858, pop_advertiser_id=0.04552464559674263, pop_advertiser_id_conf=0.4199930658825777, pop_advertiser_id_conf_multipl=0.019120035477393726, pop_campain_id=0.08649083971977234, pop_campain_id_conf=0.549118229426116, pop_campain_id_conf_multipl=0.04749369676849937, pop_doc_event_doc_ad=0.0, pop_doc_event_doc_ad_conf=0.0, pop_doc_event

In [27]:
# Select the test/validation-vector folder name according to the `submission` flag.
test_validation_feature_vector_gcs_folder_name = (
    'test_feature_vectors_integral'
    if submission
    else 'validation_feature_vectors_integral'
)

In [28]:
# Load the exported test/validation feature vectors (parquet) and peek at two rows.
test_validation_feature_vectors_exported_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+test_validation_feature_vector_gcs_folder_name)
test_validation_feature_vectors_exported_df.take(2)

[Row(uuid='100289071872c9', display_id=3110701, ad_id=327512, document_id_event=1973614, document_id=1286118, label=0, is_leak=0, feature_vector=SparseVector(103, {0: 0.0, 1: 0.0, 2: 43.0, 3: 1316.0, 4: 45696.0, 5: 1.0, 6: 4.0, 7: 79.0, 8: 0.2371, 9: 0.4325, 10: 0.1025, 11: 0.2615, 12: 0.3102, 13: 0.0811, 14: 0.2415, 15: 0.3236, 16: 0.0782, 17: 0.2415, 18: 0.3236, 19: 0.0782, 20: 0.2145, 21: 0.3359, 22: 0.072, 23: 0.0, 24: 0.0575, 25: 0.0, 26: 0.2415, 27: 0.3236, 28: 0.0782, 29: 0.2439, 30: 0.3463, 31: 0.0845, 32: 0.2555, 33: 0.0076, 34: 0.002, 35: 0.2641, 36: 0.0079, 37: 0.0021, 38: 0.2629, 39: 0.014, 40: 0.0037, 41: 0.2311, 42: 0.0166, 43: 0.0038, 44: 0.2299, 45: 0.0741, 46: 0.017, 47: 0.2324, 48: 0.0709, 49: 0.0165, 50: 0.9143, 51: 1.0, 52: 0.9143, 53: 0.0, 54: 0.0005, 55: 0.0, 56: 0.0, 57: 0.0, 58: 0.0, 59: 0.9659, 60: 0.9996, 61: 0.9655, 62: 0.0, 63: 0.0, 64: 0.0, 65: 0.0, 66: 0.0, 67: 0.0, 68: 2635.0, 69: 1403.0, 70: 1406.0, 72: 227.0, 75: 429.0, 81: 5911.0, 82: 8905.0, 83: 1403.

In [29]:
# Flatten the test/validation feature vectors into one column per feature.
# Unlike the training frame, is_leak is read from the data instead of being set to a constant.
test_validation_feature_vectors_integral_csv_rdd_df = (
    test_validation_feature_vectors_exported_df
    .select('label', 'display_id', 'ad_id', 'document_id', 'document_id_event',
            'is_leak', 'feature_vector')
    .withColumn("featvec", to_array("feature_vector"))
    .select(
        ['label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'is_leak']
        + [
            format_number(F.col("featvec")[i], FEAT_CSV_ORDERED_COLUMNS[i]).alias(FEAT_CSV_ORDERED_COLUMNS[i])
            for i in range(len(feature_vector_labels_integral))
        ]
    )
    .replace(float('nan'), 0)
)


In [30]:
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import metadata_io

In [31]:
def make_spec(output_dir, batch_size=None):
    """Build the fixed-length feature spec and write it as metadata to ``output_dir``.

    Args:
        output_dir: destination passed to ``metadata_io.write_metadata``.
        batch_size: when given, every scalar feature gets shape
            ``[batch_size, 1]``; when None the shape is ``[]``.

    Returns:
        None. The spec is only written out, not returned.
    """
    fixed_shape = [] if batch_size is None else [batch_size, 1]

    def int_feature():
        return tf.io.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)

    def float_feature():
        return tf.io.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None)

    spec = {}
    # Label and id columns.
    for name in (LABEL_COLUMN, DISPLAY_ID_COLUMN, IS_LEAK_COLUMN,
                 DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN):
        spec[name] = int_feature()
    for name in BOOL_COLUMNS:
        spec[name] = int_feature()
    # Raw float values.
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM + FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
        spec[name] = float_feature()
    # Derived columns of the float features.
    for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
        spec[name + '_binned'] = int_feature()
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
        spec[name + '_binned'] = int_feature()
        spec[name + '_log_01scaled'] = float_feature()
    # Derived columns of the integer features.
    for name in INT_COLUMNS:
        spec[name + '_log_int'] = int_feature()
        spec[name + '_log_01scaled'] = float_feature()
    # BOOL_COLUMNS were registered above already; assigning them again is a harmless overwrite.
    for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
        spec[name] = int_feature()
    # Multi-valued categoricals: the trailing dimension is the number of member columns.
    for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
        member_count = len(DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category])
        shape = fixed_shape[:-1] + [member_count]
        spec[multi_category] = tf.io.FixedLenFeature(shape=shape, dtype=tf.int64)

    schema = dataset_schema.from_feature_spec(spec)
    metadata = dataset_metadata.DatasetMetadata(schema)
    metadata_io.write_metadata(metadata, output_dir)

In [32]:
# Write the feature-spec metadata under the local TFRecords directory, using the notebook-wide batch size.
make_spec(LOCAL_DATA_TFRECORDS_DIR + '/transformed_metadata', batch_size=batch_size)

Instructions for updating:
from_feature_spec is a deprecated, use schema_utils.schema_from_feature_spec


In [33]:
def log2_1p(x):
    """Return log2(1 + x), computed as log1p(x) divided by ln(2)."""
    natural_log = np.log1p(x)
    return natural_log / np.log(2.0)

In [34]:
def compute_min_max_logs(df):
    """Compute the log2(1 + x) bounds used to scale features into [0, 1].

    One aggregation over ``df`` collects the min and max of every column in
    FLOAT_COLUMNS_LOG_BIN_TRANSFORM and INT_COLUMNS. Float columns are multiplied
    by 1000 before the log; integer columns are used as they are.

    Args:
        df: Spark DataFrame containing all of those columns.

    Returns:
        ``(min_logs, max_logs)``: two dicts keyed by ``<column>_log_01scaled``.

    Raises:
        ValueError: if a column has no non-null values, so no bound exists.
    """
    print(str(datetime.datetime.now()) + '\tComputing min and max')
    agg_exprs = []
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM + INT_COLUMNS:
        agg_exprs.append(F.min(name))
        agg_exprs.append(F.max(name))
    # The aggregation yields exactly one row; its fields carry Spark's default
    # names "min(<column>)" / "max(<column>)".
    stats = df.agg(*agg_exprs).collect()[0]

    min_logs = {}
    max_logs = {}
    for names, scale in ((FLOAT_COLUMNS_LOG_BIN_TRANSFORM, 1000), (INT_COLUMNS, 1)):
        for name in names:
            lowest = stats["min(" + name + ")"]
            highest = stats["max(" + name + ")"]
            if lowest is None or highest is None:
                raise ValueError(
                    "column %r has no non-null values; cannot compute log bounds" % name)
            key = name + '_log_01scaled'
            min_logs[key] = log2_1p(lowest * scale)
            max_logs[key] = log2_1p(highest * scale)

    return min_logs, max_logs

In [35]:
# Combine test/validation and training rows so the min/max statistics cover both.
# DataFrame.union matches columns by position; both frames were selected in the same column order.
all_df = test_validation_feature_vectors_integral_csv_rdd_df.union(train_feature_vectors_integral_csv_rdd_df)


In [36]:
# Bounds for the 0-1 scaling; read as globals by the partition functions defined below.
min_logs, max_logs = compute_min_max_logs(all_df)


2020-07-06 12:33:07.742072	Computing min and max


In [38]:
# Output sub-folder names for the two datasets, chosen by the `submission` flag.
train_output_string, eval_output_string = (
    ('/sub_train', '/test') if submission else ('/train', '/eval')
)

In [39]:
# Alias for the local TFRecords directory.
# NOTE(review): not referenced in the visible cells - presumably used further down; confirm.
path = LOCAL_DATA_TFRECORDS_DIR


In [40]:
def create_tf_example_spark(df, min_logs, max_logs):
    """Pack a pandas slice into a single tf.train.Example holding one list per feature.

    Args:
        df: pandas DataFrame slice; each column becomes one value list.
        min_logs: lower log2 bounds keyed by ``<column>_log_01scaled``.
        max_logs: upper log2 bounds keyed by ``<column>_log_01scaled``.

    Returns:
        A ``tf.train.Example`` with the label/id columns, the raw float columns,
        their binned and 0-1 scaled variants, the log-transformed integer columns,
        the boolean/categorical columns and the multi-valued categorical groups.

    NOTE(review): when max_logs[key] equals min_logs[key] the scaling factor is a
    division by zero - confirm this cannot happen for the data in use.
    """
    def int64_feature(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    def float_feature(values):
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    def scaled_01(log_values, key):
        # (value - min) / (max - min), with the bounds looked up under `key`
        span = max_logs[key] - min_logs[key]
        return log_values.add(-min_logs[key]).multiply(1./span).to_list()

    inv_ln2 = 1./np.log(2.0)
    features = {}

    # Label and id columns.
    features[LABEL_COLUMN] = int64_feature(df[LABEL_COLUMN].to_list())
    features[DISPLAY_ID_COLUMN] = int64_feature(df[DISPLAY_ID_COLUMN].to_list())
    features[IS_LEAK_COLUMN] = int64_feature(df[IS_LEAK_COLUMN].to_list())
    # display_id * 10 + max(is_leak, 0)
    leak_flag = df[IS_LEAK_COLUMN].clip(lower=0)
    features[DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN] = int64_feature(
        df[DISPLAY_ID_COLUMN].multiply(10).add(leak_flag).to_list())

    # Raw float values.
    for name in FLOAT_COLUMNS:
        features[name] = float_feature(df[name].to_list())

    # Simple binning: value * 10 truncated to an integer.
    for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
        bins = df[name].multiply(10).astype('int64').to_list()
        features[name + '_binned'] = int64_feature(bins)

    # Log binning: log2(1 + value * 1000), truncated for the bin and scaled to 0-1.
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
        log_values = df[name].multiply(1000).apply(np.log1p).multiply(inv_ln2)
        features[name + '_binned'] = int64_feature(log_values.astype('int64').to_list())
        scaled_key = name + '_log_01scaled'
        features[scaled_key] = float_feature(scaled_01(log_values, scaled_key))

    # Integer columns: log2(1 + value), truncated and scaled to 0-1.
    for name in INT_COLUMNS:
        log_values = df[name].apply(np.log1p).multiply(inv_ln2)
        features[name + '_log_int'] = int64_feature(log_values.astype('int64').to_list())
        scaled_key = name + '_log_01scaled'
        features[scaled_key] = float_feature(scaled_01(log_values, scaled_key))

    for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
        features[name] = int64_feature(df[name].fillna(0).astype('int64').to_list())

    # Multi-valued categoricals: stack the member columns side by side and flatten
    # row by row, so parsing with the FixedLenFeature spec reshapes them to
    # [batch_size, num_values].
    for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
        member_arrays = [df[category].to_numpy()
                         for category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category]]
        flat = np.stack(member_arrays, axis=1).flatten().tolist()
        features[multi_category] = int64_feature(flat)

    return tf.train.Example(features=tf.train.Features(feature=features))

In [41]:
def _transform_to_tfrecords(rdds):
    csv = pd.DataFrame(list(rdds), columns=CSV_ORDERED_COLUMNS)
    num_rows = len(csv.index)
    examples = []
    for start_ind in range(0,num_rows,batch_size if batch_size is not None else 1): # for each batch
        if start_ind + batch_size - 1 > num_rows: # if we'd run out of rows
            csv_slice = csv.iloc[start_ind:] 
            # drop the remainder
            print("last Example has: ", len(csv_slice))
            examples.append((create_tf_example_spark(csv_slice, min_logs, max_logs), len(csv_slice)))
            return examples
        else:
            csv_slice = csv.iloc[start_ind:start_ind+(batch_size if batch_size is not None else 1)]
        examples.append((create_tf_example_spark(csv_slice, min_logs, max_logs), batch_size))
    return examples

In [42]:
from pyspark import TaskContext


In [43]:
# Partition count reported in the progress message printed by _transform_to_slices.
max_partition_num = 30


In [44]:
def _transform_to_slices(rdds):
    taskcontext = TaskContext.get()
    partitionid = taskcontext.partitionId()
    csv = pd.DataFrame(list(rdds), columns=CSV_ORDERED_COLUMNS)
    num_rows = len(csv.index)
    print("working with partition: ", partitionid, max_partition_num, num_rows)
    examples = []
    for start_ind in range(0,num_rows,batch_size if batch_size is not None else 1): # for each batch
        if start_ind + batch_size - 1 > num_rows: # if we'd run out of rows
            csv_slice = csv.iloc[start_ind:] 
            print("last Example has: ", len(csv_slice), partitionid)
            examples.append((csv_slice, len(csv_slice)))
            return examples
        else:
            csv_slice = csv.iloc[start_ind:start_ind+(batch_size if batch_size is not None else 1)]
        examples.append((csv_slice, len(csv_slice)))
    return examples

In [45]:
def _transform_to_tfrecords_from_slices(rdds):
    examples = []
    for slice in rdds:
        if len(slice[0]) != batch_size:
            print("slice size is not correct, dropping: ", len(slice[0]))
        else:
            examples.append((bytearray((create_tf_example_spark(slice[0], min_logs, max_logs)).SerializeToString()), None))
    return examples

In [46]:
def _transform_to_tfrecords_from_reslice(rdds):
    examples = []
    all_dataframes = pd.DataFrame([])
    for slice in rdds:
        all_dataframes = all_dataframes.append(slice[0])
        num_rows = len(all_dataframes.index)
    examples = []
    for start_ind in range(0,num_rows,batch_size if batch_size is not None else 1): # for each batch
        if start_ind + batch_size - 1 > num_rows: # if we'd run out of rows
            csv_slice = all_dataframes.iloc[start_ind:]
            if TEST_SET_MODE:
                remain_len = batch_size - len(csv_slice)
                (m, n) = divmod(remain_len, len(csv_slice))
                print("remainder: ", len(csv_slice), remain_len, m, n)
                if m:
                    for i in range(m):
                        csv_slice = csv_slice.append(csv_slice)
                csv_slice = csv_slice.append(csv_slice.iloc[:n])
                print("after fill remainder: ", len(csv_slice))
                examples.append((bytearray((create_tf_example_spark(csv_slice, min_logs, max_logs)).SerializeToString()), None))
                return examples
            # drop the remainder
            print("dropping remainder: ", len(csv_slice))
            return examples
        else:
            csv_slice = all_dataframes.iloc[start_ind:start_ind+(batch_size if batch_size is not None else 1)]
        examples.append((bytearray((create_tf_example_spark(csv_slice, min_logs, max_logs)).SerializeToString()), None))
    return examples

In [47]:
# Read by _transform_to_tfrecords_from_reslice: when True the final short batch is
# padded up to batch_size rows instead of being dropped.
TEST_SET_MODE = False


In [48]:
# Reduce the training frame to max_partition_num partitions and cut each one into
# batch-sized pandas slices. The named constant replaces a second hard-coded 30, so
# the partition count printed by _transform_to_slices matches the one used here.
train_features = train_feature_vectors_integral_csv_rdd_df.coalesce(max_partition_num).rdd.mapPartitions(_transform_to_slices)


In [49]:
# Mark the sliced RDD for caching so later actions can reuse it instead of recomputing
# the slices. cache() is lazy: nothing is computed until the first action runs.
cached_train_features = train_features.cache()
