In [1]:
import os
# Set PYSPARK_PYTHON to the Anaconda interpreter before the SparkContext is created in a later cell.
# NOTE(review): hard-coded absolute path — presumably it must exist on every node; confirm.
os.environ["PYSPARK_PYTHON"] = "/usr/local/anaconda/bin/python"

In [2]:
from pyspark.sql.types import IntegerType, StringType, StructType, StructField, TimestampType, FloatType, ArrayType, MapType
import pyspark.sql.functions as F
from pyspark.ml.linalg import SparseVector, VectorUDT

from pyspark.context import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
import numpy as np
import pandas as pd
import hashlib

import math
import datetime
import time

In [3]:
# Spark configuration for the "recommend-ctr" application on YARN.
# Every setter returns the SparkConf itself, so one fluent chain is equivalent
# to reassigning `conf` after each call.
conf = (
    SparkConf()
    .setMaster("yarn")
    .set("spark.app.name", "recommend-ctr")
    .set("spark.executor.memory", "6g")
    .set("spark.driver.memory", "8g")
    .set("spark.driver.maxResultSize", "3g")
    .set("spark.executor.instances", "110")
    .set("spark.default.parallelism", "200")
)

In [4]:
# Create the SparkContext from the configuration above and wrap it in a SparkSession
# (`sc` is used for broadcasts, `spark` for DataFrame / SQL access).
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

## UDFs

In [5]:
def date_time_to_unix_epoch(date_time):
    """Return the Unix epoch, in whole seconds, for ``date_time``.

    The conversion goes through ``time.mktime``, so the value is interpreted
    as local time. The fractional part is dropped by ``int``.
    """
    local_struct = date_time.timetuple()
    epoch_seconds = time.mktime(local_struct)
    return int(epoch_seconds)

In [6]:
def date_time_to_unix_epoch_treated(dt):
    """Convert ``dt`` to a Unix epoch in seconds, falling back to 0.

    Returns 0 when ``dt`` is None or when the conversion raises; otherwise
    returns the result of ``date_time_to_unix_epoch(dt)``.
    """
    # Identity check instead of `!= None`: it cannot be affected by a custom __ne__.
    if dt is None:
        return 0
    try:
        return date_time_to_unix_epoch(dt)
    except Exception as e:
        # Deliberate best effort: report the offending value and fall back to 0
        # rather than failing the caller.
        print("Error processing dt={}".format(dt), e)
        return 0

In [7]:
# Spark UDF (IntegerType) wrapping date_time_to_unix_epoch_treated:
# null or unconvertible timestamps become 0.
timestamp_null_to_zero_int_udf = F.udf(
    lambda date_time: date_time_to_unix_epoch_treated(date_time),
    IntegerType(),
)

In [8]:
# Value substituted for null integers.
INT_DEFAULT_NULL_VALUE = -1

# Null-replacement UDFs. They use identity checks (`is not None`) instead of `!= None`,
# which is the recommended way to test for None.
# Null integer -> INT_DEFAULT_NULL_VALUE.
int_null_to_minus_one_udf = F.udf(lambda x: x if x is not None else INT_DEFAULT_NULL_VALUE, IntegerType())
# Null array -> empty array, one UDF per element type.
int_list_null_to_empty_list_udf = F.udf(lambda x: x if x is not None else [], ArrayType(IntegerType()))
float_list_null_to_empty_list_udf = F.udf(lambda x: x if x is not None else [], ArrayType(FloatType()))
str_list_null_to_empty_list_udf = F.udf(lambda x: x if x is not None else [], ArrayType(StringType()))

In [9]:
def truncate_day_from_timestamp(ts):
    """Convert a millisecond timestamp into a whole number of days.

    Args:
        ts: timestamp in milliseconds, or None.

    Returns:
        ``int(ts / 1000 / 60 / 60 / 24)``, or None when ``ts`` is None.
    """
    # The timestamp columns are declared nullable; without this guard a null
    # value raises TypeError inside the UDF.
    if ts is None:
        return None
    return int(ts / 1000 / 60 / 60 / 24)

In [10]:
# Spark UDF (IntegerType) form of truncate_day_from_timestamp.
truncate_day_from_timestamp_udf = F.udf(
    lambda timestamp_ms: truncate_day_from_timestamp(timestamp_ms),
    IntegerType(),
)

In [11]:
# First two characters of the stripped geo string; '' for null.
# Uses `is not None` instead of `!= None`.
extract_country_udf = F.udf(lambda geo: geo.strip()[:2] if geo is not None else '', StringType())

In [12]:
# First five characters of the stripped geo string (e.g. 'US>CA'); '' for null.
# Uses `is not None` instead of `!= None`.
extract_country_state_udf = F.udf(lambda geo: geo.strip()[:5] if geo is not None else '', StringType())

In [13]:
# Length of an array column, with null counted as 0.
# Uses `is not None` instead of `!= None`.
list_len_udf = F.udf(lambda x: len(x) if x is not None else 0, IntegerType())

In [14]:
def convert_odd_timestamp(timestamp_ms_relative):
    """Turn a relative millisecond timestamp into a ``datetime``.

    The input is coerced with ``int``, shifted by a fixed offset of
    1465876799998 ms, floored to whole seconds and passed to
    ``datetime.datetime.fromtimestamp`` (which yields local time).
    """
    TIMESTAMP_DELTA = 1465876799998
    absolute_ms = int(timestamp_ms_relative) + TIMESTAMP_DELTA
    absolute_seconds = absolute_ms // 1000
    return datetime.datetime.fromtimestamp(absolute_seconds)

# Loading Files

## Loading UTC/BST for each country and US / CA states (local time)

In [15]:
# Per-country time offsets, read with pandas from a local CSV.
# keep_default_na=False stops pandas from converting strings such as "NA" to NaN
# (presumably so that a country code like "NA" survives — confirm against the CSV).
country_utc_dst_df = pd.read_csv('./data/country_codes_utc_dst_tz_delta.csv', keep_default_na=False)

In [16]:
# Lookup table: country_code -> utc_dst_time_offset_cleaned.
countries_utc_dst_dict = {
    country_code: utc_offset
    for country_code, utc_offset in zip(
        country_utc_dst_df['country_code'].tolist(),
        country_utc_dst_df['utc_dst_time_offset_cleaned'].tolist(),
    )
}

In [17]:
# Broadcast the country -> offset dict through the SparkContext.
countries_utc_dst_broad = sc.broadcast(countries_utc_dst_dict)

In [18]:
# Per-US-state time offsets; keep_default_na=False keeps every cell as the literal
# string found in the file instead of mapping NA-like strings to NaN.
us_states_utc_dst_df = pd.read_csv('./data/us_states_abbrev_bst.csv', keep_default_na=False)

In [19]:
# Lookup table: state_abb -> utc_dst_time_offset_cleaned for US states, then broadcast it.
us_states_utc_dst_dict = {
    state_abb: utc_offset
    for state_abb, utc_offset in zip(
        us_states_utc_dst_df['state_abb'].tolist(),
        us_states_utc_dst_df['utc_dst_time_offset_cleaned'].tolist(),
    )
}
us_states_utc_dst_broad = sc.broadcast(us_states_utc_dst_dict)

In [20]:
# Per-state time offsets from the CA states file; keep_default_na=False keeps every
# cell as the literal string found in the file instead of mapping NA-like strings to NaN.
ca_states_utc_dst_df = pd.read_csv('./data/ca_states_abbrev_bst.csv', keep_default_na=False)

In [21]:
# Lookup table: state_abb -> utc_dst_time_offset_cleaned built from ca_states_utc_dst_df
# (despite the "countries" in the variable name), then broadcast it.
ca_countries_utc_dst_dict = {
    state_abb: utc_offset
    for state_abb, utc_offset in zip(
        ca_states_utc_dst_df['state_abb'].tolist(),
        ca_states_utc_dst_df['utc_dst_time_offset_cleaned'].tolist(),
    )
}
ca_countries_utc_dst_broad = sc.broadcast(ca_countries_utc_dst_dict)

## Loading competition csvs

In [22]:
# HDFS locations: raw competition data, preprocessed outputs, and Spark temp files.
# The output and temp folders live under the data folder.
DATA_BUCKET_FOLDER = "hdfs:/user/lzhao/data/outbrain/"
OUTPUT_BUCKET_FOLDER = DATA_BUCKET_FOLDER + "preprocessed/"
SPARK_TEMP_FOLDER = DATA_BUCKET_FOLDER + "spark-temp/"

In [23]:
# Explicit schema for events.csv; every column is nullable.
events_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("display_id", IntegerType()),
        ("uuid_event", StringType()),
        ("document_id_event", IntegerType()),
        ("timestamp_event", IntegerType()),
        ("platform_event", IntegerType()),
        ("geo_location_event", StringType()),
    ]
])

# Read events.csv (header row, '\N' parsed as null) and derive:
#   dummyEvents         - constant 1
#   day_event           - day number computed from timestamp_event
#   event_country       - first 2 characters of geo_location_event
#   event_country_state - first 5 characters of geo_location_event
events_df = (
    spark.read.schema(events_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "events.csv")
    .withColumn('dummyEvents', F.lit(1))
    .withColumn('day_event', truncate_day_from_timestamp_udf('timestamp_event'))
    .withColumn('event_country', extract_country_udf('geo_location_event'))
    .withColumn('event_country_state', extract_country_state_udf('geo_location_event'))
    .alias('events')
)

In [24]:
events_df.count()

23120126

In [25]:
# Keep only events whose "geo_location_event" is not null, then show the remaining row count
events_df = events_df.na.drop(subset=["geo_location_event"])
events_df.count()

23119786

In [26]:
# Keep only events whose "platform_event" is not null, then show the remaining row count
events_df = events_df.na.drop(subset=["platform_event"])
events_df.count()

23119781

In [27]:
# Explicit schema for page_views.csv; every column is nullable.
page_views_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("uuid_pv", StringType()),
        ("document_id_pv", IntegerType()),
        ("timestamp_pv", IntegerType()),
        ("platform_pv", IntegerType()),
        ("geo_location_pv", StringType()),
        ("traffic_source_pv", IntegerType()),
    ]
])

# Read page_views.csv (header row, '\N' parsed as null) and add day_pv,
# the day number computed from timestamp_pv.
page_views_df = (
    spark.read.schema(page_views_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "page_views.csv")
    .withColumn('day_pv', truncate_day_from_timestamp_udf('timestamp_pv'))
    .alias('page_views')
)

# Make the DataFrame queryable from Spark SQL as `page_views`.
page_views_df.createOrReplaceTempView('page_views')

In [28]:
page_views_df.show(5)

+--------------+--------------+------------+-----------+---------------+-----------------+------+
|       uuid_pv|document_id_pv|timestamp_pv|platform_pv|geo_location_pv|traffic_source_pv|day_pv|
+--------------+--------------+------------+-----------+---------------+-----------------+------+
|1fd5f051fba643|           120|    31905835|          1|             RS|                2|     0|
|8557aa9004be3b|           120|    32053104|          1|          VN>44|                2|     0|
|c351b277a358f0|           120|    54013023|          1|          KR>12|                1|     0|
|8205775c5387f9|           120|    44196592|          1|          IN>16|                2|     0|
|9cb0ccd8458371|           120|    65817371|          1|      US>CA>807|                2|     0|
+--------------+--------------+------------+-----------+---------------+-----------------+------+
only showing top 5 rows



In [29]:
# One row per (uuid_pv, document_id_pv) pair from the `page_views` temp view, with the
# latest timestamp of that pair and a constant marker column dummyPageView = 1.
page_views_users_df  = spark.sql('''
  SELECT uuid_pv, document_id_pv, max(timestamp_pv) as max_timestamp_pv, 1 as dummyPageView
  FROM page_views p 
  GROUP BY uuid_pv, document_id_pv
  ''').alias('page_views_users')

In [30]:
# Explicit schema for promoted_content.csv; every column is a nullable integer.
promoted_content_schema = StructType([
    StructField(field_name, IntegerType(), True)
    for field_name in ("ad_id", "document_id_promo", "campaign_id", "advertiser_id")
])

# Read promoted_content.csv, add the constant marker column dummyPromotedContent
# and cache the result.
promoted_content_df = (
    spark.read.schema(promoted_content_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "promoted_content.csv")
    .withColumn('dummyPromotedContent', F.lit(1))
    .alias('promoted_content')
    .cache()
)

In [31]:
# Explicit schema for documents_meta.csv; every column is nullable.
documents_meta_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("document_id_doc", IntegerType()),
        ("source_id", IntegerType()),
        ("publisher_id", IntegerType()),
        ("publish_time", TimestampType()),
    ]
])

# Read documents_meta.csv, add the constant marker column dummyDocumentsMeta
# and cache the result.
documents_meta_df = (
    spark.read.schema(documents_meta_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "documents_meta.csv")
    .withColumn('dummyDocumentsMeta', F.lit(1))
    .alias('documents_meta')
    .cache()
)


In [32]:
# Keep only documents whose "source_id" is not null, then show the remaining row count
documents_meta_df = documents_meta_df.na.drop(subset=["source_id"])
documents_meta_df.count()

2996816

In [33]:
# Distinct (source_id, publisher_id) pairs found in the document metadata
source_publishers_df = documents_meta_df.select("source_id", "publisher_id").distinct()
source_publishers_df.count()

14394

In [34]:
# Collect the source_ids whose publisher_id is null
rows_no_pub = source_publishers_df.filter(F.col("publisher_id").isNull())
source_ids_without_publisher = [no_pub_row['source_id'] for no_pub_row in rows_no_pub.collect()]
len(source_ids_without_publisher)

5058

In [35]:
# Maximum value of publisher_id used so far.
# It is computed as a Spark aggregate (F.max ignores nulls) instead of collecting
# every row to the driver and comparing Row objects with Python's max().
max_pub = source_publishers_df.agg(F.max("publisher_id")).first()[0]
max_pub

1263

In [36]:
# Give every publisher-less source a fresh publisher_id, numbered consecutively
# from max_pub + 1 in the order of source_ids_without_publisher
new_publishers = [
    (source, new_publisher_id)
    for new_publisher_id, source in enumerate(source_ids_without_publisher, start=max_pub + 1)
]
new_publishers_df = spark.createDataFrame(new_publishers, ("source_id", "publisher_id"))
new_publishers_df.take(2)

[Row(source_id=9376, publisher_id=1264),
 Row(source_id=9465, publisher_id=1265)]

In [37]:
# Old and new publishers merged: the pairs that already have a publisher_id
# (rows containing a null are dropped) followed by the newly generated pairs.
# The last five collected rows are displayed as a spot check.
fixed_source_publishers_df = source_publishers_df.dropna().union(new_publishers_df)
fixed_source_publishers_df.collect()[-5:]

[Row(source_id=2555, publisher_id=6317),
 Row(source_id=422, publisher_id=6318),
 Row(source_id=5391, publisher_id=6319),
 Row(source_id=7874, publisher_id=6320),
 Row(source_id=8054, publisher_id=6321)]

In [38]:
# Update documents_meta with the new publishers: drop the original publisher_id column
# and take it from fixed_source_publishers_df, joining on source_id.
# The row count is displayed so it can be compared with the count before the join.
documents_meta_df = documents_meta_df.drop('publisher_id').join(fixed_source_publishers_df, on='source_id')
documents_meta_df.count()

2996816

In [39]:
# Enrich each event with
#   1) the metadata of the document it happened on (source/publisher/publish_time are
#      renamed with a *_doc_event suffix), and
#   2) the matching page view (same user, document, platform, geo location and day),
#      which brings in traffic_source_pv.
# Both are left joins, so events without a match are kept. The result is cached.
events_joined_df = events_df.join(documents_meta_df \
      .withColumnRenamed('source_id', 'source_id_doc_event') \
      .withColumnRenamed('publisher_id', 'publisher_doc_event') \
      .withColumnRenamed('publish_time', 'publish_time_doc_event'),
    on=F.col("document_id_event") == F.col("document_id_doc"), how='left') \
  .join(page_views_df, 
    on=[F.col('uuid_event') == F.col('uuid_pv'),
      F.col('document_id_event') == F.col('document_id_pv'),
      F.col('platform_event') == F.col('platform_pv'),
      F.col('geo_location_event') == F.col('geo_location_pv'),
      F.col('day_event') == F.col('day_pv')],
    how='left') \
  .alias('events').cache()

In [40]:
# Explicit schema for documents_categories.csv; every column is nullable.
documents_categories_schema = StructType([
    StructField("document_id_cat", IntegerType(), True),
    StructField("category_id", IntegerType(), True),
    StructField("confidence_level_cat", FloatType(), True),
])

# Raw (document, category, confidence) rows, cached.
documents_categories_df = (
    spark.read.schema(documents_categories_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "documents_categories.csv")
    .alias('documents_categories')
    .cache()
)

# One row per document: its category ids and confidence levels collected into lists,
# plus the constant marker column dummyDocumentsCategory.
documents_categories_grouped_df = (
    documents_categories_df.groupBy('document_id_cat')
    .agg(
        F.collect_list('category_id').alias('category_id_list'),
        F.collect_list('confidence_level_cat').alias('confidence_level_cat_list'),
    )
    .withColumn('dummyDocumentsCategory', F.lit(1))
    .alias('documents_categories_grouped')
)

In [41]:
# Explicit schema for documents_topics.csv; every column is nullable.
documents_topics_schema = StructType([
    StructField("document_id_top", IntegerType(), True),
    StructField("topic_id", IntegerType(), True),
    StructField("confidence_level_top", FloatType(), True),
])

# Raw (document, topic, confidence) rows, cached.
documents_topics_df = (
    spark.read.schema(documents_topics_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "documents_topics.csv")
    .alias('documents_topics')
    .cache()
)

# One row per document: its topic ids and confidence levels collected into lists,
# plus the constant marker column dummyDocumentsTopics.
documents_topics_grouped_df = (
    documents_topics_df.groupBy('document_id_top')
    .agg(
        F.collect_list('topic_id').alias('topic_id_list'),
        F.collect_list('confidence_level_top').alias('confidence_level_top_list'),
    )
    .withColumn('dummyDocumentsTopics', F.lit(1))
    .alias('documents_topics_grouped')
)

In [42]:
# Explicit schema for documents_entities.csv; entity_id is a string, every column is nullable.
documents_entities_schema = StructType([
    StructField("document_id_ent", IntegerType(), True),
    StructField("entity_id", StringType(), True),
    StructField("confidence_level_ent", FloatType(), True),
])

# Raw (document, entity, confidence) rows, cached.
documents_entities_df = (
    spark.read.schema(documents_entities_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "documents_entities.csv")
    .alias('documents_entities')
    .cache()
)

# One row per document: its entity ids and confidence levels collected into lists,
# plus the constant marker column dummyDocumentsEntities.
documents_entities_grouped_df = (
    documents_entities_df.groupBy('document_id_ent')
    .agg(
        F.collect_list('entity_id').alias('entity_id_list'),
        F.collect_list('confidence_level_ent').alias('confidence_level_ent_list'),
    )
    .withColumn('dummyDocumentsEntities', F.lit(1))
    .alias('documents_entities_grouped')
)

In [43]:
# Explicit schema for clicks_train.csv; every column is a nullable integer.
clicks_train_schema = StructType([
    StructField(field_name, IntegerType(), True)
    for field_name in ("display_id", "ad_id", "clicked")
])

# Read clicks_train.csv and add the constant marker column dummyClicksTrain.
clicks_train_df = (
    spark.read.schema(clicks_train_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "clicks_train.csv")
    .withColumn('dummyClicksTrain', F.lit(1))
    .alias('clicks_train')
)

In [44]:
# Attach to each training click, via left joins:
#   - the promoted content of the ad (on ad_id),
#   - the metadata of the promoted document,
#   - the enriched event of the display (on display_id).
# The result is registered as the temp view `clicks_train_joined` for the SQL split below.
# NOTE(review): the displayed rows show document_id_doc and dummyDocumentsMeta twice
# (ad document and event document), so those names are ambiguous in this frame.
clicks_train_joined_df = clicks_train_df \
  .join(promoted_content_df, on='ad_id', how='left') \
  .join(documents_meta_df, 
    on=F.col("promoted_content.document_id_promo") == F.col("documents_meta.document_id_doc"), 
    how='left') \
  .join(events_joined_df, on='display_id', how='left')                         
clicks_train_joined_df.createOrReplaceTempView('clicks_train_joined')

In [45]:
clicks_train_joined_df.take(2)

[Row(display_id=148, ad_id=89351, clicked=1, dummyClicksTrain=1, document_id_promo=990613, campaign_id=7617, advertiser_id=2181, dummyPromotedContent=1, source_id=9457, document_id_doc=990613, publish_time=datetime.datetime(2015, 12, 8, 16, 0), dummyDocumentsMeta=1, publisher_id=6123, uuid_event='9adce6a5363308', document_id_event=1205772, timestamp_event=11202, platform_event=2, geo_location_event='US>LA>612', dummyEvents=1, day_event=0, event_country='US', event_country_state='US>LA', source_id_doc_event=9135, document_id_doc=1205772, publish_time_doc_event=datetime.datetime(2016, 3, 29, 1, 0), dummyDocumentsMeta=1, publisher_doc_event=1042, uuid_pv='9adce6a5363308', document_id_pv=1205772, timestamp_pv=11202, platform_pv=2, geo_location_pv='US>LA>612', traffic_source_pv=1, day_pv=0),
 Row(display_id=148, ad_id=152656, clicked=0, dummyClicksTrain=1, document_id_promo=1086755, campaign_id=10511, advertiser_id=2151, dummyPromotedContent=1, source_id=7654, document_id_doc=1086755, publi

In [47]:
# Mode switch: True selects the 'user_profiles_eval' table and the train/validation split;
# False selects 'user_profiles' and builds the test set from clicks_test.csv.
evaluation = True

In [48]:
# Name of the user-profiles table to load, depending on the mode.
table_name = 'user_profiles_eval' if evaluation else 'user_profiles'

In [49]:
# Load the user-profiles parquet table named by `table_name` from the output folder
# and add the constant marker column dummyUserProfiles.
user_profiles_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+table_name) \
  .withColumn('dummyUserProfiles', F.lit(1)).alias('user_profiles')

# Splitting Train/validation set | Test set

In [50]:
# Build the working sets for the selected mode.
#   evaluation=True : validation_set_df, validation_set_ground_truth_df and train_set_df
#                     are carved out of clicks_train_joined using the exported validation
#                     display_ids.
#   evaluation=False: test_set_df is built from clicks_test.csv and train_set_df is all of
#                     clicks_train_joined_df.
# In both modes the rows are left-joined with the grouped categories/topics/entities of the
# promoted (ad) document and of the event document, and with the per-user page views of
# the promoted document.
if evaluation:       
    # Previously exported validation set (parquet).
    validation_set_exported_df = spark.read.parquet(
    OUTPUT_BUCKET_FOLDER+"validation_set.parquet") \
  .alias('validation_set') 
          
    # Expose the distinct validation display_ids to Spark SQL.
    validation_set_exported_df.select('display_id').distinct() \
    .createOrReplaceTempView("validation_display_ids")
  
  
    # Validation set: training clicks whose display_id IS in validation_display_ids.
    # The first three joins describe the promoted document; the next three (renamed
    # with a doc_event_ prefix) describe the event document.
    validation_set_df = spark.sql('''
      SELECT * FROM clicks_train_joined t 
      WHERE EXISTS (SELECT display_id FROM validation_display_ids 
      WHERE display_id = t.display_id)''').alias('clicks') \
    .join(documents_categories_grouped_df, 
      on=F.col("document_id_promo") == F.col("documents_categories_grouped.document_id_cat"), 
      how='left') \
    .join(documents_topics_grouped_df, 
      on=F.col("document_id_promo") == F.col("documents_topics_grouped.document_id_top"), 
      how='left') \
    .join(documents_entities_grouped_df, 
      on=F.col("document_id_promo") == F.col("documents_entities_grouped.document_id_ent"), 
      how='left') \
    .join(documents_categories_grouped_df \
        .withColumnRenamed('category_id_list', 'doc_event_category_id_list') \
        .withColumnRenamed('confidence_level_cat_list', 'doc_event_confidence_level_cat_list') \
        .alias('documents_event_categories_grouped'), 
      on=F.col("document_id_event") == F.col("documents_event_categories_grouped.document_id_cat"), 
      how='left') \
    .join(documents_topics_grouped_df \
        .withColumnRenamed('topic_id_list', 'doc_event_topic_id_list')
        .withColumnRenamed('confidence_level_top_list', 'doc_event_confidence_level_top_list') \
        .alias('documents_event_topics_grouped'), 
      on=F.col("document_id_event") == F.col("documents_event_topics_grouped.document_id_top"), 
      how='left') \
    .join(documents_entities_grouped_df \
        .withColumnRenamed('entity_id_list', 'doc_event_entity_id_list')
        .withColumnRenamed('confidence_level_ent_list', 'doc_event_confidence_level_ent_list') \
        .alias('documents_event_entities_grouped'), 
      on=F.col("document_id_event") == F.col("documents_event_entities_grouped.document_id_ent"), 
      how='left') \
    .join(page_views_users_df, 
      on=[F.col("clicks.uuid_event") == F.col("page_views_users.uuid_pv"),
        F.col("clicks.document_id_promo") == F.col("page_views_users.document_id_pv")], 
      how='left')
  
  #print("validation_set_df.count() =", validation_set_df.count())
      
  # Ground truth for the validation set: only the clicked rows, joined with the user
  # profile, keeping event/user information used for error statistics (avg ctr).
  # NOTE(review): the user_*_count columns are computed from category_id_list /
  # topic_id_list / entity_id_list — confirm these resolve to the intended lists.
    validation_set_ground_truth_df = validation_set_df.filter('clicked = 1') \
    .join(user_profiles_df, 
      on=[F.col("user_profiles.uuid") == F.col("uuid_event")], 
      how='left') \
    .withColumn('user_categories_count', list_len_udf('category_id_list')) \
    .withColumn('user_topics_count', list_len_udf('topic_id_list')) \
    .withColumn('user_entities_count', list_len_udf('entity_id_list')) \
    .select('display_id','ad_id','platform_event', 'day_event', 'timestamp_event', 
      'geo_location_event', 'event_country', 'event_country_state', 'views',
      'user_categories_count', 'user_topics_count', 'user_entities_count') \
    .withColumnRenamed('ad_id','ad_id_gt') \
    .withColumnRenamed('views','user_views_count') \
    .cache()
  #print("validation_set_ground_truth_df.count() =", validation_set_ground_truth_df.count())
  
    # Training set: training clicks whose display_id is NOT in validation_display_ids.
    # Cached; the count is printed.
    train_set_df = spark.sql('''
    SELECT * FROM clicks_train_joined t 
    WHERE NOT EXISTS (SELECT display_id FROM validation_display_ids 
    WHERE display_id = t.display_id)''').cache()
    print("train_set_df.count() =", train_set_df.count())
    
else:
    
    # Explicit schema for clicks_test.csv (it has no `clicked` column).
    clicks_test_schema = StructType(
    [StructField("display_id", IntegerType(), True),
    StructField("ad_id", IntegerType(), True)]
    )

    # Read clicks_test.csv; `clicked` is filled with the constant -999 and
    # dummyClicksTest with the constant 1.
    clicks_test_df = spark.read.schema(clicks_test_schema) \
    .options(header='true', inferschema='false', nullValue='\\N') \
    .csv(DATA_BUCKET_FOLDER + "clicks_test.csv") \
    .withColumn('dummyClicksTest', F.lit(1)) \
    .withColumn('clicked', F.lit(-999)) \
    .alias('clicks_test')
      
      
    # Test set: the same left joins as the validation set, applied to the test clicks.
    test_set_df = clicks_test_df \
    .join(promoted_content_df, on='ad_id', how='left') \
    .join(documents_meta_df, 
      on=F.col("promoted_content.document_id_promo") == F.col("documents_meta.document_id_doc"),
      how='left') \
    .join(documents_categories_grouped_df, 
      on=F.col("document_id_promo") == F.col("documents_categories_grouped.document_id_cat"), 
      how='left') \
    .join(documents_topics_grouped_df, 
      on=F.col("document_id_promo") == F.col("documents_topics_grouped.document_id_top"), 
      how='left') \
    .join(documents_entities_grouped_df, 
      on=F.col("document_id_promo") == F.col("documents_entities_grouped.document_id_ent"), 
      how='left') \
    .join(events_joined_df, on='display_id', how='left') \
    .join(documents_categories_grouped_df \
        .withColumnRenamed('category_id_list', 'doc_event_category_id_list')
        .withColumnRenamed('confidence_level_cat_list', 'doc_event_confidence_level_cat_list') \
        .alias('documents_event_categories_grouped'), 
      on=F.col("document_id_event") == F.col("documents_event_categories_grouped.document_id_cat"), 
      how='left') \
    .join(documents_topics_grouped_df \
        .withColumnRenamed('topic_id_list', 'doc_event_topic_id_list')
        .withColumnRenamed('confidence_level_top_list', 'doc_event_confidence_level_top_list') \
        .alias('documents_event_topics_grouped'), 
      on=F.col("document_id_event") == F.col("documents_event_topics_grouped.document_id_top"), 
      how='left') \
    .join(documents_entities_grouped_df \
        .withColumnRenamed('entity_id_list', 'doc_event_entity_id_list')
        .withColumnRenamed('confidence_level_ent_list', 'doc_event_confidence_level_ent_list') \
        .alias('documents_event_entities_grouped'), 
      on=F.col("document_id_event") == F.col("documents_event_entities_grouped.document_id_ent"), 
      how='left') \
    .join(page_views_users_df, 
      on=[F.col("events.uuid_event") == F.col("page_views_users.uuid_pv"),
        F.col("promoted_content.document_id_promo") == F.col("page_views_users.document_id_pv")], 
      how='left')

    # Without a validation split, every training click is used for training.
    train_set_df = clicks_train_joined_df.cache() 
    print("train_set_df.count() =", train_set_df.count())       


train_set_df.count() = 59761827


# Training models

In [51]:
def is_null(value):
    """Return True when ``value`` is None or its string form is blank.

    "Blank" means that ``str(value)`` is empty once surrounding whitespace
    is stripped.
    """
    if value is None:
        return True
    return not str(value).strip()

In [52]:
LESS_SPECIAL_CAT_VALUE = 'less'
def get_category_field_values_counts(field, df, min_threshold=10):
    """Count the occurrences of each value of ``field`` in ``df``.

    Args:
        field: name of the column to count.
        df: Spark DataFrame containing the column.
        min_threshold: values with fewer occurrences than this are left out.

    Returns:
        dict mapping each non-null value with at least ``min_threshold``
        occurrences to its count, plus the special key
        ``LESS_SPECIAL_CAT_VALUE`` mapped to -1.
    """
    # All (value, count) pairs are brought to the driver; filtering happens locally.
    value_count_pairs = df.select(field).groupBy(field).count().rdd.map(lambda row: (row[0], row[1])).collect()
    category_counts = {
        value: occurrences
        for value, occurrences in value_count_pairs
        if not is_null(value) and occurrences >= min_threshold
    }
    # Special value: a feature for values of this category seen fewer than min_threshold times
    category_counts[LESS_SPECIAL_CAT_VALUE] = -1
    return category_counts

## Building category values counters and indexers

In [53]:
# Counts of each event_country seen at least 10 times in events_df (plus the 'less' entry);
# the number of entries is displayed.
event_country_values_counts = get_category_field_values_counts('event_country', events_df, min_threshold=10)
len(event_country_values_counts)

222

In [54]:
# Counts of each event_country_state seen at least 10 times in events_df (plus the 'less' entry);
# the number of entries is displayed.
event_country_state_values_counts = get_category_field_values_counts('event_country_state', events_df, min_threshold=10)
len(event_country_state_values_counts)

1892

In [55]:
# Counts of each geo_location_event seen at least 10 times in events_df (plus the 'less' entry);
# the number of entries is displayed.
event_geo_location_values_counts = get_category_field_values_counts('geo_location_event', events_df, min_threshold=10)
len(event_geo_location_values_counts)

2273

In [56]:
# Counts of each entity_id seen at least 10 times in documents_entities_df (plus the 'less' entry);
# the number of entries is displayed.
doc_entity_id_values_counts = get_category_field_values_counts('entity_id', documents_entities_df, min_threshold=10)
len(doc_entity_id_values_counts)

52439

## Processing average CTR by categories

In [57]:
def get_percentiles(df, field, quantiles_levels=None, max_error_rate=0.0):
    """Return the quantiles of ``field`` as a ``{level: value}`` dict.

    Args:
        df: object exposing ``approxQuantile(field, levels, error)``
            (a Spark DataFrame).
        field: name of the numeric column.
        quantiles_levels: list of levels in [0, 1]; defaults to the deciles
            0.0, 0.1, ..., 1.0.
        max_error_rate: relative error passed straight to ``approxQuantile``.

    Returns:
        dict mapping each requested level to the corresponding quantile.
    """
    if quantiles_levels is None:
        # Build the deciles by division, not np.arange(0.0, 1.1, 0.1): the float
        # accumulation in arange yields keys such as 0.30000000000000004, so a
        # lookup like result[0.3] would fail.
        quantiles_levels = [i / 10.0 for i in range(11)]
    quantiles = df.approxQuantile(field, quantiles_levels, max_error_rate)
    return dict(zip(quantiles_levels, quantiles))

In [58]:
#REG = 10
# Constant added to the number of views in the CTR denominator (currently disabled: 0).
REG = 0
# Spark UDF (FloatType): click-through rate = clicks / (views + REG).
ctr_udf = F.udf(
    lambda n_clicks, n_views: n_clicks / float(n_views + REG),
    FloatType(),
)

### Average CTR by ad_id

In [59]:
# Per ad: number of clicks, number of impressions ("views") and the resulting CTR.
ad_id_popularity_df = (
    train_set_df.groupby('ad_id')
    .agg(F.sum('clicked').alias('clicks'), F.count('*').alias('views'))
    .withColumn('ctr', ctr_udf('clicks', 'views'))
)


In [60]:
# Driver-side dict for ads with more than 5 views: ad_id -> (ctr, views, 1, 1).
ad_id_popularity = (
    ad_id_popularity_df.filter('views > 5')
    .select('ad_id', 'ctr', 'views')
    .rdd.map(lambda row: (row['ad_id'], (row['ctr'], row['views'], 1, 1)))
    .collectAsMap()
)


In [61]:
# Broadcast the ad_id -> (ctr, views, 1, 1) dict through the SparkContext.
ad_id_popularity_broad = sc.broadcast(ad_id_popularity)

In [62]:
list(ad_id_popularity.values())[:3]

[(0.06668111681938171, 13842, 1, 1),
 (0.3735618591308594, 33550, 1, 1),
 (0.10664335638284683, 572, 1, 1)]

In [63]:
len(ad_id_popularity)

192108

In [64]:
# Unweighted mean CTR over the retained ads (index 0 of each value tuple is the CTR).
ad_id_avg_ctr = sum(stats[0] for stats in ad_id_popularity.values()) / float(len(ad_id_popularity))
ad_id_avg_ctr

0.1552832145956732

In [65]:
# Mean CTR weighted by views (index 0 = ctr, index 1 = views).
ad_id_weighted_avg_ctr = (
    sum(stats[0] * stats[1] for stats in ad_id_popularity.values())
    / float(sum(stats[1] for stats in ad_id_popularity.values()))
)
ad_id_weighted_avg_ctr

0.1940534152412439

In [66]:
# Median number of views per retained ad.
ad_id_views_median = np.median(np.array([stats[1] for stats in ad_id_popularity.values()]))
ad_id_views_median

18.0

In [67]:
# Mean number of views per retained ad.
ad_id_views_mean = sum(stats[1] for stats in ad_id_popularity.values()) / float(len(ad_id_popularity))
ad_id_views_mean

308.5930986736627

### Average CTR by document_id (promoted_content)

In [68]:
# Per promoted document: clicks, views, number of distinct ads and the resulting CTR.
document_id_popularity_df = (
    train_set_df.groupby('document_id_promo')
    .agg(
        F.sum('clicked').alias('clicks'),
        F.count('*').alias('views'),
        F.countDistinct('ad_id').alias('distinct_ad_ids'),
    )
    .withColumn('ctr', ctr_udf('clicks', 'views'))
)

In [69]:
# Driver-side dict for promoted documents with more than 5 views:
# document_id_promo -> (ctr, views, distinct_ad_ids, 1).
document_id_popularity = (
    document_id_popularity_df.filter('views > 5')
    .select('document_id_promo', 'ctr', 'views', 'distinct_ad_ids')
    .rdd.map(lambda row: (row['document_id_promo'],
                          (row['ctr'], row['views'], row['distinct_ad_ids'], 1)))
    .collectAsMap()
)

In [70]:
len(document_id_popularity)    

74767

In [71]:
# Broadcast the document_id_promo -> (ctr, views, distinct_ad_ids, 1) dict through the SparkContext.
document_id_popularity_broad = sc.broadcast(document_id_popularity)

In [72]:
# Unweighted mean CTR over the retained promoted documents.
document_id_avg_ctr = sum(stats[0] for stats in document_id_popularity.values()) / float(len(document_id_popularity))
document_id_avg_ctr

0.1504891450153877

In [73]:
# Mean CTR weighted by views (index 0 = ctr, index 1 = views).
document_id_weighted_avg_ctr = (
    sum(stats[0] * stats[1] for stats in document_id_popularity.values())
    / float(sum(stats[1] for stats in document_id_popularity.values()))
)
document_id_weighted_avg_ctr

0.19380680502581238

In [74]:
# Median number of views per retained promoted document.
document_id_views_median = np.median(np.array([stats[1] for stats in document_id_popularity.values()]))
document_id_views_median

28.0

In [75]:
# Mean number of views per retained promoted document.
document_id_views_mean = sum(stats[1] for stats in document_id_popularity.values()) / float(len(document_id_popularity))
document_id_views_mean

797.3909746278439

###  Average CTR by (doc_event, doc_ad)

In [76]:
# Per (event document, promoted document) pair: clicks, views, distinct ads and CTR.
doc_event_doc_ad_avg_ctr_df = (
    train_set_df.groupBy('document_id_event', 'document_id_promo')
    .agg(
        F.sum('clicked').alias('clicks'),
        F.count('*').alias('views'),
        F.countDistinct('ad_id').alias('distinct_ad_ids'),
    )
    .withColumn('ctr', ctr_udf('clicks', 'views'))
)

In [77]:
# Driver-side dict for pairs with more than 5 views:
# (document_id_event, document_id_promo) -> (ctr, views, distinct_ad_ids, 1).
doc_event_doc_ad_avg_ctr = (
    doc_event_doc_ad_avg_ctr_df.filter('views > 5')
    .select('document_id_event', 'document_id_promo', 'ctr', 'views', 'distinct_ad_ids')
    .rdd.map(lambda row: ((row['document_id_event'], row['document_id_promo']),
                          (row['ctr'], row['views'], row['distinct_ad_ids'], 1)))
    .collectAsMap()
)

In [78]:
len(doc_event_doc_ad_avg_ctr)

1302456

In [79]:
# Broadcast the (document_id_event, document_id_promo) -> stats dict through the SparkContext.
doc_event_doc_ad_avg_ctr_broad = sc.broadcast(doc_event_doc_ad_avg_ctr)

### Average CTR by country, source_id

In [80]:
# Per (event country, source of the promoted document): clicks, views, distinct ads and CTR.
source_id_by_country_popularity_df = (
    train_set_df.select('clicked', 'source_id', 'event_country', 'ad_id')
    .groupby('event_country', 'source_id')
    .agg(
        F.sum('clicked').alias('clicks'),
        F.count('*').alias('views'),
        F.countDistinct('ad_id').alias('distinct_ad_ids'),
    )
    .withColumn('ctr', ctr_udf('clicks', 'views'))
)

In [81]:
# Collect (country, source) pairs with more than 5 views, a non-null source and a
# non-empty country as {(event_country, source_id): (ctr, views, distinct_ad_ids, 1)}.
source_id_by_country_popularity = (
    source_id_by_country_popularity_df
    .filter('views > 5 and source_id is not null and event_country <> ""')
    .select('event_country', 'source_id', 'ctr', 'views', 'distinct_ad_ids')
    .rdd
    .map(lambda row: (
        (row['event_country'], row['source_id']),
        (row['ctr'], row['views'], row['distinct_ad_ids'], 1)
    ))
    .collectAsMap()
)
len(source_id_by_country_popularity)

29856

In [82]:
source_id_by_country_popularity_broad = sc.broadcast(source_id_by_country_popularity)

In [83]:
# Unweighted mean CTR over the kept (country, source) pairs.
source_id_by_country_avg_ctr = (
    sum(stats[0] for stats in source_id_by_country_popularity.values())
    / float(len(source_id_by_country_popularity))
)
source_id_by_country_avg_ctr

0.1860297896776317

In [84]:
# CTR averaged over the kept (country, source) pairs, weighted by views.
source_id_by_country_weighted_avg_ctr = (
    sum(stats[0] * stats[1] for stats in source_id_by_country_popularity.values())
    / float(sum(stats[1] for stats in source_id_by_country_popularity.values()))
)
source_id_by_country_weighted_avg_ctr

0.19364919644870532

In [85]:
# Median number of views per kept (country, source) pair.
source_id_by_country_views_median = np.median(
    np.array([stats[1] for stats in source_id_by_country_popularity.values()])
)
source_id_by_country_views_median

38.0

In [86]:
# Mean number of views per kept (country, source) pair.
source_id_by_country_views_mean = (
    sum(stats[1] for stats in source_id_by_country_popularity.values())
    / float(len(source_id_by_country_popularity))
)
source_id_by_country_views_mean

1999.1519962486602

### Average CTR by source_id

In [87]:
# Clicks, views, distinct ads and CTR (via ctr_udf) per source_id.
source_id_popularity_df = (
    train_set_df
    .select('clicked', 'source_id', 'ad_id')
    .groupby('source_id')
    .agg(
        F.sum('clicked').alias('clicks'),
        F.count('*').alias('views'),
        F.countDistinct('ad_id').alias('distinct_ad_ids')
    )
    .withColumn('ctr', ctr_udf('clicks', 'views'))
)

In [88]:
# Collect non-null sources with more than 10 views as
# {source_id: (ctr, views, distinct_ad_ids, 1)}.
source_id_popularity = (
    source_id_popularity_df
    .filter('views > 10 and source_id is not null')
    .select('source_id', 'ctr', 'views', 'distinct_ad_ids')
    .rdd
    .map(lambda row: (
        row['source_id'],
        (row['ctr'], row['views'], row['distinct_ad_ids'], 1)
    ))
    .collectAsMap()
)

In [89]:
len(source_id_popularity)

5628

In [90]:
source_id_popularity_broad = sc.broadcast(source_id_popularity)

### Average CTR by publisher_id

In [91]:
# Clicks, views, distinct ads and CTR (via ctr_udf) per publisher_id.
publisher_popularity_df = (
    train_set_df
    .select('clicked', 'publisher_id', 'ad_id')
    .groupby('publisher_id')
    .agg(
        F.sum('clicked').alias('clicks'),
        F.count('*').alias('views'),
        F.countDistinct('ad_id').alias('distinct_ad_ids')
    )
    .withColumn('ctr', ctr_udf('clicks', 'views'))
)

In [92]:
# Collect non-null publishers with more than 10 views as
# {publisher_id: (ctr, views, distinct_ad_ids, 1)}.
publisher_popularity = (
    publisher_popularity_df
    .filter('views > 10 and publisher_id is not null')
    .select('publisher_id', 'ctr', 'views', 'distinct_ad_ids')
    .rdd
    .map(lambda row: (
        row['publisher_id'],
        (row['ctr'], row['views'], row['distinct_ad_ids'], 1)
    ))
    .collectAsMap()
)

In [93]:
len(publisher_popularity)

4671

In [94]:
publisher_popularity_broad = sc.broadcast(publisher_popularity)

### Average CTR by advertiser_id

In [95]:
# Clicks, views, distinct ads and CTR (via ctr_udf) per advertiser_id.
advertiser_id_popularity_df = (
    train_set_df
    .select('clicked', 'advertiser_id', 'ad_id')
    .groupby('advertiser_id')
    .agg(
        F.sum('clicked').alias('clicks'),
        F.count('*').alias('views'),
        F.countDistinct('ad_id').alias('distinct_ad_ids')
    )
    .withColumn('ctr', ctr_udf('clicks', 'views'))
)

In [96]:
# Collect non-null advertisers with more than 10 views as
# {advertiser_id: (ctr, views, distinct_ad_ids, 1)}.
advertiser_id_popularity = (
    advertiser_id_popularity_df
    .filter('views > 10 and advertiser_id is not null')
    .select('advertiser_id', 'ctr', 'views', 'distinct_ad_ids')
    .rdd
    .map(lambda row: (
        row['advertiser_id'],
        (row['ctr'], row['views'], row['distinct_ad_ids'], 1)
    ))
    .collectAsMap()
)

In [97]:
len(advertiser_id_popularity)

3620

In [98]:
advertiser_id_popularity_broad = sc.broadcast(advertiser_id_popularity)

### Average CTR by campaign_id

In [99]:
# Clicks, views, distinct ads and CTR (via ctr_udf) per campaign_id.
campaign_id_popularity_df = (
    train_set_df
    .select('clicked', 'campaign_id', 'ad_id')
    .groupby('campaign_id')
    .agg(
        F.sum('clicked').alias('clicks'),
        F.count('*').alias('views'),
        F.countDistinct('ad_id').alias('distinct_ad_ids')
    )
    .withColumn('ctr', ctr_udf('clicks', 'views'))
)

In [100]:
# Collect non-null campaigns with more than 10 views as
# {campaign_id: (ctr, views, distinct_ad_ids, 1)}.
campaign_id_popularity = (
    campaign_id_popularity_df
    .filter('views > 10 and campaign_id is not null')
    .select('campaign_id', 'ctr', 'views', 'distinct_ad_ids')
    .rdd
    .map(lambda row: (
        row['campaign_id'],
        (row['ctr'], row['views'], row['distinct_ad_ids'], 1)
    ))
    .collectAsMap()
)

In [101]:
len(campaign_id_popularity)               

25270

In [102]:
campaign_id_popularity_broad = sc.broadcast(campaign_id_popularity)

### Average CTR by category

In [103]:
# Attach the categories of each promoted document (inner join), then aggregate
# clicks, views, mean category confidence, distinct ads and CTR per category_id.
category_id_popularity_df = (
    train_set_df
    .join(
        documents_categories_df.alias('cat_local'),
        on=F.col("document_id_promo") == F.col("cat_local.document_id_cat"),
        how='inner'
    )
    .select('clicked', 'category_id', 'confidence_level_cat', 'ad_id')
    .groupby('category_id')
    .agg(
        F.sum('clicked').alias('clicks'),
        F.count('*').alias('views'),
        F.mean('confidence_level_cat').alias('avg_confidence_level_cat'),
        F.countDistinct('ad_id').alias('distinct_ad_ids')
    )
    .withColumn('ctr', ctr_udf('clicks', 'views'))
)

In [104]:
# Collect categories with more than 10 views as
# {category_id: (ctr, views, distinct_ad_ids, avg_confidence_level_cat)}.
category_id_popularity = (
    category_id_popularity_df
    .filter('views > 10')
    .select('category_id', 'ctr', 'views', 'avg_confidence_level_cat', 'distinct_ad_ids')
    .rdd
    .map(lambda row: (
        row['category_id'],
        (row['ctr'], row['views'], row['distinct_ad_ids'], row['avg_confidence_level_cat'])
    ))
    .collectAsMap()
)

In [105]:
len(category_id_popularity)

95

In [106]:
category_id_popularity_broad = sc.broadcast(category_id_popularity)

In [107]:
list(category_id_popularity.values())[:5]

[(0.24857684969902039, 29512, 261, 0.2659369122112505),
 (0.1463375687599182, 306367, 1864, 0.3836574482880412),
 (0.2693001329898834, 1864184, 15180, 0.6951355794027785),
 (0.2103358805179596, 2178991, 16646, 0.5190177568460855),
 (0.1664503812789917, 167990, 2003, 0.07435643983049306)]

In [108]:
np.median(np.array(list(map(lambda x: x[1], category_id_popularity.values()))))

692507.0

In [109]:
sum(map(lambda x: x[1], category_id_popularity.values())) / float(len(category_id_popularity))

1246361.957894737

### Average CTR by (country, category)

In [110]:
# Same category aggregation as above, but grouped by (event_country, category_id).
category_id_by_country_popularity_df = (
    train_set_df
    .join(
        documents_categories_df.alias('cat_local'),
        on=F.col("document_id_promo") == F.col("cat_local.document_id_cat"),
        how='inner'
    )
    .select('clicked', 'category_id', 'confidence_level_cat', 'event_country', 'ad_id')
    .groupby('event_country', 'category_id')
    .agg(
        F.sum('clicked').alias('clicks'),
        F.count('*').alias('views'),
        F.mean('confidence_level_cat').alias('avg_confidence_level_cat'),
        F.countDistinct('ad_id').alias('distinct_ad_ids')
    )
    .withColumn('ctr', ctr_udf('clicks', 'views'))
)

In [111]:
# Collect (country, category) pairs with more than 10 views and a non-empty country as
# {(event_country, category_id): (ctr, views, distinct_ad_ids, avg_confidence_level_cat)}.
category_id_by_country_popularity = (
    category_id_by_country_popularity_df
    .filter('views > 10 and event_country <> ""')
    .select('event_country', 'category_id', 'ctr', 'views', 'avg_confidence_level_cat',
            'distinct_ad_ids')
    .rdd
    .map(lambda row: (
        (row['event_country'], row['category_id']),
        (row['ctr'], row['views'], row['distinct_ad_ids'], row['avg_confidence_level_cat'])
    ))
    .collectAsMap()
)

In [112]:
len(category_id_by_country_popularity)

10987

In [113]:
category_id_by_country_popularity_broad = sc.broadcast(category_id_by_country_popularity)

### Average CTR by Topic

In [114]:
# Attach the topics of each promoted document (inner join), then aggregate
# clicks, views, mean topic confidence, distinct ads and CTR per topic_id.
topic_id_popularity_df = (
    train_set_df
    .join(
        documents_topics_df.alias('top_local'),
        on=F.col("document_id_promo") == F.col("top_local.document_id_top"),
        how='inner'
    )
    .select('clicked', 'topic_id', 'confidence_level_top', 'ad_id')
    .groupby('topic_id')
    .agg(
        F.sum('clicked').alias('clicks'),
        F.count('*').alias('views'),
        F.mean('confidence_level_top').alias('avg_confidence_level_top'),
        F.countDistinct('ad_id').alias('distinct_ad_ids')
    )
    .withColumn('ctr', ctr_udf('clicks', 'views'))
)

In [115]:
# Collect topics with more than 10 views as
# {topic_id: (ctr, views, distinct_ad_ids, avg_confidence_level_top)}.
topic_id_popularity = (
    topic_id_popularity_df
    .filter('views > 10')
    .select('topic_id', 'ctr', 'views', 'avg_confidence_level_top', 'distinct_ad_ids')
    .rdd
    .map(lambda row: (
        row['topic_id'],
        (row['ctr'], row['views'], row['distinct_ad_ids'], row['avg_confidence_level_top'])
    ))
    .collectAsMap()
)

In [116]:
len(topic_id_popularity)

300

In [117]:
topic_id_popularity_broad = sc.broadcast(topic_id_popularity)

In [118]:
sum(map(lambda x: x[1], topic_id_popularity.values())) / float(len(topic_id_popularity))

526644.25

In [119]:
sum(map(lambda x: x[2]*x[1], topic_id_popularity.values())) / float(len(topic_id_popularity))

6998840613.206667

### Average CTR by (country, topic)

In [120]:
# Same topic aggregation as above, but grouped by (event_country, topic_id).
topic_id_by_country_popularity_df = (
    train_set_df
    .join(
        documents_topics_df.alias('top_local'),
        on=F.col("document_id_promo") == F.col("top_local.document_id_top"),
        how='inner'
    )
    .select('clicked', 'topic_id', 'confidence_level_top', 'event_country', 'ad_id')
    .groupby('event_country', 'topic_id')
    .agg(
        F.sum('clicked').alias('clicks'),
        F.count('*').alias('views'),
        F.mean('confidence_level_top').alias('avg_confidence_level_top'),
        F.countDistinct('ad_id').alias('distinct_ad_ids')
    )
    .withColumn('ctr', ctr_udf('clicks', 'views'))
)

In [121]:
# Collect (country, topic) pairs with more than 10 views and a non-empty country as
# {(event_country, topic_id): (ctr, views, distinct_ad_ids, avg_confidence_level_top)}.
# The doubled "id_id" in the variable name is kept because later cells refer to it.
topic_id_id_by_country_popularity = (
    topic_id_by_country_popularity_df
    .filter('views > 10 and event_country <> ""')
    .select('event_country', 'topic_id', 'ctr', 'views', 'avg_confidence_level_top',
            'distinct_ad_ids')
    .rdd
    .map(lambda row: (
        (row['event_country'], row['topic_id']),
        (row['ctr'], row['views'], row['distinct_ad_ids'], row['avg_confidence_level_top'])
    ))
    .collectAsMap()
)


In [122]:
len(topic_id_id_by_country_popularity) 

33071

In [123]:
topic_id_id_by_country_popularity_broad = sc.broadcast(topic_id_id_by_country_popularity)

### Average CTR by Entity

In [124]:
# Attach the entities of each promoted document (inner join), then aggregate
# clicks, views, mean entity confidence, distinct ads and CTR per entity_id.
entity_id_popularity_df = (
    train_set_df
    .join(
        documents_entities_df.alias('ent_local'),
        on=F.col("document_id_promo") == F.col("ent_local.document_id_ent"),
        how='inner'
    )
    .select('clicked', 'entity_id', 'confidence_level_ent', 'ad_id')
    .groupby('entity_id')
    .agg(
        F.sum('clicked').alias('clicks'),
        F.count('*').alias('views'),
        F.mean('confidence_level_ent').alias('avg_confidence_level_ent'),
        F.countDistinct('ad_id').alias('distinct_ad_ids')
    )
    .withColumn('ctr', ctr_udf('clicks', 'views'))
)

In [125]:
# Collect entities with more than 5 views as
# {entity_id: (ctr, views, distinct_ad_ids, avg_confidence_level_ent)}.
entity_id_popularity = (
    entity_id_popularity_df
    .filter('views > 5')
    .select('entity_id', 'ctr', 'views', 'avg_confidence_level_ent', 'distinct_ad_ids')
    .rdd
    .map(lambda row: (
        row['entity_id'],
        (row['ctr'], row['views'], row['distinct_ad_ids'], row['avg_confidence_level_ent'])
    ))
    .collectAsMap()
)

In [126]:
len(entity_id_popularity)

78120

In [127]:
entity_id_popularity_broad = sc.broadcast(entity_id_popularity)

In [128]:
np.median(np.array(list(map(lambda x: x[1], entity_id_popularity.values()))))

48.0

In [129]:
sum(map(lambda x: x[1], entity_id_popularity.values())) / float(len(entity_id_popularity))

1915.9886584741423

### Average CTR by (country, entity)

In [130]:
# Same entity aggregation as above, but grouped by (event_country, entity_id).
entity_id_by_country_popularity_df = (
    train_set_df
    .join(
        documents_entities_df.alias('ent_local'),
        on=F.col("document_id_promo") == F.col("ent_local.document_id_ent"),
        how='inner'
    )
    .select('clicked', 'entity_id', 'event_country', 'confidence_level_ent', 'ad_id')
    .groupby('event_country', 'entity_id')
    .agg(
        F.sum('clicked').alias('clicks'),
        F.count('*').alias('views'),
        F.mean('confidence_level_ent').alias('avg_confidence_level_ent'),
        F.countDistinct('ad_id').alias('distinct_ad_ids')
    )
    .withColumn('ctr', ctr_udf('clicks', 'views'))
)

In [131]:
# Collect (country, entity) pairs with more than 5 views and a non-empty country as
# {(event_country, entity_id): (ctr, views, distinct_ad_ids, avg_confidence_level_ent)}.
entity_id_by_country_popularity = (
    entity_id_by_country_popularity_df
    .filter('views > 5 and event_country <> ""')
    .select('event_country', 'entity_id', 'ctr', 'views', 'avg_confidence_level_ent',
            'distinct_ad_ids')
    .rdd
    .map(lambda row: (
        (row['event_country'], row['entity_id']),
        (row['ctr'], row['views'], row['distinct_ad_ids'], row['avg_confidence_level_ent'])
    ))
    .collectAsMap()
)

In [132]:
len(entity_id_by_country_popularity) 

217703

In [133]:
entity_id_by_country_popularity_broad = sc.broadcast(entity_id_by_country_popularity)

In [134]:
import _pickle as cPickle

In [135]:
# File names carry an '_eval' suffix when the notebook runs in evaluation mode.
df_filenames_suffix = '_eval' if evaluation else ''

In [136]:
# Load the pickled per-category counts; they are used further down as
# aspect_docs_counts in the IDF term of the content-based similarity.
# NOTE(review): 'preprocessed' and the file stem are concatenated without a path
# separator, so the file opened is
# '../data/outbrain/preprocessedcategories_docs_counts<suffix>.pickle' -- confirm
# that this matches the name used by the step that wrote the pickle.
# NOTE(review): unpickling executes arbitrary code; only load files you produced.
with open('../data/outbrain/preprocessed'+'categories_docs_counts'+df_filenames_suffix+'.pickle', 'rb') as input_file:
    categories_docs_counts = cPickle.load(input_file)    
len(categories_docs_counts)

97

In [137]:
# Load the pickled per-topic counts; they are used further down as
# aspect_docs_counts in the IDF term of the content-based similarity.
# NOTE(review): 'preprocessed' and the file stem are concatenated without a path
# separator, so the file opened is
# '../data/outbrain/preprocessedtopics_docs_counts<suffix>.pickle' -- confirm
# that this matches the name used by the step that wrote the pickle.
# NOTE(review): unpickling executes arbitrary code; only load files you produced.
with open('../data/outbrain/preprocessed'+'topics_docs_counts'+df_filenames_suffix+'.pickle', 'rb') as input_file:
    topics_docs_counts = cPickle.load(input_file)
len(topics_docs_counts)

300

In [138]:
# Load the pickled per-entity counts; they are used further down as
# aspect_docs_counts in the IDF term of the content-based similarity.
# NOTE(review): 'preprocessed' and the file stem are concatenated without a path
# separator, so the file opened is
# '../data/outbrain/preprocessedentities_docs_counts<suffix>.pickle' -- confirm
# that this matches the name used by the step that wrote the pickle.
# NOTE(review): unpickling executes arbitrary code; only load files you produced.
with open('../data/outbrain/preprocessed'+'entities_docs_counts'+df_filenames_suffix+'.pickle', 'rb') as input_file:
    entities_docs_counts = cPickle.load(input_file)
len(entities_docs_counts)

1326009

In [139]:
documents_total = documents_meta_df.count()
documents_total

2996816

## Exploring Publish Time

In [140]:
# One publish_time per distinct (promoted document, publish_time) pair,
# cast to an integer column.
publish_times_df = (
    train_set_df
    .filter('publish_time is not null')
    .select('document_id_promo', 'publish_time')
    .distinct()
    .select(F.col('publish_time').cast(IntegerType()))
)

In [141]:
publish_times_df.show(5)

+------------+
|publish_time|
+------------+
|  1435593600|
|  1441814400|
|  1458144000|
|  1464710400|
|  1446912000|
+------------+
only showing top 5 rows



In [142]:
publish_time_percentiles = get_percentiles(publish_times_df, 'publish_time', quantiles_levels=[0.5], max_error_rate=0.001)

In [143]:
publish_time_percentiles

{0.5: 1464105600.0}

In [144]:
publish_time_median = int(publish_time_percentiles[0.5])

In [145]:
datetime.datetime.utcfromtimestamp(publish_time_median)

datetime.datetime(2016, 5, 24, 16, 0)

In [146]:
def get_days_diff(newer_timestamp, older_timestamp):
    """Return the (possibly fractional) number of days between two timestamps.

    Both arguments are timestamps in seconds. The result is negative when
    ``newer_timestamp`` is earlier than ``older_timestamp``.
    """
    elapsed_seconds = newer_timestamp - older_timestamp
    # seconds -> minutes -> hours -> days
    return elapsed_seconds / 60 / 60 / 24

In [147]:
# need to think
def get_time_decay_factor(timestamp, timestamp_ref=None, alpha=0.001):
    """Exponential time-decay weight ``1 / (1 + alpha) ** age_in_days``.

    Args:
        timestamp: moment being weighted, in seconds.
        timestamp_ref: moment the age is measured from, in seconds; the
            current time (``time.time()``) is used when it is None.
        alpha: daily decay rate.

    Returns:
        The decay factor as a float, or 0.0 when the power term evaluates to 0.
    """
    reference = time.time() if timestamp_ref is None else timestamp_ref
    age_in_days = get_days_diff(reference, timestamp)
    growth = math.pow(1 + alpha, age_in_days)
    if growth == 0:
        return 0.0
    return 1.0 / growth

In [148]:
def convert_odd_timestamp(timestamp_ms_relative):
    """Turn a relative millisecond timestamp into a ``datetime``.

    The value is converted with ``int()``, shifted by a fixed offset of
    1465876799998 ms, truncated to whole seconds and passed to
    ``datetime.datetime.fromtimestamp``, which yields a naive datetime in the
    machine's local timezone.
    """
    TIMESTAMP_DELTA = 1465876799998
    absolute_ms = int(timestamp_ms_relative) + TIMESTAMP_DELTA
    return datetime.datetime.fromtimestamp(absolute_ms // 1000)

In [149]:
TIME_DECAY_ALPHA = 0.0005

In [150]:
ref_dates = [
                1476714880, # 7 days
                1474727680, # 30 days
                1469370880, # 90 days
                1461508480,  # 180 days
                1445697280, # 1 year
                1414161280 # 2 years
]

In [151]:
# Print each reference date (shown as UTC) next to its time-decay factor.
# NOTE(review): timestamp_ref is not passed, so get_time_decay_factor falls back to
# time.time(); the printed factors therefore depend on when this cell is run and
# do not correspond to the "N days" labels in ref_dates unless run at that time.
for d in ref_dates:
    print(datetime.datetime.utcfromtimestamp(d), get_time_decay_factor(d, alpha=TIME_DECAY_ALPHA))

2016-10-17 14:34:40 0.5092478658812107
2016-09-24 14:34:40 0.503426507591573
2016-07-24 14:34:40 0.48806348316584197
2016-04-24 14:34:40 0.46635952850625007
2015-10-24 14:34:40 0.42559139718160316
2014-10-24 14:34:40 0.35461239015004703


### Get local time

In [152]:
DEFAULT_TZ_EST = -4.0

In [153]:
def get_local_utc_bst_tz(event_country, event_country_state):
    """Look up the timezone offset for an event location, as a float.

    Resolution order:
      1. ``DEFAULT_TZ_EST`` when the country is empty or not present in
         ``countries_utc_dst_broad``.
      2. The country-level value from ``countries_utc_dst_broad``.
      3. For 'US' and 'CA' only, a state/province-level value when
         ``event_country_state`` is longer than two characters and its
         characters 3-4 (e.g. 'WV' in 'US>WV') are found in the matching
         state table.
    """
    if len(event_country) == 0:
        return float(DEFAULT_TZ_EST)

    country_offsets = countries_utc_dst_broad.value
    if event_country not in country_offsets:
        return float(DEFAULT_TZ_EST)

    offset = country_offsets[event_country]

    if len(event_country_state) > 2:
        state_code = event_country_state[3:5]
        # Only the US and Canada have a finer-grained lookup table.
        if event_country == 'US':
            state_offsets = us_states_utc_dst_broad.value
        elif event_country == 'CA':
            state_offsets = ca_countries_utc_dst_broad.value
        else:
            state_offsets = None
        if state_offsets is not None and state_code in state_offsets:
            offset = state_offsets[state_code]

    return float(offset)


In [154]:
get_local_utc_bst_tz('US', 'US>WV')

-4.0

In [155]:
hour_bins_dict = {'EARLY_MORNING': 0,
             'MORNING': 1,
             'MIDDAY': 2,
             'AFTERNOON': 3,
             'EVENING': 4,
             'NIGHT': 5}

In [156]:
hour_bins_values = sorted(hour_bins_dict.values())

In [157]:
def get_hour_bin(hour):
    """Map an hour of the day to its bin id from ``hour_bins_dict``.

    Bins: [5, 8) EARLY_MORNING, [8, 11) MORNING, [11, 14) MIDDAY,
    [14, 19) AFTERNOON, [19, 22) EVENING, anything else NIGHT.
    """
    if 5 <= hour < 8:
        return hour_bins_dict['EARLY_MORNING']
    if 8 <= hour < 11:
        return hour_bins_dict['MORNING']
    if 11 <= hour < 14:
        return hour_bins_dict['MIDDAY']
    if 14 <= hour < 19:
        return hour_bins_dict['AFTERNOON']
    if 19 <= hour < 22:
        return hour_bins_dict['EVENING']
    return hour_bins_dict['NIGHT']

In [158]:
def get_local_datetime(dt, event_country, event_country_state):
    """Shift ``dt`` to the local time of the event location.

    The shift is the location's offset (``get_local_utc_bst_tz``) minus
    ``DEFAULT_TZ_EST``, in hours.
    NOTE(review): this presumably assumes ``dt`` is already expressed in the
    ``DEFAULT_TZ_EST`` zone -- confirm against the caller.
    """
    hours_from_default = get_local_utc_bst_tz(event_country, event_country_state) - DEFAULT_TZ_EST
    return dt + datetime.timedelta(hours=hours_from_default)

In [159]:
get_local_datetime(datetime.datetime.now(), 'US', 'US>CA')

datetime.datetime(2020, 6, 28, 11, 3, 57, 913950)

In [160]:
def is_weekend(dt):
    """Return True when ``dt`` falls on a Saturday or Sunday."""
    FIRST_WEEKEND_WEEKDAY = 5  # weekday() counts Monday as 0, so 5 and 6 are the weekend
    day_index = dt.weekday()
    return day_index >= FIRST_WEEKEND_WEEKDAY

In [161]:
timestamp_ref = date_time_to_unix_epoch(datetime.datetime(2016, 6, 29, 3, 59, 59))

In [162]:
timestamp_ref

1467172799

In [163]:
decay_factor_default = get_time_decay_factor(publish_time_median, timestamp_ref, alpha=TIME_DECAY_ALPHA)

In [164]:
print("decay_factor_default:", decay_factor_default)

decay_factor_default: 0.98241096698168


In [165]:
#need to think
def get_confidence_sample_size(sample, max_for_reference=100000):
    """Turn a sample size into a confidence score on a logarithmic scale.

    Args:
        sample: observed sample size (may be fractional).
        max_for_reference: sample size at or above which the score is 1.0.

    Returns:
        1.0 when ``sample >= max_for_reference``; otherwise
        ``ln(1 + sample) / log2(1 + max_for_reference)``.

    The reference is taken in base 2 while the sample uses the natural log;
    the original author kept this mismatch on purpose. As a consequence the
    score stays well below 1.0 just under the reference size (about 0.69 for
    90000 with the default reference) and then jumps to 1.0.
    """
    # Saturate early, which also avoids overflow for very large sample sizes.
    if sample >= max_for_reference:
        confidence = 1.0
    else:
        reference = math.log(1 + max_for_reference, 2)
        confidence = math.log(1 + sample) / float(reference)
    return confidence

In [166]:
for i in [0,0.5,1,2,3,4,5,10,20,30,100,200,300,1000,2000,3000,10000,20000,30000, 50000, 90000, 100000, 500000, 900000, 1000000, 2171607]:
    print(i, get_confidence_sample_size(i))

0 0.0
0.5 0.024411410743763327
1 0.041731582304281624
2 0.06614299304804495
3 0.08346316460856325
4 0.09689773339641579
5 0.10787457535232657
10 0.14436755531919657
20 0.183298356035222
30 0.20674645107847822
100 0.2778577004917695
200 0.3192904933647466
300 0.34360197720285013
1000 0.41594812296601125
2000 0.4576496248565576
3000 0.48205100545505175
10000 0.5545232830964639
20000 0.5962518553291584
30000 0.6206622626822822
50000 0.6514162003061013
90000 0.6868039178501281
100000 1.0
500000 1.0
900000 1.0
1000000 1.0
2171607 1.0


In [167]:
def get_popularity(an_id, a_dict):
    """Look up the CTR and a confidence score for ``an_id``.

    ``a_dict`` maps ids to 4-tuples. Element 0 is returned as-is; the
    confidence is ``get_confidence_sample_size(element 1 / element 2)``
    multiplied by element 3. In the dictionaries built in this notebook the
    tuple is (ctr, views, distinct_ad_ids, weight), so the confidence is
    driven by the average number of views per distinct ad.

    Returns:
        ``(ctr, confidence)``, or ``(None, None)`` when the id is unknown.
    """
    if an_id not in a_dict:
        return (None, None)

    stats = a_dict[an_id]
    views_per_ad = stats[1] / float(stats[2])
    return (stats[0], get_confidence_sample_size(views_per_ad) * stats[3])



In [168]:
ad_id_popularity_broad.value[155510]

(0.0833333358168602, 168, 1, 1)

In [169]:
get_popularity(155510, ad_id_popularity_broad.value)

(0.0833333358168602, 0.3088504093192754)

In [170]:
def get_weighted_avg_popularity_from_list(ids_list, confidence_ids_list, pop_dict):
    """Confidence-weighted average CTR over a list of aspect ids.

    Each id is looked up with ``get_popularity``; ids missing from
    ``pop_dict`` are ignored. The weight of an id is its popularity
    confidence times the matching entry of ``confidence_ids_list``.

    Returns:
        ``(weighted_avg_ctr, max_weight)``, or ``(None, None)`` when none of
        the ids is known.
    """
    found = []
    for an_id, aspect_confidence in zip(ids_list, confidence_ids_list):
        ctr, pop_confidence = get_popularity(an_id, pop_dict)
        if ctr is not None:
            found.append((ctr, pop_confidence, aspect_confidence))

    if not found:
        return None, None

    weights = [pop_confidence * aspect_confidence
               for _, pop_confidence, aspect_confidence in found]
    weighted_ctr_total = sum(ctr * pop_confidence * aspect_confidence
                             for ctr, pop_confidence, aspect_confidence in found)
    return weighted_ctr_total / float(sum(weights)), max(weights)

In [171]:
def get_weighted_avg_country_popularity_from_list(event_country, ids_list, confidence_ids_list, pop_dict):
    """Confidence-weighted average CTR over aspect ids within one country.

    Works like the country-agnostic variant, except that ``pop_dict`` is
    keyed by ``(event_country, an_id)`` pairs. Pairs missing from
    ``pop_dict`` are ignored.

    Returns:
        ``(weighted_avg_ctr, max_weight)``, or ``(None, None)`` when none of
        the pairs is known.
    """
    found = []
    for an_id, aspect_confidence in zip(ids_list, confidence_ids_list):
        ctr, pop_confidence = get_popularity((event_country, an_id), pop_dict)
        if ctr is not None:
            found.append((ctr, pop_confidence, aspect_confidence))

    if not found:
        return None, None

    weights = [pop_confidence * aspect_confidence
               for _, pop_confidence, aspect_confidence in found]
    weighted_ctr_total = sum(ctr * pop_confidence * aspect_confidence
                             for ctr, pop_confidence, aspect_confidence in found)
    return weighted_ctr_total / float(sum(weights)), max(weights)

In [172]:
def get_popularity_score(event_country, ad_id, document_id, source_id,
      publisher_id, advertiser_id, campaign_id, document_id_event,
      category_ids_by_doc, cat_confidence_level_by_doc,
      topic_ids_by_doc, top_confidence_level_by_doc,
      entity_ids_by_doc, ent_confidence_level_by_doc,
      output_detailed_list=False):
    """Combine every available popularity (average CTR) signal for one ad impression.

    Signals are looked up in the broadcast popularity dictionaries, in this
    order: ad, promoted document, (event document, promoted document),
    source by country, source, publisher, advertiser, campaign, then for
    entities, topics and categories the by-country signal followed by the
    global one. A signal is kept only when its lookup yields a CTR.
    By-country signals are skipped when ``event_country`` is empty; source
    signals are skipped when ``source_id`` is -1; publisher, advertiser and
    campaign signals are skipped when the id is None.

    Returns:
        * ``output_detailed_list=True``: the list of
          ``(label, avg_ctr, confidence)`` tuples in lookup order.
        * otherwise ``(weighted_avg_ctr, max_confidence)`` where the average
          weights each CTR by its confidence, or ``(None, None)`` when no
          signal was found.

    The label 'pop_campain_id' is misspelled but kept as-is because it is
    part of the function's output.
    """
    probs = []

    def add_signal(label, ctr_and_confidence):
        # Record a signal only when its lookup produced a CTR.
        signal_ctr, signal_confidence = ctr_and_confidence
        if signal_ctr is not None:
            probs.append((label, signal_ctr, signal_confidence))

    has_country = event_country != ''

    # Signals whose keys are always present.
    add_signal('pop_ad_id', get_popularity(ad_id, ad_id_popularity_broad.value))
    add_signal('pop_document_id',
               get_popularity(document_id, document_id_popularity_broad.value))
    add_signal('pop_doc_event_doc_ad',
               get_popularity((document_id_event, document_id),
                              doc_event_doc_ad_avg_ctr_broad.value))

    # -1 is presumably the "missing" marker (INT_DEFAULT_NULL_VALUE) for source_id.
    if source_id != -1:
        if has_country:
            add_signal('pop_source_id_country',
                       get_popularity((event_country, source_id),
                                      source_id_by_country_popularity_broad.value))
        add_signal('pop_source_id',
                   get_popularity(source_id, source_id_popularity_broad.value))

    # Single-id signals that may be missing (None).
    optional_ids = (
        ('pop_publisher_id', publisher_id, publisher_popularity_broad),
        ('pop_advertiser_id', advertiser_id, advertiser_id_popularity_broad),
        ('pop_campain_id', campaign_id, campaign_id_popularity_broad),
    )
    for label, an_id, popularity_broad in optional_ids:
        if an_id is not None:
            add_signal(label, get_popularity(an_id, popularity_broad.value))

    # List-valued aspects of the promoted document: by-country signal first,
    # then the global one.
    aspect_groups = (
        ('pop_entity_id_country', 'pop_entity_id',
         entity_ids_by_doc, ent_confidence_level_by_doc,
         entity_id_by_country_popularity_broad, entity_id_popularity_broad),
        ('pop_topic_id_country', 'pop_topic_id',
         topic_ids_by_doc, top_confidence_level_by_doc,
         topic_id_id_by_country_popularity_broad, topic_id_popularity_broad),
        ('pop_category_id_country', 'pop_category_id',
         category_ids_by_doc, cat_confidence_level_by_doc,
         category_id_by_country_popularity_broad, category_id_popularity_broad),
    )
    for country_label, global_label, ids, confidences, country_broad, global_broad in aspect_groups:
        if len(ids) == 0:
            continue
        if has_country:
            add_signal(country_label,
                       get_weighted_avg_country_popularity_from_list(
                           event_country, ids, confidences, country_broad.value))
        add_signal(global_label,
                   get_weighted_avg_popularity_from_list(
                       ids, confidences, global_broad.value))

    if output_detailed_list:
        return probs

    if len(probs) == 0:
        return None, None

    confidences = [signal_confidence for _, _, signal_confidence in probs]
    weighted_avg_probs_by_confidence = (
        sum(signal_ctr * signal_confidence for _, signal_ctr, signal_confidence in probs)
        / float(sum(confidences))
    )
    return weighted_avg_probs_by_confidence, max(confidences)


## Content-Based similarity functions

In [173]:
def cosine_similarity_dicts(dict1, dict2):
    """Cosine similarity between two sparse vectors stored as ``{key: weight}`` dicts.

    Args:
        dict1: first vector.
        dict2: second vector.

    Returns:
        ``(similarity, intersections)`` where ``intersections`` is the number
        of keys present in both dicts. When either vector has zero norm (an
        empty dict or only zero weights) the similarity is 0.0 instead of
        raising ``ZeroDivisionError``.
    """
    dict1_norm = math.sqrt(sum(v ** 2 for v in dict1.values()))
    dict2_norm = math.sqrt(sum(v ** 2 for v in dict2.values()))

    common_keys = [key for key in dict1 if key in dict2]
    # Start from 0.0 so the dot product is a float even for integer weights.
    sum_common_aspects = sum((dict1[key] * dict2[key] for key in common_keys), 0.0)
    intersections = len(common_keys)

    norms_product = dict1_norm * dict2_norm
    if norms_product == 0:
        # A zero vector has no direction; treat it as having no similarity.
        return 0.0, intersections

    return sum_common_aspects / norms_product, intersections


In [174]:
def cosine_similarity_user_docs_aspects(user_aspect_profile, doc_aspect_ids, doc_aspects_confidence, aspect_docs_counts):
    """Cosine similarity between a user's aspect profile and a document's aspects.

    Args:
        user_aspect_profile: mapping ``aspect_id -> (tfidf, confidence)``.
        doc_aspect_ids: aspect ids of the document.
        doc_aspects_confidence: confidence for each id in ``doc_aspect_ids``.
        aspect_docs_counts: mapping ``aspect_id -> count`` used in the IDF
            term; its size is also used as the number of existing aspects.

    Returns:
        ``(similarity, confidence)``, or ``(None, None)`` when the profile or
        the document's aspect list is missing or empty. The confidence is
        ``1 - random_error``, where ``random_error`` estimates how likely the
        observed overlap (or lack of it) would be by chance.
    """
    if user_aspect_profile is None or len(user_aspect_profile) == 0:
        return None, None
    if doc_aspect_ids is None or len(doc_aspect_ids) == 0:
        return None, None

    TERM_FREQUENCY = 1.0

    # Document vector: tf * idf * confidence, with idf a double logarithm of
    # documents_total over the aspect's count.
    doc_aspects = dict(zip(doc_aspect_ids, doc_aspects_confidence))
    doc_vector = {
        aspect_id: TERM_FREQUENCY
                   * math.log(math.log(documents_total / float(aspect_docs_counts[aspect_id])))
                   * aspect_confidence
        for aspect_id, aspect_confidence in doc_aspects.items()
    }

    # User vector: stored tf-idf scaled by the stored confidence.
    user_vector = {
        aspect_id: profile_entry[0] * profile_entry[1]
        for aspect_id, profile_entry in user_aspect_profile.items()
    }

    similarity, intersections = cosine_similarity_dicts(doc_vector, user_vector)

    aspects_total = float(len(aspect_docs_counts))
    doc_share = len(doc_aspects) / aspects_total
    user_share = len(user_aspect_profile) / aspects_total

    if intersections > 0:
        # P(A intersect B)_intersections = P(A)^intersections * P(B)^intersections
        random_error = math.pow(doc_share, intersections) * math.pow(user_share, intersections)
    else:
        # P(A not intersect B) = 1 - P(A intersect B)
        random_error = 1 - (doc_share * user_share)

    return similarity, 1.0 - random_error


In [175]:
def cosine_similarity_doc_event_doc_ad_aspects(doc_event_aspect_ids, doc_event_aspects_confidence,
        doc_ad_aspect_ids, doc_ad_aspects_confidence,
        aspect_docs_counts):
    """Cosine similarity between the aspects of the event document and of the ad document.

    Args:
        doc_event_aspect_ids: aspect ids of the event document.
        doc_event_aspects_confidence: confidence for each event aspect id.
        doc_ad_aspect_ids: aspect ids of the ad document.
        doc_ad_aspects_confidence: confidence for each ad aspect id.
        aspect_docs_counts: mapping ``aspect_id -> count`` used in the IDF
            term; its size is also used as the number of existing aspects.

    Returns:
        ``(similarity, confidence)``, or ``(None, None)`` when either id list
        is missing or empty. The confidence is ``1 - random_error``, where
        ``random_error`` estimates how likely the observed overlap (or lack
        of it) would be by chance.
    """
    if doc_event_aspect_ids is None or len(doc_event_aspect_ids) == 0:
        return None, None
    if doc_ad_aspect_ids is None or len(doc_ad_aspect_ids) == 0:
        return None, None

    TERM_FREQUENCY = 1.0

    def weighted_vector(aspect_ids, aspect_confidences):
        # tf * idf * confidence per aspect, with idf a double logarithm of
        # documents_total over the aspect's count.
        confidence_by_id = dict(zip(aspect_ids, aspect_confidences))
        return {
            aspect_id: TERM_FREQUENCY
                       * math.log(math.log(documents_total / float(aspect_docs_counts[aspect_id])))
                       * confidence_by_id[aspect_id]
            for aspect_id in aspect_ids
        }

    doc_event_vector = weighted_vector(doc_event_aspect_ids, doc_event_aspects_confidence)
    doc_ad_vector = weighted_vector(doc_ad_aspect_ids, doc_ad_aspects_confidence)

    similarity, intersections = cosine_similarity_dicts(doc_event_vector, doc_ad_vector)

    aspects_total = float(len(aspect_docs_counts))
    event_share = len(doc_event_aspect_ids) / aspects_total
    ad_share = len(doc_ad_aspect_ids) / aspects_total

    if intersections > 0:
        # P(A intersect B)_intersections = P(A)^intersections * P(B)^intersections
        random_error = math.pow(event_share, intersections) * math.pow(ad_share, intersections)
    else:
        # P(A not intersect B) = 1 - P(A intersect B)
        random_error = 1 - (event_share * ad_share)

    return similarity, 1.0 - random_error

In [176]:
def get_user_cb_interest_score(user_views_count, user_categories, user_topics, user_entities,
        timestamp_event, category_ids_by_doc, cat_confidence_level_by_doc,
        topic_ids_by_doc, top_confidence_level_by_doc,
        entity_ids_by_doc, ent_confidence_level_by_doc,
        output_detailed_list=False):
    """Content-based interest of a user in a document.

    Compares the user's category, topic and entity profiles with the
    document's categories, topics and entities through
    ``cosine_similarity_user_docs_aspects``. ``user_views_count`` and
    ``timestamp_event`` are accepted but not used.

    Returns:
        * ``output_detailed_list=True``: the list of
          ``(label, similarity, confidence)`` tuples that could be computed.
        * otherwise ``(weighted_avg_similarity, mean_confidence)`` where each
          similarity is weighted by its confidence, or ``(None, None)`` when
          no similarity could be computed.
    """
    # Content-Based
    aspect_inputs = (
        ('user_doc_ad_sim_categories', user_categories,
         category_ids_by_doc, cat_confidence_level_by_doc, categories_docs_counts),
        ('user_doc_ad_sim_topics', user_topics,
         topic_ids_by_doc, top_confidence_level_by_doc, topics_docs_counts),
        ('user_doc_ad_sim_entities', user_entities,
         entity_ids_by_doc, ent_confidence_level_by_doc, entities_docs_counts),
    )

    sims = []
    for label, user_profile, doc_ids, doc_confidences, docs_counts in aspect_inputs:
        similarity, sim_confidence = cosine_similarity_user_docs_aspects(
            user_profile, doc_ids, doc_confidences, docs_counts)
        if similarity is not None:
            sims.append((label, similarity, sim_confidence))

    if output_detailed_list:
        return sims

    if len(sims) == 0:
        return None, None

    confidence_total = sum(sim_confidence for _, _, sim_confidence in sims)
    weighted_avg_sim_by_confidence = (
        sum(similarity * sim_confidence for _, similarity, sim_confidence in sims)
        / float(confidence_total)
    )
    confidence = confidence_total / float(len(sims))
    return weighted_avg_sim_by_confidence, confidence

In [177]:
def get_doc_event_doc_ad_cb_similarity_score(doc_event_category_ids, doc_event_cat_confidence_levels, 
        doc_event_topic_ids, doc_event_top_confidence_levels,
        doc_event_entity_ids, doc_event_ent_confidence_levels, 
        doc_ad_category_ids, doc_ad_cat_confidence_levels, 
        doc_ad_topic_ids, doc_ad_top_confidence_levels,
        doc_ad_entity_ids, doc_ad_ent_confidence_levels,
        output_detailed_list=False):
    """Content-based similarity between the event document and the ad document.

    A (similarity, confidence) pair is obtained from
    ``cosine_similarity_doc_event_doc_ad_aspects`` for each of the three
    aspects (categories, topics, entities), using the module-level
    ``categories_docs_counts`` / ``topics_docs_counts`` / ``entities_docs_counts``.
    Aspects whose similarity is ``None`` are left out.

    Args:
        doc_event_category_ids, doc_event_cat_confidence_levels: Category ids
            of the event document and their confidence levels.
        doc_event_topic_ids, doc_event_top_confidence_levels: Same, for topics.
        doc_event_entity_ids, doc_event_ent_confidence_levels: Same, for entities.
        doc_ad_category_ids, doc_ad_cat_confidence_levels: Category ids of the
            ad document and their confidence levels.
        doc_ad_topic_ids, doc_ad_top_confidence_levels: Same, for topics.
        doc_ad_entity_ids, doc_ad_ent_confidence_levels: Same, for entities.
        output_detailed_list: When true, return the per-aspect list instead of
            the aggregate.

    Returns:
        If ``output_detailed_list`` is true: a list of
        ``(feature_name, similarity, confidence)`` tuples (possibly empty).
        Otherwise: ``(weighted_avg_similarity, mean_confidence)``, or
        ``(None, None)`` when no aspect produced a similarity or when the
        confidences sum to zero (the weighted average is undefined then).
    """
    aspect_inputs = (
        ('doc_event_doc_ad_sim_categories',
         doc_event_category_ids, doc_event_cat_confidence_levels,
         doc_ad_category_ids, doc_ad_cat_confidence_levels,
         categories_docs_counts),
        ('doc_event_doc_ad_sim_topics',
         doc_event_topic_ids, doc_event_top_confidence_levels,
         doc_ad_topic_ids, doc_ad_top_confidence_levels,
         topics_docs_counts),
        ('doc_event_doc_ad_sim_entities',
         doc_event_entity_ids, doc_event_ent_confidence_levels,
         doc_ad_entity_ids, doc_ad_ent_confidence_levels,
         entities_docs_counts),
    )

    sims = []
    for (feature_name, event_ids, event_confidences,
         ad_ids, ad_confidences, docs_counts) in aspect_inputs:
        similarity, sim_confidence = cosine_similarity_doc_event_doc_ad_aspects(
            event_ids, event_confidences, ad_ids, ad_confidences, docs_counts)
        if similarity is not None:
            sims.append((feature_name, similarity, sim_confidence))

    if output_detailed_list:
        return sims

    if not sims:
        return None, None

    total_confidence = float(sum(sim_confidence for _, _, sim_confidence in sims))
    if total_confidence == 0:
        # All retained aspects have zero confidence: dividing by the total would
        # raise ZeroDivisionError, so report "no score" like the empty case.
        return None, None

    weighted_avg_sim_by_confidence = sum(
        similarity * sim_confidence for _, similarity, sim_confidence in sims) / total_confidence
    confidence = total_confidence / len(sims)
    return weighted_avg_sim_by_confidence, confidence


# Feature Vector export

In [178]:
# Names of the boolean (stored as 0.0 / 1.0) features of the feature vector.
bool_feature_names = [
    'event_weekend',
    'user_has_already_viewed_doc',
]

In [179]:
# Names of the integer-valued (count / day / hour-bin) features of the feature vector.
int_feature_names = [
    'user_views', 'ad_views', 'doc_views',
    'doc_event_days_since_published', 'doc_event_hour',
    'doc_ad_days_since_published',
]

In [180]:
# Every float score is exported as three consecutive columns: the score itself,
# its confidence ('_conf') and score * confidence ('_conf_multipl').
float_feature_names = [
    base_name + suffix
    for base_name in (
        # Popularity scores
        'pop_ad_id',
        'pop_document_id',
        'pop_publisher_id',
        'pop_advertiser_id',
        'pop_campain_id',
        'pop_doc_event_doc_ad',
        'pop_source_id',
        'pop_source_id_country',
        'pop_entity_id',
        'pop_entity_id_country',
        'pop_topic_id',
        'pop_topic_id_country',
        'pop_category_id',
        'pop_category_id_country',
        # User profile vs. ad document content similarity
        'user_doc_ad_sim_categories',
        'user_doc_ad_sim_topics',
        'user_doc_ad_sim_entities',
        # Event document vs. ad document content similarity
        'doc_event_doc_ad_sim_categories',
        'doc_event_doc_ad_sim_topics',
        'doc_event_doc_ad_sim_entities',
    )
    for suffix in ('', '_conf', '_conf_multipl')
]

In [181]:
# Field names of the categorical features, shared by the feature-vector builders.

# Event / page-view context
TRAFFIC_SOURCE_FV = 'traffic_source'
EVENT_HOUR_FV = 'event_hour'
EVENT_COUNTRY_FV = 'event_country'
EVENT_COUNTRY_STATE_FV = 'event_country_state'
EVENT_GEO_LOCATION_FV = 'event_geo_location'
EVENT_PLATFORM_FV = 'event_platform'

# Ad and document metadata
AD_ADVERTISER_FV = 'ad_advertiser'
DOC_AD_SOURCE_ID_FV = 'doc_ad_source_id'
DOC_AD_PUBLISHER_ID_FV = 'doc_ad_publisher_id'
DOC_EVENT_SOURCE_ID_FV = 'doc_event_source_id'
DOC_EVENT_PUBLISHER_ID_FV = 'doc_event_publisher_id'

# Multi-valued content aspects of the ad document and of the event document
DOC_AD_CATEGORY_ID_FV = 'doc_ad_category_id'
DOC_AD_TOPIC_ID_FV = 'doc_ad_topic_id'
DOC_AD_ENTITY_ID_FV = 'doc_ad_entity_id'
DOC_EVENT_CATEGORY_ID_FV = 'doc_event_category_id'
DOC_EVENT_TOPIC_ID_FV = 'doc_event_topic_id'
DOC_EVENT_ENTITY_ID_FV = 'doc_event_entity_id'

### Configuring feature vector

In [182]:
# Categorical columns of the "integral" layout (one numeric code per column).
# Multi-valued aspects get a fixed number of numbered slots: 3 per document for
# categories and topics, 6 for entities.
category_feature_names_integral = (
    ['ad_advertiser']
    + ['doc_ad_category_id_{}'.format(slot) for slot in range(1, 4)]
    + ['doc_ad_topic_id_{}'.format(slot) for slot in range(1, 4)]
    + ['doc_ad_entity_id_{}'.format(slot) for slot in range(1, 7)]
    + ['doc_ad_publisher_id', 'doc_ad_source_id']
    + ['doc_event_category_id_{}'.format(slot) for slot in range(1, 4)]
    + ['doc_event_topic_id_{}'.format(slot) for slot in range(1, 4)]
    + ['doc_event_entity_id_{}'.format(slot) for slot in range(1, 7)]
    + ['doc_event_publisher_id', 'doc_event_source_id']
    + ['event_country',
       'event_country_state',
       'event_geo_location',
       'event_hour',
       'event_platform',
       'traffic_source']
)


In [183]:
# Full column layout of the "integral" feature vector; the order of this
# concatenation fixes the index of every feature.
feature_vector_labels_integral = (
    bool_feature_names
    + int_feature_names
    + float_feature_names
    + category_feature_names_integral
)


In [184]:
# Lookup from feature label to its position in the "integral" feature vector.
feature_vector_labels_integral_dict = {
    label: position
    for position, label in enumerate(feature_vector_labels_integral)
}


In [185]:
# Persist the column layout, one label per line (no trailing newline), so that
# consumers of the exported vectors can map indices back to names.
with open('../data/outbrain/preprocessed/'+'feature_vector_labels_integral.txt', 'w') as labels_file:
    labels_file.write('\n'.join(feature_vector_labels_integral))

In [186]:
def set_feature_vector_cat_value(field_name, field_value, feature_vector):
    """Switch on the one-hot slot of a categorical value in ``feature_vector``.

    Nothing is written when ``field_value`` is null (per ``is_null``) or when
    its string form is ``'-1'``. When the one-hot feature name built by
    ``get_ohe_feature_name`` is not a known label, the slot of the field's
    ``LESS_SPECIAL_CAT_VALUE`` bucket is used instead.

    Args:
        field_name: Name of the categorical field.
        field_value: Value of the field for the current row.
        feature_vector: Dict of feature index -> value, updated in place.
    """
    if is_null(field_value) or str(field_value) == '-1':
        return

    ohe_feature_name = get_ohe_feature_name(field_name, field_value)
    if ohe_feature_name not in feature_vector_labels_dict:
        # Unpopular category value: fall back to the shared bucket of this field.
        ohe_feature_name = get_ohe_feature_name(field_name, LESS_SPECIAL_CAT_VALUE)

    feature_vector[feature_vector_labels_dict[ohe_feature_name]] = 1.0
    

In [187]:
def set_feature_vector_cat_values(field_name, field_values, feature_vector):
    """Apply ``set_feature_vector_cat_value`` to every value of a multi-valued field.

    Args:
        field_name: Name of the categorical field.
        field_values: Iterable with the values of the field for the current row.
        feature_vector: Dict of feature index -> value, updated in place.
    """
    for single_value in field_values:
        set_feature_vector_cat_value(field_name, single_value, feature_vector)


In [188]:
def get_ad_feature_vector(user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
        event_country, event_country_state,
        ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
        geo_location_event, 
        doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
        traffic_source_pv, advertiser_id, publisher_id,
        campaign_id, document_id_event,
        doc_ad_category_ids, doc_ad_cat_confidence_levels, 
        doc_ad_topic_ids, doc_ad_top_confidence_levels,
        doc_ad_entity_ids, doc_ad_ent_confidence_levels,
        doc_event_category_ids, doc_event_cat_confidence_levels,
        doc_event_topic_ids, doc_event_top_confidence_levels,
        doc_event_entity_ids, doc_event_ent_confidence_levels):
    """Build the sparse feature vector of one (event, ad) pair.

    Feature positions come from the module-level ``feature_vector_labels_dict``.
    Values are collected in a dict of index -> value, in this order:

    * user / ad / document view counts and the "user already viewed doc" flag;
    * days elapsed between publication and the event, for the ad document and
      the event document (kept only when between 0 and 3650 days);
    * local hour bin and weekend flag of the event;
    * popularity scores, user-vs-ad-document similarities and
      event-document-vs-ad-document similarities, each stored as the score, its
      confidence (``_conf``) and their product (``_conf_multipl``);
    * ``traffic_source`` and ``event_platform``, each decremented by one;
    * categorical fields, through ``set_feature_vector_cat_value`` /
      ``set_feature_vector_cat_values``.

    Features whose input is missing are simply not set.

    Returns:
        ``SparseVector`` of size ``len(feature_vector_labels_dict)`` built from
        the collected index -> value dict.

    Raises:
        Exception: any error raised while building the vector is re-raised as a
            generic ``Exception`` whose arguments are a message listing all
            input parameters and the original exception.
    """
             
    try:

        # Sparse representation: feature index -> value
        feature_vector = {}
        
        if user_views_count != None:
            feature_vector[feature_vector_labels_dict['user_views']] = float(user_views_count)
         
        if user_doc_ids_viewed != None:
            # 1.0 when document_id is among the documents the user has viewed, else 0.0
            feature_vector[feature_vector_labels_dict['user_has_already_viewed_doc']] = float(document_id in user_doc_ids_viewed)               
          
        # View counts are read from element [1] of the broadcast popularity entries
        if ad_id in ad_id_popularity_broad.value:            
            feature_vector[feature_vector_labels_dict['ad_views']] = float(ad_id_popularity_broad.value[ad_id][1])
        
        if document_id in document_id_popularity_broad.value:
            feature_vector[feature_vector_labels_dict['doc_views']] = float(document_id_popularity_broad.value[document_id][1])            
            
        # Time-derived features are only computed for a non-negative event timestamp
        if timestamp_event > -1:
            dt_timestamp_event = convert_odd_timestamp(timestamp_event)
            if doc_ad_publish_time != None:
                delta_days = (dt_timestamp_event - doc_ad_publish_time).days
                if delta_days >= 0 and delta_days <= 365*10: #10 years
                    feature_vector[feature_vector_labels_dict['doc_ad_days_since_published']] = float(delta_days)
                        
            if doc_event_publish_time != None:
                delta_days = (dt_timestamp_event - doc_event_publish_time).days
                if delta_days >= 0 and delta_days <= 365*10: #10 years
                    feature_vector[feature_vector_labels_dict['doc_event_days_since_published']] = float(delta_days)
                    
            
            #Local period of the day (hours)
            dt_local_timestamp_event = get_local_datetime(dt_timestamp_event, event_country, event_country_state)    
            local_hour_bin = get_hour_bin(dt_local_timestamp_event.hour)            
            feature_vector[feature_vector_labels_dict['doc_event_hour']] = float(local_hour_bin) #Hour for Decision Trees
            set_feature_vector_cat_value(EVENT_HOUR_FV, local_hour_bin, feature_vector) #Period of day for FFM
            
            #Weekend
            weekend = int(is_weekend(dt_local_timestamp_event))
            feature_vector[feature_vector_labels_dict['event_weekend']] = float(weekend)                                                      
        
        # Suffixes appended to a score's feature name to address its companion columns
        conf_field_suffix = '_conf'
        conf_multiplied_field_suffix = '_conf_multipl'
        
        #Setting Popularity fields
        pop_scores = get_popularity_score(event_country, ad_id, document_id, source_id, 
            publisher_id, advertiser_id, campaign_id, document_id_event,
            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
            doc_ad_topic_ids, doc_ad_top_confidence_levels,
            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
            output_detailed_list=True)
        
                                

        # Each score is used as (feature_name, value, confidence)
        for score in pop_scores:
            feature_vector[feature_vector_labels_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]

        #Setting User-Doc_ad CB Similarity fields
        user_doc_ad_cb_sim_scores = get_user_cb_interest_score(user_views_count, user_categories, user_topics, user_entities, 
            timestamp_event, 
            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
            doc_ad_topic_ids, doc_ad_top_confidence_levels,
            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
            output_detailed_list=True)

        for score in user_doc_ad_cb_sim_scores:
            feature_vector[feature_vector_labels_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]
            
        #Setting Doc_event-doc_ad CB Similarity fields
        doc_event_doc_ad_cb_sim_scores = get_doc_event_doc_ad_cb_similarity_score(
            doc_event_category_ids, doc_event_cat_confidence_levels,
            doc_event_topic_ids, doc_event_top_confidence_levels,
            doc_event_entity_ids, doc_event_ent_confidence_levels,
            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
            doc_ad_topic_ids, doc_ad_top_confidence_levels,
            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
            output_detailed_list=True)
        
        for score in doc_event_doc_ad_cb_sim_scores:
            feature_vector[feature_vector_labels_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]
            
        # traffic_source and platform_event are stored decremented by one
        # (presumably to make the codes zero-based - TODO confirm)
        if traffic_source_pv != None:
            feature_vector[feature_vector_labels_dict[TRAFFIC_SOURCE_FV]] = int(traffic_source_pv - 1)
        if platform_event != None:
            feature_vector[feature_vector_labels_dict[EVENT_PLATFORM_FV]] = int(platform_event - 1)
        
        # Categorical fields, written through the one-hot helper
        # set_feature_vector_cat_value(TRAFFIC_SOURCE_FV, traffic_source_pv, feature_vector)
        set_feature_vector_cat_value(EVENT_COUNTRY_FV, event_country, feature_vector)
        set_feature_vector_cat_value(EVENT_COUNTRY_STATE_FV, event_country_state, feature_vector)         
        set_feature_vector_cat_value(EVENT_GEO_LOCATION_FV, geo_location_event, feature_vector)
        # set_feature_vector_cat_value(EVENT_PLATFORM_FV, platform_event, feature_vector)
        set_feature_vector_cat_value(AD_ADVERTISER_FV, advertiser_id, feature_vector)
        set_feature_vector_cat_value(DOC_AD_SOURCE_ID_FV, source_id, feature_vector)
        set_feature_vector_cat_value(DOC_AD_PUBLISHER_ID_FV, publisher_id, feature_vector)
        set_feature_vector_cat_value(DOC_EVENT_SOURCE_ID_FV, doc_event_source_id, feature_vector)
        set_feature_vector_cat_value(DOC_EVENT_PUBLISHER_ID_FV, doc_event_publisher_id, feature_vector)
        set_feature_vector_cat_values(DOC_AD_CATEGORY_ID_FV, doc_ad_category_ids, feature_vector)
        set_feature_vector_cat_values(DOC_AD_TOPIC_ID_FV, doc_ad_topic_ids, feature_vector)
        set_feature_vector_cat_values(DOC_AD_ENTITY_ID_FV, doc_ad_entity_ids, feature_vector)
        set_feature_vector_cat_values(DOC_EVENT_CATEGORY_ID_FV, doc_event_category_ids, feature_vector)
        set_feature_vector_cat_values(DOC_EVENT_TOPIC_ID_FV, doc_event_topic_ids, feature_vector)
        set_feature_vector_cat_values(DOC_EVENT_ENTITY_ID_FV, doc_event_entity_ids, feature_vector)
        
        #Creating a dummy column as the last column because xgboost has a problem if the last column is undefined for all rows, 
        #saying that the dimensions of data and feature_names do not match
        #feature_vector[feature_vector_labels_dict[DUMMY_FEATURE_COLUMN]] = float(0)
            
        #Ensuring that all elements are floats for compatibility with UDF output (ArrayType(FloatType()))
        #feature_vector = list([float(x) for x in feature_vector])
        
    except Exception as e:
        # Re-raise with every input attached so the failing row can be identified
        raise Exception("[get_ad_feature_vector] ERROR PROCESSING FEATURE VECTOR! Params: {}"
            .format([user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
                event_country, event_country_state,
                ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
                geo_location_event, 
                doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
                traffic_source_pv, advertiser_id, publisher_id,
                campaign_id, document_id_event,
                doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                doc_ad_topic_ids, doc_ad_top_confidence_levels,
                doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                doc_event_category_ids, doc_event_cat_confidence_levels,
                doc_event_topic_ids, doc_event_top_confidence_levels,
                doc_event_entity_ids, doc_event_ent_confidence_levels]),
            e)
    
    return SparseVector(len(feature_vector_labels_dict), feature_vector)


In [189]:
# Spark UDF around get_ad_feature_vector. The columns passed to the UDF are
# forwarded positionally, so they must be given in the exact parameter order
# of get_ad_feature_vector (34 arguments). The result is an ML vector column.
get_ad_feature_vector_udf = F.udf(
    lambda *column_values: get_ad_feature_vector(*column_values),
    VectorUDT())

### Building feature vectors

In [190]:
def set_feature_vector_cat_value_integral(field_name, field_value, feature_vector):
    """Store a categorical code as a float in the column named ``field_name``.

    The column index is looked up in ``feature_vector_labels_integral_dict``.
    Null values (per ``is_null``) are skipped; unlike the one-hot variant, a
    value of -1 is stored like any other.

    Args:
        field_name: Label of the target column.
        field_value: Numeric code to store.
        feature_vector: Dict of feature index -> value, updated in place.
    """
    if is_null(field_value):
        return
    column_index = feature_vector_labels_integral_dict[field_name]
    feature_vector[column_index] = float(field_value)
  

In [191]:
def set_feature_vector_cat_top_multi_values_integral(
        field_name, values, confidences, feature_vector, top=5):
    """Write the highest-confidence values of a multi-valued field into numbered columns.

    Values are paired with their confidences, ordered by decreasing confidence,
    stripped of -1 entries and cut to the first ``top`` items. The i-th kept
    value (1-based) is stored in the column ``'<field_name>_<i>'`` through
    ``set_feature_vector_cat_value_integral``.

    Args:
        field_name: Column name prefix.
        values: Values of the field for the current row.
        confidences: Confidence of each value, aligned with ``values``.
        feature_vector: Dict of feature index -> value, updated in place.
        top: Maximum number of values to keep.
    """
    by_confidence_desc = sorted(zip(values, confidences), key=lambda pair: -pair[1])
    kept_values = [value for value, _ in by_confidence_desc if value != -1][:top]

    for slot, kept_value in enumerate(kept_values, start=1):
        slot_name = '{}_{}'.format(field_name, slot)
        set_feature_vector_cat_value_integral(slot_name, kept_value, feature_vector)


In [192]:
def get_ad_feature_vector_integral(
        user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
        event_country, event_country_state,
        ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
        geo_location_event, 
        doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
        traffic_source_pv, advertiser_id, publisher_id,
        campaign_id, document_id_event,
        doc_ad_category_ids, doc_ad_cat_confidence_levels, 
        doc_ad_topic_ids, doc_ad_top_confidence_levels,
        doc_ad_entity_ids, doc_ad_ent_confidence_levels,
        doc_event_category_ids, doc_event_cat_confidence_levels,
        doc_event_topic_ids, doc_event_top_confidence_levels,
        doc_event_entity_ids, doc_event_ent_confidence_levels):
    """Build the "integral" sparse feature vector of one (event, ad) pair.

    Same numeric features as ``get_ad_feature_vector``, but positions come from
    ``feature_vector_labels_integral_dict`` and every categorical field is
    stored as a single numeric code in its own column instead of being one-hot
    encoded:

    * country, country/state and geo location are mapped through
      ``event_country_values_counts``, ``event_country_state_values_counts`` and
      ``event_geo_location_values_counts``; values absent from those mappings
      use the entry of ``LESS_SPECIAL_CAT_VALUE``;
    * entity ids are mapped the same way through ``doc_entity_id_values_counts``;
    * multi-valued aspects keep only the highest-confidence values: 3 for
      categories and topics, 6 for entities, in columns ``<field>_1`` ...

    Features whose input is missing are simply not set.

    Returns:
        ``SparseVector`` of size ``len(feature_vector_labels_integral_dict)``
        built from the collected index -> value dict.

    Raises:
        Exception: any error raised while building the vector is re-raised as a
            generic ``Exception`` whose arguments are a message listing all
            input parameters and the original exception.
    """
       
    try:

        # Sparse representation: feature index -> value
        feature_vector = {}
        
        if user_views_count != None:
            feature_vector[feature_vector_labels_integral_dict['user_views']] = float(user_views_count)
         
        if user_doc_ids_viewed != None:
            # 1.0 when document_id is among the documents the user has viewed, else 0.0
            feature_vector[feature_vector_labels_integral_dict['user_has_already_viewed_doc']] = float(document_id in user_doc_ids_viewed)               
          
        # View counts are read from element [1] of the broadcast popularity entries
        if ad_id in ad_id_popularity_broad.value: 
            feature_vector[feature_vector_labels_integral_dict['ad_views']] = float(ad_id_popularity_broad.value[ad_id][1])
        
        if document_id in document_id_popularity_broad.value:
            feature_vector[feature_vector_labels_integral_dict['doc_views']] = float(document_id_popularity_broad.value[document_id][1])            
            
        # Time-derived features are only computed for a non-negative event timestamp
        if timestamp_event > -1:
            dt_timestamp_event = convert_odd_timestamp(timestamp_event)
            if doc_ad_publish_time != None:
                delta_days = (dt_timestamp_event - doc_ad_publish_time).days
                if delta_days >= 0 and delta_days <= 365*10: #10 years
                    feature_vector[feature_vector_labels_integral_dict['doc_ad_days_since_published']] = float(delta_days)
                        
            if doc_event_publish_time != None:
                delta_days = (dt_timestamp_event - doc_event_publish_time).days
                if delta_days >= 0 and delta_days <= 365*10: #10 years
                    feature_vector[feature_vector_labels_integral_dict['doc_event_days_since_published']] = float(delta_days)
                    
            
            #Local period of the day (hours)
            dt_local_timestamp_event = get_local_datetime(dt_timestamp_event, event_country, event_country_state)    
            local_hour_bin = get_hour_bin(dt_local_timestamp_event.hour)            
            feature_vector[feature_vector_labels_integral_dict['doc_event_hour']] = float(local_hour_bin) #Hour for Decision Trees
            set_feature_vector_cat_value_integral(EVENT_HOUR_FV, local_hour_bin, feature_vector) #Period of day for FFM
            
            #Weekend
            weekend = int(is_weekend(dt_local_timestamp_event))
            feature_vector[feature_vector_labels_integral_dict['event_weekend']] = float(weekend)               
                                        
        
        # Suffixes appended to a score's feature name to address its companion columns
        conf_field_suffix = '_conf'
        conf_multiplied_field_suffix = '_conf_multipl'
        
        #Setting Popularity fields
        pop_scores = get_popularity_score(event_country, ad_id, document_id, source_id, 
            publisher_id, advertiser_id, campaign_id, document_id_event,
            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
            doc_ad_topic_ids, doc_ad_top_confidence_levels,
            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
            output_detailed_list=True)
        
                                

        # Each score is used as (feature_name, value, confidence)
        for score in pop_scores:
            feature_vector[feature_vector_labels_integral_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]

        #Setting User-Doc_ad CB Similarity fields
        user_doc_ad_cb_sim_scores = get_user_cb_interest_score(
            user_views_count, user_categories, user_topics, user_entities, 
            timestamp_event, 
            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
            doc_ad_topic_ids, doc_ad_top_confidence_levels,
            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
            output_detailed_list=True)

        for score in user_doc_ad_cb_sim_scores:
            feature_vector[feature_vector_labels_integral_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]
            
        #Setting Doc_event-doc_ad CB Similarity fields
        doc_event_doc_ad_cb_sim_scores = get_doc_event_doc_ad_cb_similarity_score(
            doc_event_category_ids, doc_event_cat_confidence_levels,
            doc_event_topic_ids, doc_event_top_confidence_levels,
            doc_event_entity_ids, doc_event_ent_confidence_levels,
            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
            doc_ad_topic_ids, doc_ad_top_confidence_levels,
            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
            output_detailed_list=True)
        
        for score in doc_event_doc_ad_cb_sim_scores:
            feature_vector[feature_vector_labels_integral_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]
        
        #Process code for event_country
        # (values missing from the mapping share the LESS_SPECIAL_CAT_VALUE code)
        if event_country in event_country_values_counts:
            event_country_code = event_country_values_counts[event_country]
        else:
            event_country_code = event_country_values_counts[LESS_SPECIAL_CAT_VALUE]                        
        set_feature_vector_cat_value_integral(EVENT_COUNTRY_FV, event_country_code, feature_vector)
        
        #Process code for event_country_state
        if event_country_state in event_country_state_values_counts:
            event_country_state_code = event_country_state_values_counts[event_country_state]
        else:
            event_country_state_code = event_country_state_values_counts[LESS_SPECIAL_CAT_VALUE]         
        set_feature_vector_cat_value_integral(EVENT_COUNTRY_STATE_FV, event_country_state_code, feature_vector)
                
        #Process code for geo_location_event (stored further below)
        if geo_location_event in event_geo_location_values_counts:
            geo_location_event_code = event_geo_location_values_counts[geo_location_event]
        else:
            geo_location_event_code = event_geo_location_values_counts[LESS_SPECIAL_CAT_VALUE]
        
        # traffic_source and platform_event are stored decremented by one
        # (presumably to make the codes zero-based - TODO confirm)
        if traffic_source_pv != None:
            feature_vector[feature_vector_labels_integral_dict[TRAFFIC_SOURCE_FV]] = int(traffic_source_pv - 1)
        if platform_event != None:
            feature_vector[feature_vector_labels_integral_dict[EVENT_PLATFORM_FV]] = int(platform_event - 1)
        
        set_feature_vector_cat_value_integral(EVENT_GEO_LOCATION_FV, geo_location_event_code, feature_vector)   
         
        # Single-valued ids are stored as they are
        # set_feature_vector_cat_value_integral(TRAFFIC_SOURCE_FV, traffic_source_pv - 1, feature_vector)        
        # set_feature_vector_cat_value_integral(EVENT_PLATFORM_FV, platform_event - 1, feature_vector)
        set_feature_vector_cat_value_integral(AD_ADVERTISER_FV, advertiser_id, feature_vector)
        set_feature_vector_cat_value_integral(DOC_AD_SOURCE_ID_FV, source_id, feature_vector)
        set_feature_vector_cat_value_integral(DOC_AD_PUBLISHER_ID_FV, publisher_id, feature_vector)
        set_feature_vector_cat_value_integral(DOC_EVENT_SOURCE_ID_FV, doc_event_source_id, feature_vector)
        set_feature_vector_cat_value_integral(DOC_EVENT_PUBLISHER_ID_FV, doc_event_publisher_id, feature_vector)
                
        # Categories and topics: keep the 3 highest-confidence values per document
        set_feature_vector_cat_top_multi_values_integral(DOC_AD_CATEGORY_ID_FV, doc_ad_category_ids, doc_ad_cat_confidence_levels, feature_vector, top=3)
        set_feature_vector_cat_top_multi_values_integral(DOC_AD_TOPIC_ID_FV, doc_ad_topic_ids, doc_ad_top_confidence_levels, feature_vector, top=3)
        
        set_feature_vector_cat_top_multi_values_integral(DOC_EVENT_CATEGORY_ID_FV, doc_event_category_ids, doc_event_cat_confidence_levels, feature_vector, top=3)
        set_feature_vector_cat_top_multi_values_integral(DOC_EVENT_TOPIC_ID_FV, doc_event_topic_ids, doc_event_top_confidence_levels, feature_vector, top=3)                           
        
        #Process codes for doc_ad_entity_ids
        # (entities are mapped to codes first, then the 6 highest-confidence ones are kept)
        doc_ad_entity_ids_codes = [doc_entity_id_values_counts[x] 
            if x in doc_entity_id_values_counts 
            else doc_entity_id_values_counts[LESS_SPECIAL_CAT_VALUE] 
            for x in doc_ad_entity_ids]
        set_feature_vector_cat_top_multi_values_integral(DOC_AD_ENTITY_ID_FV, doc_ad_entity_ids_codes, doc_ad_ent_confidence_levels, feature_vector, top=6)
        
        
        #Process codes for doc_event_entity_ids
        doc_event_entity_ids_codes = [doc_entity_id_values_counts[x] 
            if x in doc_entity_id_values_counts 
            else doc_entity_id_values_counts[LESS_SPECIAL_CAT_VALUE] 
            for x in doc_event_entity_ids]
        set_feature_vector_cat_top_multi_values_integral(DOC_EVENT_ENTITY_ID_FV, doc_event_entity_ids_codes, doc_event_ent_confidence_levels, feature_vector, top=6)
        
        #Creating a dummy column as the last column because xgboost has a problem if the last column is undefined for all rows, 
        #saying that the dimensions of data and feature_names do not match
        #feature_vector[feature_vector_labels_dict[DUMMY_FEATURE_COLUMN]] = float(0)
            
        #Ensuring that all elements are floats for compatibility with UDF output (ArrayType(FloatType()))
        #feature_vector = list([float(x) for x in feature_vector])
      
    except Exception as e:
        # Re-raise with every input attached so the failing row can be identified
        raise Exception("[get_ad_feature_vector_integral] ERROR PROCESSING FEATURE VECTOR! Params: {}" \
           .format([user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
             event_country, event_country_state,
             ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
             geo_location_event, 
             doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
             traffic_source_pv, advertiser_id, publisher_id,
             campaign_id, document_id_event,
             doc_ad_category_ids, doc_ad_cat_confidence_levels, 
             doc_ad_topic_ids, doc_ad_top_confidence_levels,
             doc_ad_entity_ids, doc_ad_ent_confidence_levels,
             doc_event_category_ids, doc_event_cat_confidence_levels,
             doc_event_topic_ids, doc_event_top_confidence_levels,
             doc_event_entity_ids, doc_event_ent_confidence_levels]),
         e)
  
    return SparseVector(len(feature_vector_labels_integral_dict), feature_vector)


In [193]:
# Spark UDF around get_ad_feature_vector_integral. The columns passed to the
# UDF are forwarded positionally, so they must be given in the exact parameter
# order of get_ad_feature_vector_integral (34 arguments). The result is an ML
# vector column.
get_ad_feature_vector_integral_udf = F.udf(
    lambda *column_values: get_ad_feature_vector_integral(*column_values),
    VectorUDT())

## Export Train set feature vectors

In [194]:
# Enrich every training row with:
#   1. the category/topic/entity lists of the promoted document (document_id_promo),
#   2. the same lists for the document referenced by document_id_event, renamed with a
#      doc_event_ prefix so both sets can coexist,
#   3. the profile of the user that generated the event.
# Every join is a left join, so training rows are kept even when a lookup has no match;
# the select replaces the resulting null lists with [] and null ints with -1.
train_set_enriched_df = (
    train_set_df
    # --- 1. promoted document metadata ---
    .join(
        documents_categories_grouped_df,
        on=F.col("document_id_promo") == F.col("documents_categories_grouped.document_id_cat"),
        how='left')
    .join(
        documents_topics_grouped_df,
        on=F.col("document_id_promo") == F.col("documents_topics_grouped.document_id_top"),
        how='left')
    .join(
        documents_entities_grouped_df,
        on=F.col("document_id_promo") == F.col("documents_entities_grouped.document_id_ent"),
        how='left')
    # --- 2. event document metadata (same lookup tables, re-aliased and renamed) ---
    .join(
        documents_categories_grouped_df
            .withColumnRenamed('category_id_list', 'doc_event_category_id_list')
            .withColumnRenamed('confidence_level_cat_list', 'doc_event_confidence_level_cat_list')
            .alias('documents_event_categories_grouped'),
        on=F.col("document_id_event") == F.col("documents_event_categories_grouped.document_id_cat"),
        how='left')
    .join(
        documents_topics_grouped_df
            .withColumnRenamed('topic_id_list', 'doc_event_topic_id_list')
            .withColumnRenamed('confidence_level_top_list', 'doc_event_confidence_level_top_list')
            .alias('documents_event_topics_grouped'),
        on=F.col("document_id_event") == F.col("documents_event_topics_grouped.document_id_top"),
        how='left')
    .join(
        documents_entities_grouped_df
            .withColumnRenamed('entity_id_list', 'doc_event_entity_id_list')
            .withColumnRenamed('confidence_level_ent_list', 'doc_event_confidence_level_ent_list')
            .alias('documents_event_entities_grouped'),
        on=F.col("document_id_event") == F.col("documents_event_entities_grouped.document_id_ent"),
        how='left')
    # --- keep only the needed columns and neutralise nulls ---
    .select(
        # columns passed through untouched
        'display_id', 'uuid_event', 'event_country', 'event_country_state', 'platform_event',
        'source_id_doc_event', 'publisher_doc_event', 'publish_time_doc_event',
        'publish_time', 'ad_id', 'document_id_promo', 'clicked',
        'geo_location_event', 'advertiser_id', 'publisher_id',
        'campaign_id', 'document_id_event',
        'traffic_source_pv',
        # event document lists: null -> []
        int_list_null_to_empty_list_udf('doc_event_category_id_list')
            .alias('doc_event_category_id_list'),
        float_list_null_to_empty_list_udf('doc_event_confidence_level_cat_list')
            .alias('doc_event_confidence_level_cat_list'),
        int_list_null_to_empty_list_udf('doc_event_topic_id_list')
            .alias('doc_event_topic_id_list'),
        float_list_null_to_empty_list_udf('doc_event_confidence_level_top_list')
            .alias('doc_event_confidence_level_top_list'),
        str_list_null_to_empty_list_udf('doc_event_entity_id_list')
            .alias('doc_event_entity_id_list'),
        float_list_null_to_empty_list_udf('doc_event_confidence_level_ent_list')
            .alias('doc_event_confidence_level_ent_list'),
        # scalar ints: null -> -1
        int_null_to_minus_one_udf('source_id').alias('source_id'),
        int_null_to_minus_one_udf('timestamp_event').alias('timestamp_event'),
        # promoted document lists: null -> []
        int_list_null_to_empty_list_udf('category_id_list').alias('category_id_list'),
        float_list_null_to_empty_list_udf('confidence_level_cat_list')
            .alias('confidence_level_cat_list'),
        int_list_null_to_empty_list_udf('topic_id_list').alias('topic_id_list'),
        float_list_null_to_empty_list_udf('confidence_level_top_list')
            .alias('confidence_level_top_list'),
        str_list_null_to_empty_list_udf('entity_id_list').alias('entity_id_list'),
        float_list_null_to_empty_list_udf('confidence_level_ent_list')
            .alias('confidence_level_ent_list'))
    # --- 3. user profile, with its columns renamed to the user_* names the UDF call uses ---
    .join(user_profiles_df, on=[F.col("user_profiles.uuid") == F.col("uuid_event")], how='left')
    .withColumnRenamed('categories', 'user_categories')
    .withColumnRenamed('topics', 'user_topics')
    .withColumnRenamed('entities', 'user_entities')
    .withColumnRenamed('doc_ids', 'user_doc_ids_viewed')
    .withColumnRenamed('views', 'user_views_count')
)


In [222]:
# Sanity check: fetch the first two enriched training rows and display them.
train_set_enriched_df.take(2)

[Row(display_id=8264198, uuid_event='10015959b777b6', event_country='US', event_country_state='US', platform_event=2, source_id_doc_event=482, publisher_doc_event=65, publish_time_doc_event=datetime.datetime(2014, 8, 5, 16, 0), publish_time=datetime.datetime(2014, 1, 28, 16, 0), ad_id=123742, document_id_promo=1148731, clicked=0, geo_location_event='US', advertiser_id=571, publisher_id=523, campaign_id=15889, document_id_event=786004, traffic_source_pv=1, doc_event_category_id_list=[1403, 1402], doc_event_confidence_level_cat_list=[0.8077853918075562, 0.061461932957172394], doc_event_topic_id_list=[82], doc_event_confidence_level_top_list=[0.25338107347488403], doc_event_entity_id_list=[], doc_event_confidence_level_ent_list=[], source_id=478, timestamp_event=566415049, category_id_list=[1504, 1503], confidence_level_cat_list=[0.8669483661651611, 0.06596346199512482], topic_id_list=[285], confidence_level_top_list=[0.16625766456127167], entity_id_list=['a9dc59dee4759f637b1cfe64e551c9a3

In [223]:
# Compute the feature vector for every enriched training row, then keep only the
# identifying columns, the label and the vector.
# The column names handed to the UDF are positional, so their order must stay aligned
# with the parameter order of get_ad_feature_vector_integral_udf.
train_set_feature_vectors_df = (
    train_set_enriched_df
    .withColumn(
        'feature_vector',
        get_ad_feature_vector_integral_udf(
            # user profile
            'user_doc_ids_viewed', 'user_views_count',
            'user_categories', 'user_topics', 'user_entities',
            # event location
            'event_country', 'event_country_state',
            # ad / promoted document and event context
            'ad_id', 'document_id_promo', 'source_id', 'publish_time',
            'timestamp_event', 'platform_event', 'geo_location_event',
            # event document metadata
            'source_id_doc_event', 'publisher_doc_event', 'publish_time_doc_event',
            'traffic_source_pv',
            'advertiser_id', 'publisher_id', 'campaign_id', 'document_id_event',
            # promoted document categories / topics / entities
            'category_id_list', 'confidence_level_cat_list',
            'topic_id_list', 'confidence_level_top_list',
            'entity_id_list', 'confidence_level_ent_list',
            # event document categories / topics / entities
            'doc_event_category_id_list', 'doc_event_confidence_level_cat_list',
            'doc_event_topic_id_list', 'doc_event_confidence_level_top_list',
            'doc_event_entity_id_list', 'doc_event_confidence_level_ent_list'))
    .select(
        F.col('uuid_event').alias('uuid'),
        'display_id',
        'ad_id',
        'document_id_event',
        F.col('document_id_promo').alias('document_id'),
        F.col('clicked').alias('label'),
        'feature_vector')
)


In [224]:
# Sanity check: fetch the first two training rows with their computed feature vectors.
train_set_feature_vectors_df.take(2)

[Row(uuid='10015959b777b6', display_id=8264198, ad_id=130952, document_id_event=786004, document_id=1286844, label=1, feature_vector=SparseVector(103, {0: 0.0, 3: 91964.0, 4: 110688.0, 5: 685.0, 6: 5.0, 8: 0.2624, 9: 0.6881, 10: 0.1806, 11: 0.2484, 12: 0.567, 13: 0.1409, 14: 0.2363, 15: 0.5443, 16: 0.1286, 17: 0.2363, 18: 0.5443, 19: 0.1286, 20: 0.244, 21: 0.555, 22: 0.1354, 26: 0.2363, 27: 0.5443, 28: 0.1286, 29: 0.2363, 30: 0.5443, 31: 0.1286, 32: 0.2484, 33: 0.0514, 34: 0.0128, 35: 0.2484, 36: 0.0514, 37: 0.0128, 38: 0.1903, 39: 0.0046, 40: 0.0009, 41: 0.192, 42: 0.0042, 43: 0.0008, 44: 0.1693, 45: 0.146, 46: 0.0247, 47: 0.164, 48: 0.1311, 49: 0.0215, 59: 0.0, 60: 0.0004, 61: 0.0, 62: 0.0, 63: 0.0001, 64: 0.0, 68: 131.0, 69: 1505.0, 70: 1515.0, 72: 143.0, 73: 131.0, 74: 138.0, 81: 1346.0, 82: 1223.0, 83: 1403.0, 84: 1402.0, 86: 82.0, 95: 65.0, 96: 482.0, 97: 18595447.0, 98: 758487.0, 99: 758487.0, 100: 5.0, 101: 1.0, 102: 0.0})),
 Row(uuid='10015959b777b6', display_id=8264198, ad_id

In [195]:
# Folder name (appended to OUTPUT_BUCKET_FOLDER when writing) for the training feature
# vectors; evaluation runs get their own "_eval" folder.
train_feature_vector_gcs_folder_name = (
    'train_feature_vectors_integral_eval' if evaluation
    else 'train_feature_vectors_integral'
)


In [196]:
# Display the base output path that the parquet exports below are written under.
OUTPUT_BUCKET_FOLDER

'hdfs:/user/lzhao/data/outbrain/preprocessed/'

In [197]:
# Persist the training feature vectors as parquet, replacing any previous export
# at the same location.
train_set_feature_vectors_df.write \
    .mode('overwrite') \
    .parquet(OUTPUT_BUCKET_FOLDER + train_feature_vector_gcs_folder_name)


NameError: name 'train_set_feature_vectors_df' is not defined

In [198]:
def is_leak(max_timestamp_pv_leak, timestamp_event):
    """Return True when max_timestamp_pv_leak is non-negative and not earlier than the event.

    A negative max_timestamp_pv_leak never counts as a leak (the export pipeline maps a
    null max_timestamp_pv to -1 before calling this).
    """
    # Negative value: nothing to compare against, so not a leak.
    if max_timestamp_pv_leak < 0:
        return False
    return max_timestamp_pv_leak >= timestamp_event


In [199]:
# Spark UDF form of is_leak; the boolean result is cast to int so the column holds 1 or 0.
is_leak_udf = F.udf(
    lambda pv_leak_ts, event_ts: int(is_leak(pv_leak_ts, event_ts)),
    IntegerType())


In [200]:
# Source rows for the second export: the validation set in evaluation mode,
# otherwise the test set.
data_df = validation_set_df if evaluation else test_set_df

In [201]:
# Prepare the validation/test rows for feature extraction: keep the needed columns,
# replace null lists with [] and null ints with -1, then attach the user profile.
# The profile join is a left join, so rows of users without a profile are kept.
test_validation_set_enriched_df = (
    data_df
    .select(
        # columns passed through untouched
        'display_id', 'uuid_event', 'event_country', 'event_country_state', 'platform_event',
        'source_id_doc_event', 'publisher_doc_event', 'publish_time_doc_event',
        'publish_time',
        'ad_id', 'document_id_promo', 'clicked',
        'geo_location_event', 'advertiser_id', 'publisher_id',
        'campaign_id', 'document_id_event',
        'traffic_source_pv',
        # event document lists: null -> []
        int_list_null_to_empty_list_udf('doc_event_category_id_list')
            .alias('doc_event_category_id_list'),
        float_list_null_to_empty_list_udf('doc_event_confidence_level_cat_list')
            .alias('doc_event_confidence_level_cat_list'),
        int_list_null_to_empty_list_udf('doc_event_topic_id_list')
            .alias('doc_event_topic_id_list'),
        float_list_null_to_empty_list_udf('doc_event_confidence_level_top_list')
            .alias('doc_event_confidence_level_top_list'),
        str_list_null_to_empty_list_udf('doc_event_entity_id_list')
            .alias('doc_event_entity_id_list'),
        float_list_null_to_empty_list_udf('doc_event_confidence_level_ent_list')
            .alias('doc_event_confidence_level_ent_list'),
        # scalar ints: null -> -1
        int_null_to_minus_one_udf('source_id').alias('source_id'),
        int_null_to_minus_one_udf('timestamp_event').alias('timestamp_event'),
        # promoted document lists: null -> []
        int_list_null_to_empty_list_udf('category_id_list').alias('category_id_list'),
        float_list_null_to_empty_list_udf('confidence_level_cat_list')
            .alias('confidence_level_cat_list'),
        int_list_null_to_empty_list_udf('topic_id_list').alias('topic_id_list'),
        float_list_null_to_empty_list_udf('confidence_level_top_list')
            .alias('confidence_level_top_list'),
        str_list_null_to_empty_list_udf('entity_id_list').alias('entity_id_list'),
        float_list_null_to_empty_list_udf('confidence_level_ent_list')
            .alias('confidence_level_ent_list'),
        # input of is_leak_udf: null max_timestamp_pv becomes -1
        int_null_to_minus_one_udf('max_timestamp_pv').alias('max_timestamp_pv_leak'))
    # user profile, with its columns renamed to the user_* names the UDF call uses
    .join(user_profiles_df, on=[F.col("user_profiles.uuid") == F.col("uuid_event")], how='left')
    .withColumnRenamed('categories', 'user_categories')
    .withColumnRenamed('topics', 'user_topics')
    .withColumnRenamed('entities', 'user_entities')
    .withColumnRenamed('doc_ids', 'user_doc_ids_viewed')
    .withColumnRenamed('views', 'user_views_count')
)


In [202]:
# Compute the feature vector for every enriched validation/test row, then keep only the
# identifying columns, the label, the leak flag and the vector.
# The column names handed to the UDF are positional, so their order must stay aligned
# with the parameter order of get_ad_feature_vector_integral_udf.
test_validation_set_feature_vectors_df = (
    test_validation_set_enriched_df
    .withColumn(
        'feature_vector',
        get_ad_feature_vector_integral_udf(
            # user profile
            'user_doc_ids_viewed', 'user_views_count',
            'user_categories', 'user_topics', 'user_entities',
            # event location
            'event_country', 'event_country_state',
            # ad / promoted document and event context
            'ad_id', 'document_id_promo', 'source_id', 'publish_time',
            'timestamp_event', 'platform_event', 'geo_location_event',
            # event document metadata
            'source_id_doc_event', 'publisher_doc_event', 'publish_time_doc_event',
            'traffic_source_pv',
            'advertiser_id', 'publisher_id', 'campaign_id', 'document_id_event',
            # promoted document categories / topics / entities
            'category_id_list', 'confidence_level_cat_list',
            'topic_id_list', 'confidence_level_top_list',
            'entity_id_list', 'confidence_level_ent_list',
            # event document categories / topics / entities
            'doc_event_category_id_list', 'doc_event_confidence_level_cat_list',
            'doc_event_topic_id_list', 'doc_event_confidence_level_top_list',
            'doc_event_entity_id_list', 'doc_event_confidence_level_ent_list'))
    .select(
        # Take the uuid from the event row, exactly as the train export does.
        # The plain 'uuid' column comes from the LEFT join with user_profiles and is
        # null for every event whose user has no profile, which would silently drop
        # the user id from those exported rows.
        F.col('uuid_event').alias('uuid'),
        'display_id',
        'ad_id',
        'document_id_event',
        F.col('document_id_promo').alias('document_id'),
        F.col('clicked').alias('label'),
        # 1 when max_timestamp_pv_leak is non-negative and >= timestamp_event, else 0
        is_leak_udf('max_timestamp_pv_leak', 'timestamp_event').alias('is_leak'),
        'feature_vector')
)

In [203]:
# Folder name (appended to OUTPUT_BUCKET_FOLDER when writing) for the second export:
# validation vectors in evaluation mode, test vectors otherwise.
test_validation_feature_vector_gcs_folder_name = (
    'validation_feature_vectors_integral' if evaluation
    else 'test_feature_vectors_integral'
)


In [204]:
# Persist the validation/test feature vectors as parquet, replacing any previous export
# at the same location.
test_validation_set_feature_vectors_df.write \
    .mode('overwrite') \
    .parquet(OUTPUT_BUCKET_FOLDER + test_validation_feature_vector_gcs_folder_name)


In [205]:
# Both exports are written; stop the Spark session created at the top of the notebook.
spark.stop()