In [2]:
from __future__ import print_function
import argparse
import os
import sys
import math
import logging
import numpy as np
import codecs
import json
import glob
import pandas as pd
from collections import defaultdict
from io import StringIO
from operator import add
import boto3
from subprocess import check_output
from datetime import datetime
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
from tensorflow.keras import Model, Sequential
from tensorflow.keras import Input
from tensorflow.keras import layers
from tensorflow.keras.layers import DenseFeatures
from tensorflow.keras.layers import Reshape, Flatten, Concatenate, RepeatVector, Add, Subtract, Multiply, Dot, PReLU, Softmax, Activation
from tensorflow.keras.layers import Dense, Lambda, Embedding, LocallyConnected1D, Permute, Dropout
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPool1D, LSTM, GRU
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
from tensorflow.keras.losses import MeanAbsoluteError, MeanSquaredError
from tensorflow.keras.losses import MeanSquaredLogarithmicError, MeanAbsolutePercentageError, BinaryCrossentropy
from tensorflow.keras.metrics import MAE, MSE, MSLE, MAPE


2022-07-21 14:54:04.224279: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-07-21 14:54:04.224354: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-172-31-21-248): /proc/driver/nvidia/version does not exist


In [3]:
# data pipeline function
def get_csv_dataset(dataset_path, shuffle=False, batch_size=512, drop_remainder=False, processes_per_host=-1, repeat=True):
    def preprocess_fn(dataset):
        feature_dict = {col: dataset[col] for col in FEATURE_COLS}
        dataset['label'] = tf.cast(tf.greater(dataset['Bidding_price'], dataset['Paying_price']), dtype=tf.int64)
        label_dict = {'ground_truth': tf.stack([dataset['label'], dataset['Bidding_price'], dataset['Paying_price']], axis=1)}
        return (feature_dict, label_dict)
    
    all_files = glob.glob(dataset_path)
    df_from_each_file = []
    for f in all_files:
        print(f)
        df = pd.read_csv(f, names=list(COLS.keys()), dtype=COLS, header=None, sep='\t', index_col=None)
        for col in df:
            if df[col].dtype=="object":
                df[col].fillna('',inplace=True)
            else:
                df[col].fillna(0,inplace=True)
        df_from_each_file.append(df)
    dataset_df = pd.concat(df_from_each_file, ignore_index=True)
    dataset_df = dataset_df[FEATURE_COLS + LABEL_COLS]
    dataset_df.info()
    dataset = tf.data.Dataset.from_tensor_slices(dict(dataset_df))
    
#     dataset = dataset.cache()
    if shuffle:
        dataset = dataset.shuffle(batch_size, reshuffle_each_iteration=True)
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    dataset = dataset.map(preprocess_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    
    if repeat:
        dataset = dataset.repeat()
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset

In [4]:
# hyper parameters
# dataset: https://contest.ipinyou.com
COLS = {
    'Bid_ID': 'object', 
    'Timestamp': 'int', 
    'Log_type': 'int', 
    'iPinYou_ID': 'object', 
    'User-Agent': 'object',
    'IP': 'object', 
    'Region': 'int', 
    'City': 'int', 
    'Ad_exchange': 'object', 
    'Domain': 'object',
    'URL': 'object', 
    'Anonymous_URL_ID': 'float', 
    'Ad_slot_ID': 'object', 
    'Ad_slot_width': 'int', 
    'Ad_slot_height': 'int',
    'Ad_slot_visibility': 'object', 
    'Ad_slot_format': 'object', 
    'Ad_slot_floor_price': 'int', 
    'Creative_ID': 'object',
    'Bidding_price': 'int', 
    'Paying_price': 'int', 
    'Key_page_URL': 'object', 
    'Advertiser_ID': 'int', 
    'User_Tags': 'object'
}

FEATURE_COLS = ['Timestamp', 'iPinYou_ID', 'User-Agent',
'IP', 'Region', 'City', 'Ad_exchange', 'Domain',
'URL', 'Ad_slot_width', 'Ad_slot_height',
'Ad_slot_visibility', 'Ad_slot_format', 'Ad_slot_floor_price', 'Creative_ID']

LABEL_COLS = ['Bidding_price', 'Paying_price']

args = {
    'train': "/home/ubuntu/Jampp/data/training2nd/*.txt.bz2",
    'validation': "/home/ubuntu/Jampp/data/validation2nd/*.txt.bz2",
    'test': "/home/ubuntu/Jampp/data/testing2nd/leaderboard.test.data.20130613_15.txt.bz2"
}

epochs = 10
batch_size = 1024
process_per_host=8
learning_rate = 0.001
l2_reg = 0.001
B_START = 0
B_LIMIT = 300
B_DELTA = 1
MAX_IDX = int((B_LIMIT - B_START) / B_DELTA - 1)

In [6]:
# Prepare training/validation/test datasets
training_dataset = get_csv_dataset(args['train'], 
                                   shuffle=True, 
                                   batch_size=batch_size,
                                   drop_remainder=False, 
                                   processes_per_host=process_per_host,
                                   repeat=True)

/home/ubuntu/Jampp/data/training2nd/imp.20130610.txt.bz2
/home/ubuntu/Jampp/data/training2nd/imp.20130608.txt.bz2
/home/ubuntu/Jampp/data/training2nd/imp.20130609.txt.bz2
/home/ubuntu/Jampp/data/training2nd/imp.20130606.txt.bz2
/home/ubuntu/Jampp/data/training2nd/imp.20130607.txt.bz2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8834027 entries, 0 to 8834026
Data columns (total 17 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   Timestamp            int64 
 1   iPinYou_ID           object
 2   User-Agent           object
 3   IP                   object
 4   Region               int64 
 5   City                 int64 
 6   Ad_exchange          object
 7   Domain               object
 8   URL                  object
 9   Ad_slot_width        int64 
 10  Ad_slot_height       int64 
 11  Ad_slot_visibility   object
 12  Ad_slot_format       object
 13  Ad_slot_floor_price  int64 
 14  Creative_ID          object
 15  Bidding_price        int64 
 16  Pay

2022-07-21 14:02:29.081382: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
validation_dataset = get_csv_dataset(args['validation'], 
                                     shuffle=False, 
                                     batch_size=batch_size,
                                     drop_remainder=False, 
                                     processes_per_host=-1,
                                     repeat=True)

/home/ubuntu/Jampp/data/validation2nd/imp.20130612.txt.bz2
/home/ubuntu/Jampp/data/validation2nd/imp.20130611.txt.bz2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3403060 entries, 0 to 3403059
Data columns (total 17 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   Timestamp            int64 
 1   iPinYou_ID           object
 2   User-Agent           object
 3   IP                   object
 4   Region               int64 
 5   City                 int64 
 6   Ad_exchange          object
 7   Domain               object
 8   URL                  object
 9   Ad_slot_width        int64 
 10  Ad_slot_height       int64 
 11  Ad_slot_visibility   object
 12  Ad_slot_format       object
 13  Ad_slot_floor_price  int64 
 14  Creative_ID          object
 15  Bidding_price        int64 
 16  Paying_price         int64 
dtypes: int64(8), object(9)
memory usage: 441.4+ MB


In [8]:
training_row_num = 10579749  # 3047704
validation_row_num = 2521630  # 110467
steps_per_epoch = math.ceil(training_row_num / batch_size)
validation_steps = math.ceil(validation_row_num / batch_size)
print("Training dataset samples = " + str(training_row_num) + ", steps = " + str(steps_per_epoch))
print("Validation dataset samples = " + str(validation_row_num) + ", steps = " + str(validation_steps))

Training dataset samples = 10579749, steps = 10332
Validation dataset samples = 2521630, steps = 2463


In [11]:
def neighbourhood_likelihood_loss(y_true, y_pred):
    # y_true = (label, bidding_price(b), winning_price(z))
    # y_pred = (price_step)
    y_pred = ops.convert_to_tensor(y_pred)        # (None_all, price_step)
    y_true = math_ops.cast(y_true, y_pred.dtype)  # (None_all, 3)

    # arg
    global B_START
    global B_LIMIT
    global B_DELTA
    price_step = tf.cast(tf.shape(y_pred)[-1], tf.int32)

    # split y_true
    y_true_label_1d = K.flatten(tf.slice(y_true, [0,0], [-1,1]))  # (None_all,)
    # caculate the bidding price bucket index
    y_true_b = tf.slice(y_true, [0,1], [-1,1])  # (None_all, 1)
    y_true_b = tf.clip_by_value(y_true_b, B_START, B_LIMIT)
    y_true_b_idx_2d = tf.cast(tf.floor((y_true_b - B_START) / B_DELTA), dtype='int32')  # (None_all, 1)
    y_true_b_idx_1d = K.flatten(y_true_b_idx_2d)  # (None_all,)
    # caculate the winning price bucket index
    y_true_z = tf.slice(y_true, [0,2], [-1,1])  # (None_all, 1)
    y_true_z = tf.clip_by_value(y_true_z, B_START, B_LIMIT)
    y_true_z_idx_2d = tf.cast(tf.floor((y_true_z - B_START) / B_DELTA), dtype='int32')  # (None_all, 1)
    y_true_z_idx_1d = K.flatten(y_true_z_idx_2d)  # (None_all,)

    # Calculate masks
    ## on All bids
    mask_win = y_true_label_1d  # (None,)
    mask_lose = 1 - mask_win  # (None,)

    mask_z_cdf = tf.sequence_mask(
                    y_true_z_idx_1d + 1, 
                    price_step)  # (None, price_step)
    mask_z_pdf = tf.math.logical_xor(
                    mask_z_cdf, 
                    tf.sequence_mask(
                        y_true_z_idx_1d,
                        price_step))  # (None, price_step)

    mask_b_cdf = tf.sequence_mask(
                    y_true_b_idx_1d + 1, 
                    price_step)  # (None, price_step)
    mask_b_pdf = tf.math.logical_xor(
                    mask_b_cdf, 
                    tf.sequence_mask(
                        y_true_b_idx_1d, 
                        price_step))  # (None, price_step)
    ## on Winning bids
    mask_win_z_cdf = tf.boolean_mask(mask_z_cdf, mask_win)  # (None_win, price_step)
    mask_win_z_pdf = tf.boolean_mask(mask_z_pdf, mask_win)  # (None_win, price_step)
    mask_win_b_cdf = tf.boolean_mask(mask_b_cdf, mask_win)  # (None_win, price_step)
    mask_win_b_pdf = tf.boolean_mask(mask_b_pdf, mask_win)  # (None_win, price_step)
    ## on Losing bids
    mask_lose_b_cdf = tf.boolean_mask(mask_z_cdf, mask_lose)  # (None_lose, price_step)
    mask_lose_b_pdf = tf.boolean_mask(mask_z_pdf, mask_lose)  # (None_lose, price_step)

    # Price Distribution
    y_pred_win = tf.boolean_mask(y_pred, mask_win)  # (None_win, price_step)
    y_pred_lose = tf.boolean_mask(y_pred, mask_lose)  # (None_lose, price_step)

    # Loss
    zeros = tf.zeros(tf.shape(y_pred), tf.float32)  # (None, price_step)
    zeros_win = tf.zeros(tf.shape(y_pred_win), tf.float32)  # (None_win, price_step)
    zeros_lose = tf.zeros(tf.shape(y_pred_lose), tf.float32)  # (None_lose, price_step)
    ones = tf.ones(tf.shape(y_pred), tf.float32)  # (None, price_step)
    ones_win = tf.ones(tf.shape(y_pred_win), tf.float32)  # (None_win, price_step)
    ones_lose = tf.ones(tf.shape(y_pred_lose), tf.float32)  # (None_lose, price_step)

    # loss_1
    loss_1 = - K.sum(
                tf.math.log(tf.clip_by_value(
                    tf.boolean_mask(
                        y_pred_win,
                        mask_win_z_pdf),
                    K.epsilon(),
                    1.)))

    # loss_2_win
    left_neighbourhood_offset = y_true_b_idx_1d - y_true_z_idx_1d
    left_neighbourhood_idx = tf.math.maximum(y_true_z_idx_1d - left_neighbourhood_offset, 0)
    mask_z_neighbourhood_cdf = tf.math.logical_xor(
                                    mask_b_cdf, 
                                    tf.sequence_mask(
                                        left_neighbourhood_idx,
                                        price_step))
    mask_win_z_neighbourhood_cdf = tf.boolean_mask(mask_z_neighbourhood_cdf, mask_win)
    loss_2_win = - K.sum(
                    tf.math.log(tf.clip_by_value(
                        K.sum(
                            tf.where(
                                mask_win_z_neighbourhood_cdf, 
                                y_pred_win, 
                                zeros_win),
                            axis=1),
                        K.epsilon(),
                        1.)))

    # loss_2_lose
    right_neighbourhood_offset = 40
    right_neighbourhood_idx = tf.math.minimum(y_true_b_idx_1d + right_neighbourhood_offset, price_step - 1)
    mask_b_neighbourhood_cdf = tf.math.logical_xor(
                                    tf.math.logical_not(mask_b_cdf), 
                                    tf.math.logical_not(
                                        tf.sequence_mask(right_neighbourhood_idx, price_step)))
    mask_lose_b_neighbourhood_cdf = tf.boolean_mask(mask_b_neighbourhood_cdf, mask_lose)
    loss_2_lose = - K.sum(
                    tf.math.log(tf.clip_by_value(
                        K.sum(
                            tf.where(
                                mask_lose_b_neighbourhood_cdf, 
                                y_pred_lose, 
                                zeros_lose),
                            axis=1),
                        K.epsilon(),
                        1.)))
    # loss_2
    beta = 0.2
    loss_2 = beta * loss_2_win + (1 - beta) * loss_2_lose

    # total loss
    alpha = 0.5
    return alpha * loss_1 + (1 - alpha) * loss_2

In [9]:
### Custom Model ###
# have to define the normalizers separately, or error raises
def normalizer_Ad_slot_width(x):
    return x / 1000
def normalizer_Ad_slot_height(x):
    return x / 600

def get_model(learning_rate, l2_reg, b_start, b_limit, b_delta):
    price_bucket_num = int(math.floor((b_limit - b_start + K.epsilon()) / b_delta))
    ### Input Layers ###
    inputs_dict = {}
    inputs_dict['Timestamp'] = Input(shape=(1,), name='Timestamp', dtype='int32')
    inputs_dict['iPinYou_ID'] = Input(shape=(1,), name='iPinYou_ID', dtype='string')
    inputs_dict['User-Agent'] = Input(shape=(1,), name='User-Agent', dtype='string')
    inputs_dict['IP'] = Input(shape=(1,), name='IP', dtype='string')
    inputs_dict['Region'] = Input(shape=(1,), name='Region', dtype='int32')
    inputs_dict['City'] = Input(shape=(1,), name='City', dtype='int32')
    inputs_dict['Ad_exchange'] = Input(shape=(1,), name='Ad_exchange', dtype='string')
    inputs_dict['Domain'] = Input(shape=(1,), name='Domain', dtype='string')
    inputs_dict['URL'] = Input(shape=(1,), name='URL', dtype='string')
    inputs_dict['Ad_slot_width'] = Input(shape=(1,), name='Ad_slot_width', dtype='int32')
    inputs_dict['Ad_slot_height'] = Input(shape=(1,), name='Ad_slot_height', dtype='int32')
    inputs_dict['Ad_slot_visibility'] = Input(shape=(1,), name='Ad_slot_visibility', dtype='string')
    inputs_dict['Ad_slot_format'] = Input(shape=(1,), name='Ad_slot_format', dtype='string')
    inputs_dict['Ad_slot_floor_price'] = Input(shape=(1,), name='Ad_slot_floor_price', dtype='int32')
    inputs_dict['Creative_ID'] = Input(shape=(1,), name='Creative_ID', dtype='string')
    
    ### Feature Column Layer ###
    feature_columns = []
    for col in ['iPinYou_ID', 'User-Agent', 'IP', 'Domain', 'URL', 'Ad_exchange', 'Ad_slot_format', 'Ad_slot_visibility', 'Creative_ID']:
        feature_columns.append(
            feature_column.embedding_column(
                feature_column.categorical_column_with_hash_bucket(
                    col, 
                    hash_bucket_size=16), 
                dimension=4
            )
        )
    for col in ['Timestamp', 'Region', 'City']:
        feature_columns.append(
            feature_column.embedding_column(
                feature_column.categorical_column_with_hash_bucket(
                    col, 
                    hash_bucket_size=30,
                    dtype=tf.int32), 
                dimension=4
            )
        )
    feature_columns.append(
        feature_column.numeric_column(
            'Ad_slot_width', 
            normalizer_fn=normalizer_Ad_slot_width
        )
    )
    feature_columns.append(
        feature_column.numeric_column(
            'Ad_slot_height', 
            normalizer_fn=normalizer_Ad_slot_height
        )
    )
    feature_columns.append(
        feature_column.numeric_column(
            'Ad_slot_floor_price',
            default_value=0
        )
    )
    raw_tensor = DenseFeatures(feature_columns, name='DenseFeatures')(inputs_dict)
    
    ### 1-Order Feature Extractor ###
    x_o1_tensor = raw_tensor
    ### High-Order Feature Extractor ###
    x_oh_tensor = Dense(price_bucket_num/4, activation='relu', kernel_regularizer=regularizers.l2(l2_reg), name='oh_Dense_1')(x_o1_tensor)
    x_oh_tensor = Dense(price_bucket_num/2, activation='relu', kernel_regularizer=regularizers.l2(l2_reg), name='oh_Dense_2')(x_oh_tensor)
    
    ### Output Layer ###
    output_tensor = Concatenate(axis=1)([x_o1_tensor, x_oh_tensor])
    output_tensor = Dense(price_bucket_num, kernel_regularizer=regularizers.l2(l2_reg), name='concat_Dense')(output_tensor)
    output_tensor = Softmax(name='ground_truth')(output_tensor)
    
    model = Model(inputs=[v for v in inputs_dict.values()], 
                  outputs=[output_tensor])
    optimizer = optimizers.Adam(lr=learning_rate)
    model.compile(
        loss=neighbourhood_likelihood_loss,
        optimizer=optimizer,
        experimental_run_tf_function=False
    )
    return model

In [10]:
model = get_model(learning_rate, l2_reg, B_START, B_LIMIT, B_DELTA)

  super(Adam, self).__init__(name, **kwargs)


In [11]:
callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.1, 
                                         verbose=1, mode='min', patience=2, 
                                         restore_best_weights=True),
        # Reduce the learning rate if training plateaues
        tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1)
    ]

history = model.fit(training_dataset,
                    steps_per_epoch = steps_per_epoch,
                    validation_data = validation_dataset,
                    validation_steps = validation_steps,
                    verbose = 2,
                    callbacks = callbacks,
                    epochs = epochs)

Epoch 1/10
10332/10332 - 298s - loss: 1555.8823 - val_loss: 1828.9076 - lr: 0.0010 - 298s/epoch - 29ms/step
Epoch 2/10
10332/10332 - 295s - loss: 1447.3156 - val_loss: 1711.9823 - lr: 0.0010 - 295s/epoch - 29ms/step
Epoch 3/10
10332/10332 - 288s - loss: 1440.8850 - val_loss: 1681.2432 - lr: 0.0010 - 288s/epoch - 28ms/step
Epoch 4/10
10332/10332 - 291s - loss: 1430.2939 - val_loss: 1823.0765 - lr: 0.0010 - 291s/epoch - 28ms/step
Epoch 5/10
Restoring model weights from the end of the best epoch: 3.
10332/10332 - 286s - loss: 1417.9211 - val_loss: 1786.2418 - lr: 0.0010 - 286s/epoch - 28ms/step
Epoch 5: early stopping


In [13]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Ad_exchange (InputLayer)       [(None, 1)]          0           []                               
                                                                                                  
 Ad_slot_floor_price (InputLaye  [(None, 1)]         0           []                               
 r)                                                                                               
                                                                                                  
 Ad_slot_format (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 Ad_slot_height (InputLayer)    [(None, 1)]          0           []                           

In [17]:
!pwd
!mkdir model
!ls

/home/ubuntu/Jampp/Arbitrary_Distribution_Modeling
README.md  ipinyou_example.ipynb  model  nll_adm.py  yoyi_example.ipynb


In [18]:
model.save('model/ADM')

2022-07-21 14:49:33.929698: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: model/ADM/assets


INFO:tensorflow:Assets written to: model/ADM/assets


In [5]:
COLS.update({
    'no_use_1': 'object',
    'no_use_2': 'object'
})
test_dataset = get_csv_dataset(args['test'], 
                               shuffle=False, 
                               batch_size=batch_size,
                               drop_remainder=False, 
                               processes_per_host=process_per_host,
                               repeat=False)

/home/ubuntu/Jampp/data/testing2nd/leaderboard.test.data.20130613_15.txt.bz2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2521630 entries, 0 to 2521629
Data columns (total 17 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   Timestamp            int64 
 1   iPinYou_ID           object
 2   User-Agent           object
 3   IP                   object
 4   Region               int64 
 5   City                 int64 
 6   Ad_exchange          object
 7   Domain               object
 8   URL                  object
 9   Ad_slot_width        int64 
 10  Ad_slot_height       int64 
 11  Ad_slot_visibility   object
 12  Ad_slot_format       object
 13  Ad_slot_floor_price  int64 
 14  Creative_ID          object
 15  Bidding_price        int64 
 16  Paying_price         int64 
dtypes: int64(8), object(9)
memory usage: 327.1+ MB


2022-07-21 14:55:25.523038: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [24]:
from keras.models import load_model
def normalizer_Ad_slot_width(x):
    return x / 1000
def normalizer_Ad_slot_height(x):
    return x / 600

new_model = load_model('model/ADM', custom_objects={
    'neighbourhood_likelihood_loss': neighbourhood_likelihood_loss,
    'normalizer_Ad_slot_width': normalizer_Ad_slot_width,
    'normalizer_Ad_slot_height': normalizer_Ad_slot_height
})

In [17]:
pdf = new_model.predict(test_dataset)

2022-07-21 15:06:44.965951: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 3025956000 exceeds 10% of free system memory.


In [19]:
# calculate prediction price, index of prediction price
p_idx = []
p = []
for record in pdf:
    record = record.tolist()
    # calculate the index of prediction price
    idx = record.index(max(record))
    p_idx.append(idx)
    # calculate prediction price
    p.append(B_START + idx * B_DELTA)

# load ground truth of bidding price and winning price, calculate their index
gt = pd.read_csv(args['test'], names=COLS, header=None, sep='\t', low_memory=False)
b = gt['Bidding_price'].tolist()
z = gt['Paying_price'].tolist()
b_idx = [min(math.floor((x - B_START) / B_DELTA), MAX_IDX) for x in b]
z_idx = [min(math.floor((x - B_START) / B_DELTA), MAX_IDX) for x in z]

In [20]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score

def evaluate(pdf, p, p_idx, b, b_idx, z, z_idx):
    r = []  # the result of prediction (win or lose)
    anlp = 0
    wr_p = []
    value = 0
    for i in range(len(pdf)):
        # number of wins
        r.append(1 if p[i]>z[i] else 0)
        # anlp
        anlp += math.log(pdf[i][z_idx[i]])
#         anlp += math.log(max(pdf[i][z_idx[i]], 1e-10))
        # c-index
        wr_p.append(sum(pdf[i][0:p_idx[i]]))
        # value
        value += z[i] * r[i]

    print('Number of wins =', sum(r), '/', len(r), ', {:.2f}%'.format(sum(r)/len(r)*100))
    mae = mean_absolute_error(p, z)
    print('MAE = {:.2f}'.format(mae))
    print('ANLP =', str(-anlp/len(pdf)))
    c_index = roc_auc_score(r, wr_p)
    print("C-Index = {:.4f}".format(c_index))
    print('Value = {:.2f}'.format(value/sum(r)))

In [21]:
advertiser_key = 3476  # 1458, 3358, 3386, 3427, 3476, 2259, 2261, 2821, 2997
advertiser_key_list = gt["Advertiser_ID"].tolist()

pdf_advertiser = []
p_advertiser = []
p_idx_advertiser = []
b_advertiser = []
b_idx_advertiser = []
z_advertiser = []
z_idx_advertiser = []
for i in range(len(advertiser_key_list)):
    if advertiser_key_list[i] == advertiser_key:
        pdf_advertiser.append(pdf[i])
        p_advertiser.append(p[i])
        p_idx_advertiser.append(p_idx[i])
        b_advertiser.append(b[i])
        b_idx_advertiser.append(b_idx[i])
        z_advertiser.append(z[i])
        z_idx_advertiser.append(z_idx[i])
evaluate(pdf_advertiser, p_advertiser, p_idx_advertiser, b_advertiser, b_idx_advertiser, z_advertiser, z_idx_advertiser)

Number of wins = 127677 / 523848 , 24.37%
MAE = 39.11
ANLP = 3.7506132376416796
C-Index = 0.8885
Value = 60.32


## Medición de tiempos de predicción