# Estimating Click Through Rate for Real-Time Bidding
iPinYou: Global Bidding Algorithm Competition
http://contest.ipinyou.com/introduction.shtml
http://contest.ipinyou.com/manual.shtml
http://contest.ipinyou.com/data

<img class="avia_image " src="http://www.mediative.com/wp-content/uploads/2015/01/REal-time-bidding.png" alt="Real time bidding" title="Real time bidding" itemprop="contentURL">
http://www.mediative.com/wp-content/uploads/2015/01/REal-time-bidding.png

In [None]:
import sys
import random
import math
import operator
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import mean_squared_error
import sklearn
import sklearn.datasets
import sklearn.linear_model
import os.path
from IPython.core.debugger import Tracer
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
%matplotlib inline
import time

In [None]:
# Which iPinYou campaign to work on, and where its train/test files live.
campaign = "2997"
#campaign = "3358"
folder = '../../make-ipinyou-data'
resultsfolder = '../results'
# Both data files sit in <folder>/<campaign>/.
campaignDir = os.path.join(folder, campaign)
trainYzxFile = os.path.join(campaignDir, 'train.yzx.txt')
testYzxFile = os.path.join(campaignDir, 'test.yzx.txt')

In [None]:
# Logistic-regression settings.
random.seed(10)          # fixed seed so the random initial weights are reproducible
featWeight = {}          # feature id -> learned weight, filled in during training
bufferCaseNum = 1000000  # number of records buffered before they are processed
eta = 0.01               # learning rate
lamb = 1e-6              # regularization parameter
initWeight = 0.05        # width of the interval initial weights are drawn from

def nextInitWeight():
    """Return a random initial weight in [-initWeight/2, initWeight/2)."""
    centred = random.random() - 0.5  # random.random() is in [0, 1)
    return centred * initWeight
def ints(s):
    """Convert every element of the iterable ``s`` to ``int``.

    Returns a new list in the same order as ``s``. Raises ``ValueError``
    (from ``int``) if an element is not a valid integer literal.
    """
    return [int(ss) for ss in s]
def sigmoid(p):
    """Return the logistic function 1 / (1 + exp(-p)) of ``p``.

    For very negative ``p`` the intermediate ``exp(-p)`` exceeds the float
    range and ``math.exp`` raises ``OverflowError``; the limit value 0.0 is
    returned in that case instead of aborting the caller's loop.
    """
    try:
        return 1.0 / (1.0 + math.exp(-p))
    except OverflowError:
        # exp(-p) -> +inf, hence the sigmoid -> 0
        return 0.0

# Training: LOGISTIC REGRESSION

In [None]:
def _lrTrainOnBuffer(buffer, weights, labels, preds):
    """Run sequential SGD for logistic regression over the records in ``buffer``.

    Each record is a list of ints: [click, market price, feature id, ...].
    For every record the prediction made *before* the update is appended to
    ``preds`` and its click label to ``labels``; ``weights`` (feature id ->
    weight) is updated in place. Uses the module-level ``eta`` and ``lamb``.
    """
    fsid = 2 # feature start id (data[0] = click, data[1] = market price)
    for data in buffer:
        clk = data[0]
        feats = data[fsid:]
        # predict: every listed feature contributes its weight once (x_i = 1)
        pred = 0.0
        for feat in feats:
            if feat not in weights:
                weights[feat] = nextInitWeight()
            pred += weights[feat]
        pred = sigmoid(pred)
        labels.append(clk)
        preds.append(pred)
        # update: w_i <- w_i * (1 - lamb) + eta * (y - p)
        for feat in feats:
            weights[feat] = weights[feat] * (1 - lamb) + eta * (clk - pred)


# Stream the training file, training on buffers of bufferCaseNum records.
bufLineNum = 0
trainData = []
y = []   # click labels, in file order
yp = []  # predicted click probabilities, in file order
time1 = time.time()
with open(trainYzxFile, 'r') as fi:
    for line in fi:
        bufLineNum = (bufLineNum + 1) % bufferCaseNum
        # drop the ":1" value suffixes so only integer ids remain
        trainData.append(ints(line.replace(":1", "").split()))
        if bufLineNum == 0:
            _lrTrainOnBuffer(trainData, featWeight, y, yp)
            trainData = []
# the last buffer is only partially filled when nr lines is not a
# multiple of bufferCaseNum
if trainData:
    _lrTrainOnBuffer(trainData, featWeight, y, yp)
time2 = time.time()
print('LR duration:' + '\t' + str(time2 - time1))
auc = roc_auc_score(y, yp)
rmse = math.sqrt(mean_squared_error(y, yp))
print('LR Train: ' + '\t' + str(auc) + '\t' + str(rmse))
fpr, tpr, thresholds = roc_curve(y, yp)

In [None]:
# ROC curve of the LR predictions on the training data.
lw = 2
plt.figure()
plt.plot(fpr, tpr, color='darkorange', linewidth=lw,
         label='ROC curve (area = %0.2f)' % auc)
# dashed diagonal for reference
plt.plot([0, 1], [0, 1], color='navy', linewidth=lw, linestyle='--')
plt.title('LR Training Receiver operating characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

# Training: NEURAL NETS

In [None]:
# Neural-net layout; the input width is the number of features seen by LR.
firstFeatNr = 27  # offset subtracted from a feature id to get its input column
nn_input_dim = nrFeat = len(featWeight)
print('Nr features: ' + str(nrFeat))
nn_hdim = 10       # hidden layer size
nn_output_dim = 2  # output layer size
num_passes = 1     # gradient steps per mini-batch

# Gradient descent parameters
epsilon = 0.01     # learning rate
reg_lambda = 0.01  # regularization

# mini-batch size; also used as the training set size of one gradient step
num_examples = bufferCaseNum = 100

In [None]:
# Train a one-hidden-layer network (tanh hidden units, softmax output) with
# mini-batch gradient descent over the training file.
np.random.seed(0)
W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
b1 = np.zeros((1, nn_hdim))
W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
b2 = np.zeros((1, nn_output_dim))
lineNum = 0
trainData = []
y = []   # click labels of all processed records
yp = []  # predicted click probability (output column 1) of those records
nrLines = 0
#fsid = 2 # feature start id
fsid = 3 # as all data[2]=0!?!
time1 = time.time()
with open(trainYzxFile, 'r') as fi:
    for line in fi:
        nrLines += 1
        lineNum = (lineNum + 1) % bufferCaseNum
        trainData.append(ints(line.replace(":1", "").split()))
        if lineNum != 0:
            continue
        # A full mini-batch is buffered: build the 0/1 input matrix and
        # collect the click label of every row.
        X = np.zeros((bufferCaseNum, nrFeat))
        batchLabels = []
        for dataLineNum, data in enumerate(trainData):
            batchLabels.append(data[0])
            for feat in data[fsid:]:
                X[dataLineNum, feat - firstFeatNr] = 1
        y.extend(batchLabels)
        batchY = np.array(batchLabels)
        for i in range(num_passes):
            # Forward propagation
            z1 = X.dot(W1) + b1
            a1 = np.tanh(z1)
            z2 = a1.dot(W2) + b2
            exp_scores = np.exp(z2)
            probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
            if i == num_passes - 1:
                yp.extend(probs[:, 1])
            # Backpropagation
            delta3 = probs
            # Subtract 1 from the probability of each row's OWN label.
            # (Using the scalar label of the last record here would train
            # the whole batch against a single label.)
            delta3[np.arange(num_examples), batchY] -= 1
            dW2 = (a1.T).dot(delta3)
            db2 = np.sum(delta3, axis=0, keepdims=True)
            delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
            dW1 = np.dot(X.T, delta2)
            db1 = np.sum(delta2, axis=0)

            # Add regularization terms (b1 and b2 don't have regularization terms)
            dW2 += reg_lambda * W2
            dW1 += reg_lambda * W1

            # Gradient descent parameter update
            W1 += -epsilon * dW1
            b1 += -epsilon * db1
            W2 += -epsilon * dW2
            b2 += -epsilon * db2
        trainData = []
# NOTE: records left in trainData here (a trailing batch with fewer than
# bufferCaseNum lines) are neither trained on nor scored.
time2 = time.time()
print('NN duration:' + '\t' + str(time2 - time1))
auc = roc_auc_score(y, yp)
rmse = math.sqrt(mean_squared_error(y, yp))
print('NN Train: ' + '\t' + str(auc) + '\t' + str(rmse))
fpr, tpr, thresholds = roc_curve(y, yp)

In [None]:
# ROC curve of the NN predictions on the training data.
lw = 2
plt.figure()
plt.plot(fpr, tpr, color='darkorange', linewidth=lw,
         label='ROC curve (area = %0.2f)' % auc)
# dashed diagonal for reference
plt.plot([0, 1], [0, 1], color='navy', linewidth=lw, linestyle='--')
plt.title('NN Training Receiver operating characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

# Testing: LOGISTIC REGRESSION

Recorded neural-net training result with num_passes = 1:
NN Train: AUC = 0.530687996247, RMSE = 0.0910345737046

In [None]:
# Score the test file with the trained LR weights.
fsid = 2 # feature start id (data[0] = click, data[1] = market price)
y = []
yp = []
with open(testYzxFile, 'r') as fi:
    for line in fi:
        data = ints(line.replace(":1", "").split())
        clk = data[0] # click
        # features without a trained weight contribute nothing
        pred = sigmoid(sum((featWeight[feat] for feat in data[fsid:]
                            if feat in featWeight), 0.0))
        y.append(clk)
        yp.append(pred)
auc = roc_auc_score(y, yp)
rmse = math.sqrt(mean_squared_error(y, yp))
print('LR Test: ' + '\t' + str(auc) + '\t' + str(rmse))
fpr, tpr, thresholds = roc_curve(y, yp)

# output the weights, one "feature<TAB>weight" line per feature, sorted by id
with open(trainYzxFile + '.lr.weight', 'w') as fo:
    for fv in sorted(featWeight.items(), key=operator.itemgetter(0)):
        fo.write(str(fv[0]) + '\t' + str(fv[1]) + '\n')

# output the prediction, one per test line. The predictions evaluated above
# are written as-is, so the file always matches the reported AUC/RMSE and
# the market price column (data[1]) is never mistaken for a feature id.
with open(testYzxFile + '.lr.pred', 'w') as fo:
    for pred in yp:
        fo.write(str(pred) + '\n')

In [None]:
# ROC curve of the LR predictions on the test data.
lw = 2
plt.figure()
plt.plot(fpr, tpr, color='darkorange', linewidth=lw,
         label='ROC curve (area = %0.2f)' % auc)
# dashed diagonal for reference
plt.plot([0, 1], [0, 1], color='navy', linewidth=lw, linestyle='--')
plt.title('LR Testing Receiver operating characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()