In [1]:
import pandas as pd
from itertools import groupby
import numpy as np
from collections import Counter
from imblearn.over_sampling import  SMOTE 
from sklearn.metrics import roc_auc_score

from sklearn import preprocessing
#Set the random seed
np.random.seed(12)
# Initialize label encoder
label_encoder = preprocessing.LabelEncoder()

# import data

In [2]:
df_train = pd.read_csv('train.csv')
df_val = pd.read_csv('validation.csv')
df_train = df_train[df_train.payprice <= df_train.bidprice]
df_test = pd.read_csv('test.csv')

# feature engineering

In [3]:
df_train_Browser = pd.DataFrame(df_train.useragent.str.split('_',1).tolist(),
                                   columns = ['OS','browser'])
df_val_Browser = pd.DataFrame(df_val.useragent.str.split('_',1).tolist(),
                                   columns = ['OS','browser'])
df_test_Browser = pd.DataFrame(df_test.useragent.str.split('_',1).tolist(),
                                   columns = ['OS','browser'])

In [4]:
df_train_click = pd.DataFrame(df_train.click.tolist(),columns=['click'])
df_val_click = pd.DataFrame(df_val.click.tolist(),columns=['click'])
df_test_click = pd.DataFrame(df_val.click.tolist(),columns=['click'])

In [5]:
df_train_domain = pd.DataFrame(df_train.domain.tolist(),columns=['domain'])
df_val_domain = pd.DataFrame(df_val.domain.tolist(),columns=['domain'])
df_test_domain = pd.DataFrame(df_test.domain.tolist(),columns=['domain'])
df_train_domain["domain"] = label_encoder.fit_transform(df_train_domain["domain"])
df_val_domain["domain"] = label_encoder.fit_transform(df_val_domain["domain"])
df_test_domain["domain"] = label_encoder.fit_transform(df_test_domain["domain"])

In [6]:
df_train_slotformat = pd.DataFrame(df_train.slotformat.tolist(),columns=['slotformat'])
df_val_slotformat = pd.DataFrame(df_val.slotformat.tolist(),columns=['slotformat'])
df_test_slotformat = pd.DataFrame(df_val.slotformat.tolist(),columns=['slotformat'])
df_train_slotformat["slotformat"] = label_encoder.fit_transform(df_train_slotformat["slotformat"])
df_val_slotformat["slotformat"] = label_encoder.fit_transform(df_val_slotformat["slotformat"])
df_test_slotformat["slotformat"] = label_encoder.fit_transform(df_test_slotformat["slotformat"])

In [7]:
df_train_slotvisibility = pd.DataFrame(df_train.slotvisibility.tolist(),columns=['slotvisibility'])
df_val_slotvisibility = pd.DataFrame(df_val.slotvisibility.tolist(),columns=['slotvisibility'])
df_test_slotvisibility = pd.DataFrame(df_val.slotvisibility.tolist(),columns=['slotvisibility'])

df_train_slotvisibility["slotvisibility"] = label_encoder.fit_transform(df_train_slotvisibility["slotvisibility"])
df_val_slotvisibility["slotvisibility"] = label_encoder.fit_transform(df_val_slotvisibility["slotvisibility"])
df_test_slotvisibility["slotvisibility"] = label_encoder.fit_transform(df_test_slotvisibility["slotvisibility"])

In [8]:
df_train_slotwidth = pd.DataFrame(df_train.slotwidth.tolist(),columns=['slotwidth'])
df_val_slotwidth = pd.DataFrame(df_val.slotwidth.tolist(),columns=['slotwidth'])
df_test_slotwidth = pd.DataFrame(df_test.slotwidth.tolist(),columns=['slotwidth'])

In [9]:
df_train_slotheight = pd.DataFrame(df_train.slotheight.tolist(),columns=['slotheight'])
df_val_slotheight = pd.DataFrame(df_val.slotheight.tolist(),columns=['slotheight'])
df_test_slotheight = pd.DataFrame(df_test.slotheight.tolist(),columns=['slotheight'])

In [10]:
df_train_slotprice = pd.DataFrame(df_train.slotprice.tolist(),columns=['slotprice'])
df_val_slotprice = pd.DataFrame(df_val.slotprice.tolist(),columns=['slotprice'])
df_test_slotprice = pd.DataFrame(df_test.slotprice.tolist(),columns=['slotprice'])

In [11]:
df_train_weekday = pd.get_dummies(df_train['weekday'],prefix='weekday')
df_train_hour = pd.get_dummies(df_train['hour'],prefix='hour')
df_train_advertiser = pd.get_dummies(df_train['advertiser'],prefix='advertiser')
df_train_OS = pd.get_dummies(df_train_Browser['OS'],prefix='OS')
df_train_browser = pd.get_dummies(df_train_Browser['browser'],prefix='browser')

In [12]:
df_val_weekday = pd.get_dummies(df_val['weekday'],prefix='weekday')
df_val_hour = pd.get_dummies(df_val['hour'],prefix='hour')
df_val_advertiser = pd.get_dummies(df_val['advertiser'],prefix='advertiser')
df_val_OS = pd.get_dummies(df_val_Browser['OS'],prefix='OS')
df_val_browser = pd.get_dummies(df_val_Browser['browser'],prefix='browser')

In [13]:
df_test_weekday = pd.get_dummies(df_test['weekday'],prefix='weekday')
df_test_hour = pd.get_dummies(df_test['hour'],prefix='hour')
df_test_advertiser = pd.get_dummies(df_test['advertiser'],prefix='advertiser')
df_test_OS = pd.get_dummies(df_test_Browser['OS'],prefix='OS')
df_test_browser = pd.get_dummies(df_test_Browser['browser'],prefix='browser')

In [14]:
df_train_dummy = df_train_click.join(df_train_weekday)
df_train_dummy = df_train_dummy.join(df_train_hour)
df_train_dummy = df_train_dummy.join(df_train_advertiser)
df_train_dummy = df_train_dummy.join(df_train_OS)
df_train_dummy = df_train_dummy.join(df_train_browser)

In [15]:
df_val_dummy = df_val_click.join(df_val_weekday)
df_val_dummy = df_val_dummy.join(df_val_hour)
df_val_dummy = df_val_dummy.join(df_val_advertiser)
df_val_dummy = df_val_dummy.join(df_val_OS)
df_val_dummy = df_val_dummy.join(df_val_browser)

In [16]:
df_test_dummy = df_test_click.join(df_test_weekday)
df_test_dummy = df_test_dummy.join(df_test_hour)
df_test_dummy = df_test_dummy.join(df_test_advertiser)
df_test_dummy = df_test_dummy.join(df_test_OS)
df_test_dummy = df_test_dummy.join(df_test_browser)

In [17]:
df_train_dummy = df_train_dummy.join(df_train_domain)
df_train_dummy = df_train_dummy.join(df_train_slotwidth)
df_train_dummy = df_train_dummy.join(df_train_slotheight)
df_train_dummy = df_train_dummy.join(df_train_slotprice)
df_train_dummy = df_train_dummy.join(df_train_slotformat)
df_train_dummy = df_train_dummy.join(df_train_slotvisibility)

In [18]:
df_val_dummy = df_val_dummy.join(df_val_domain)
df_val_dummy = df_val_dummy.join(df_val_slotwidth)
df_val_dummy = df_val_dummy.join(df_val_slotheight)
df_val_dummy = df_val_dummy.join(df_val_slotprice)
df_val_dummy = df_val_dummy.join(df_val_slotformat)
df_val_dummy = df_val_dummy.join(df_val_slotvisibility)

In [19]:
df_test_dummy = df_test_dummy.join(df_test_domain)
df_test_dummy = df_test_dummy.join(df_test_slotwidth)
df_test_dummy = df_test_dummy.join(df_test_slotheight)
df_test_dummy = df_test_dummy.join(df_test_slotprice)
df_test_dummy = df_test_dummy.join(df_test_slotformat)
df_test_dummy = df_test_dummy.join(df_test_slotvisibility)

In [20]:
df_train_dummy[1:5]

Unnamed: 0,click,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,hour_0,hour_1,...,browser_other,browser_safari,browser_sogou,browser_theworld,domain,slotwidth,slotheight,slotprice,slotformat,slotvisibility
1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,22143,250,250,5,0,1
2,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,18804,336,280,0,1,0
3,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,20130,728,90,162,0,0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,19435,950,90,0,1,0


In [21]:
df_val_dummy[1:5]

Unnamed: 0,click,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,hour_0,hour_1,...,browser_other,browser_safari,browser_sogou,browser_theworld,domain,slotwidth,slotheight,slotprice,slotformat,slotvisibility
1,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,6315,320,50,118,3,5
2,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,810,1000,90,70,0,0
3,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3159,1000,90,70,0,0
4,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6315,336,280,5,0,2


In [22]:
df_test_dummy[1:5]

Unnamed: 0,click,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,hour_0,hour_1,...,browser_other,browser_safari,browser_sogou,browser_theworld,domain,slotwidth,slotheight,slotprice,slotformat,slotvisibility
1,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1806,1000,90,80,3,5
2,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,582,1000,90,20,0,0
3,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8307,336,280,0,0,0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1806,1000,90,31,0,2


# feature selection

In [23]:
df_train_dummy_dropna = df_train_dummy.dropna()
df_val_dummy_dropna = df_val_dummy.dropna()

In [24]:
features_gbdt = [x for x in df_train_dummy.columns if x!="click"]

In [25]:
X_train = df_train_dummy_dropna[features_gbdt]
Y_train = df_train_dummy_dropna['click']

X_val = df_val_dummy_dropna[features_gbdt]
Y_val = df_val_dummy_dropna['click']

In [26]:
X_train.dropna()

Unnamed: 0,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,hour_0,hour_1,hour_2,...,browser_other,browser_safari,browser_sogou,browser_theworld,domain,slotwidth,slotheight,slotprice,slotformat,slotvisibility
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,20677,468,60,5,0,1
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,22143,250,250,5,0,1
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,18804,336,280,0,1,0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,20130,728,90,162,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,19435,950,90,0,1,0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,19239,300,250,0,1,1
6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1656,728,90,5,3,8
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,61,960,90,0,3,5
8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,22199,336,280,5,0,2
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4843,1000,90,31,0,0


# XGBoost

In [27]:
features_xgb = [x for x in df_train_dummy.columns if (x!="click" and x!="city_217" and x!= "219" and x!= "1")]

In [28]:
len(features_xgb)

61

In [29]:
X_train_xgb = df_train_dummy_dropna[features_xgb]
Y_train_xgb = df_train_dummy_dropna['click']

X_val_xgb = df_val_dummy_dropna[features_xgb]
Y_val_xgb = df_val_dummy_dropna['click']

X_test_xgb = df_val_dummy_dropna[features_xgb]

In [30]:
#X_train_res_xgb, Y_train_res_xgb = sm.fit_sample(X_train_xgb, Y_train_xgb)

In [31]:
import xgboost as xgb

xgbclf = xgb.XGBClassifier(max_depth=5, n_estimators=100)




In [32]:
xgbclf = xgbclf.fit(X_train_xgb,Y_train_xgb)

In [33]:
xgbclf

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [34]:

Y_pred_xgb_prob = xgbclf.predict_proba(X_val_xgb)
aucxgb = roc_auc_score(Y_val_xgb, Y_pred_xgb_prob[:,1])
aucxgb

0.71012515356644201

In [35]:
from sklearn.metrics import mean_squared_error, make_scorer
mse = mean_squared_error(Y_val, Y_pred_xgb_prob[:,1], sample_weight=None, multioutput='uniform_average')
mse

0.00075221747098574196

# bidding strategy

# linear bidding strategy

In [36]:
pCTR = Y_pred_xgb_prob
# Average CTR
avgCTR = (df_train.click.sum() / df_train.logtype.sum()) 
avgCTR

0.0007454510034874044

In [37]:
# Linear bids for the bid range
base_bids = np.arange(2,302,2)

bids = []
for base_bid in base_bids:
    for p in pCTR[:,1]:
        bid = base_bid * p / avgCTR
        bids.append(bid)

bid_chunks = [bids[x:x+len(pCTR)] for x in range(0, len(bids), len(pCTR))]

In [38]:
def linear_bidding(bids):
    impression = 0.0
    clicks = 0
    cost = 0.0
    budget = 6250*1000
    
    bool_check = bids >= df_val.payprice
    for i in range(0,len(bool_check)):
            if bool_check[i]==True:
                impression += 1.0
                clicks += df_val.click[i]
                cost += df_val.payprice[i]
            if cost >= budget:
                break
    return impression, clicks, cost

lin = pd.DataFrame()
lin['bid'] = base_bids

im = []
clks = []
ct = []

for bids in bid_chunks:
    [imps, clicks, cost] = linear_bidding(bids)
    im.append(imps)
    clks.append(clicks)
    ct.append(cost)
lin['imps_won'] = im
lin['total_spend'] = ct
lin['clicks'] = clks
lin['CTR'] = (lin.clicks/len(df_val) * 100).round(2).astype(str)
lin['CPM'] = (lin.total_spend/len(df_val) * 1000).round(2).astype(str)
lin['CPC'] = (lin.total_spend/lin.clicks).round(2).astype(str)
lin

Unnamed: 0,bid,imps_won,total_spend,clicks,CTR,CPM,CPC
0,2,418.0,976.0,1,0.0,3.26,976.0
1,4,1964.0,10504.0,10,0.0,35.04,1050.4
2,6,3391.0,27372.0,15,0.01,91.32,1824.8
3,8,6471.0,53573.0,16,0.01,178.73,3348.31
4,10,9333.0,85575.0,16,0.01,285.49,5348.44
5,12,12493.0,127202.0,18,0.01,424.36,7066.78
6,14,15083.0,167857.0,22,0.01,559.99,7629.86
7,16,17713.0,210692.0,22,0.01,702.89,9576.91
8,18,20184.0,256378.0,24,0.01,855.31,10682.42
9,20,22788.0,311436.0,26,0.01,1038.99,11978.31


In [39]:
# select base_bid
lista = list(lin['clicks'])
index = lista[lin.clicks.max()]
best_base_bid = lin.bid[index]

In [56]:
df_max = lin[lin.clicks == lin.clicks.max()]

In [57]:
df_max

Unnamed: 0,bid,imps_won,total_spend,clicks,CTR,CPM,CPC
41,84,145124.0,6250046.0,127,0.04,20850.93,49212.96


In [62]:
base_bid = df_max.bid.
base_bid

array([84])

In [41]:
best_base_bid

166

In [55]:
lin[lin.clicks == lin.clicks.max()].bid

41    84
Name: bid, dtype: int64

In [52]:
base_bid

41    84
Name: bid, dtype: int64

# constant bidding strategy

In [66]:
def constant_bidding(constant_bid):
    impression = 0.0
    clicks = 0
    miss_bid=0
    cost = 0.0
    budget = 6250*1000
    
    
    for click, pay_price in df_val[['click','payprice']].values:
        
            if constant_bid<=pay_price or constant_bid>budget:
                miss_bid=miss_bid+1
            if constant_bid > pay_price:
                impression = impression+1.0
                clicks =clicks + click
                cost =cost + pay_price
            if cost >= budget:
                break
    return miss_bid,impression, clicks, cost

const = pd.DataFrame()
const['constants_bidding'] = np.arange(2,501,1)

miss_bidding=[]
impression = []
clicks = []
cost = []
for con in const['constants_bidding']:
    [miss,imps, clks, ct] = constant_bidding(con)
    miss_bidding.append(miss)
    impression.append(imps)
    clicks.append(clks)
    cost.append(ct)
const['miss_bidding']=miss_bidding
const['imps_won'] = impression
const['total_spend'] = cost
const['clicks'] = clicks
const['CTR'] = (const.clicks/const.imps_won * 100).round(2).astype(str)
const['CPM'] = (const.total_spend/const.imps_won * 1000).round(2).astype(str)
const['CPC'] = (const.total_spend/const.clicks).round(2).astype(str)


In [67]:
const[const.clicks == const.clicks.max()]

Unnamed: 0,constants_bidding,miss_bidding,imps_won,total_spend,clicks,CTR,CPM,CPC
88,90,51747,130728.0,6250054.0,85,0.07,47809.6,73530.05
148,150,18569,106459.0,6250030.0,85,0.08,58708.33,73529.76
150,152,17510,105309.0,6250040.0,85,0.08,59349.53,73529.88
151,153,17338,105117.0,6250019.0,85,0.08,59457.74,73529.64
155,157,16496,104179.0,6250027.0,85,0.08,59993.16,73529.73
156,158,16152,103767.0,6250033.0,85,0.08,60231.41,73529.8
157,159,15968,103566.0,6250026.0,85,0.08,60348.24,73529.72
158,160,15834,103404.0,6250004.0,85,0.08,60442.57,73529.46


# random bidding strategy

In [68]:
from random import randrange

def random_bidding(upper_bound):
    impression = 0.0
    clicks = 0
    miss_bid=0
    cost = 0.0
    budget = 6250*1000
    
    for click,pay_price in df_val[['click','payprice']].values:  
        rand_no = randrange(upper_bound)
        if rand_no<=pay_price or rand_no>budget:
            miss_bid=miss_bid+1
        if rand_no > pay_price:
            impression += 1
            clicks += click
            cost += pay_price
        if cost >= budget:
                break
    return miss_bid,impression, clicks, cost
            
randm = pd.DataFrame()
randm['upper_bound'] = np.arange(2,501,1)

miss_bid=[]
impression = []
clicks = []
cost = []
for upper in randm['upper_bound']:
    [miss,imps, clks, ct] = random_bidding(upper)
    miss_bid.append(miss)
    impression.append(imps)
    clicks.append(clks)
    cost.append(ct)
randm['miss_bidding']=miss_bid
randm['imps_won'] = impression
randm['total_spend'] = cost
randm['clicks'] = clicks
randm['CTR'] = (randm.clicks/randm.imps_won * 100).round(5).astype(str)
randm['CPM'] = (randm.total_spend/randm.imps_won * 1000).round(5).astype(str)
randm['CPC'] = (randm.total_spend/randm.clicks).round(5).astype(str)


In [69]:
randm[randm.clicks == randm.clicks.max()]

Unnamed: 0,upper_bound,miss_bidding,imps_won,total_spend,clicks,CTR,CPM,CPC
241,243,54821,109922.0,6250047.0,86,0.07824,56858.92724,72674.96512
