# Import Data

In [73]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pickle
import time
import os.path
import math
from xgboost import XGBClassifier
from sklearn.externals import joblib
import pandas as pd

n_bytes = 2**31
max_bytes = 2**31 - 1

class Time_Tracking():
    
    start_time = None
    
    def start_tracking(self):
        
        self.start_time = time.time()
    
    def stop_tracking(self):
        
        print("Time used:", round(((time.time() - self.start_time)/60),2), ' minutes')


def load_pickle(file_path):
    
    bytes_in = bytearray(0)
    input_size = os.path.getsize(file_path)
    with open(file_path, 'rb') as f_in:
        for _ in range(0, input_size, max_bytes):
            bytes_in += f_in.read(max_bytes)
    
    return pickle.loads(bytes_in)


In [74]:
model = joblib.load("xgb.pickle.dat")

train = load_pickle('train_df_after_preprocessing.pkl')
valid = load_pickle('valid_df_after_preprocessing.pkl')
test = load_pickle('test_df_after_preprocessing.pkl')

valid_click = valid['click']
valid_bidprice = valid['bidprice']
valid_payprice = valid['payprice']
valid_X = valid.drop(['click', 'bidprice', 'payprice'], axis = 1)

### 暂时没时间 tune xgboost, 先 tune 各种strategy

In [75]:
# strategy 1: linear bidding(base_line)
# strategy 2: OCTR
# strategy 3: threshold linear bidding 
# strategy 4: linear bidding with floor price 
# strategy 5: sigmoid threshold bid 

# threshold 可能效果不是很好，因为 threshold = 0.5 的时候，failed = 61, 

In [76]:
valid_ctr_prediction = model.predict_proba(valid_X)

In [77]:
def EvaluateClick( bid_price, pCTR, budget, valid_payprice, valid_click):
    
    auction = bid_price >= valid_payprice
    win_bid_ix = [index for index, auction in enumerate(auction) if auction]

    clicks = sum(valid_click[win_bid_ix])
    spend = sum(valid_payprice[win_bid_ix])
    num_impress = len(win_bid_ix)

    sorted_pCTR_index = sorted(range(len(auction)), key=lambda k: pCTR[k])

    if spend > budget:
        print('overspend')
        for bid_id in sorted_pCTR_index:

            spend -= bid_price[bid_id]
            clicks -= valid_click[bid_id]
            num_impress -=1

            if spend <= budget:
                break

    print('spend:', spend, ' click:', clicks, 'impression:', num_impress)
    return spend,clicks, num_impress, bid_price


pCTR = valid_ctr_prediction[:,1]
bid_price = 10*pCTR
EvaluateClick( bid_price, pCTR, 6250000, valid_payprice, valid_click);

spend: 9273  click: 4 impression: 2249


In [78]:
# search the best base_price
base_prices = np.arange(1640,1660, 1)
spend = []
clicks = []

for base in base_prices:
    
    pCTR = valid_ctr_prediction[:,1]
    bid_price = base*pCTR
    s,c,_,bid_prices = EvaluateClick( bid_price, pCTR, 6250000,valid_payprice, valid_click)
    spend.append(s)
    clicks.append(c)
    if c == 202:
        print('best base:',base)
        print(bid_prices)
        break

overspend
spend: 6249782.10552  click: 201 impression: 233651
overspend
spend: 6249738.75554  click: 201 impression: 233685
overspend
spend: 6249952.90175  click: 201 impression: 233719
overspend
spend: 6249971.40439  click: 201 impression: 233753
overspend
spend: 6249965.45554  click: 202 impression: 233786
best base: 1644
[ 400.16662598  254.08079529  292.98690796 ...,  873.49316406  306.6690979
  398.89987183]


### The best constant is 1644.

In [79]:
max(bid_prices)

1638.4725

In [80]:
### prediction
test_prediction = model.predict_proba(test)[:,1]

budget = 6250000
base_price = 10
bid_price = base_price*test_prediction/0.0007
sorted_pCTR_index = sorted(range(len(bid_price)), key=lambda k: bid_price[k])
spend = sum(bid_price)

if spend > budget:
    print('overspend')
    for bid_id in sorted_pCTR_index:

        spend -= bid_price[bid_id]
        bid_price[bid_id] = 0
        
        if spend <= budget:
            break

print('spend:', spend)


submission = pd.read_csv('Group_xx.csv')
submission['bidprice'] = bid_price
submission.to_csv('Group_9.csv')

overspend
spend: 6249347.18262


### As we can see, the linear bidding is good at getting golden bid( absolute clicks ), so the bidding price can be really high. This may helpful in the multinomial case. But this may leads to huge left of budget. 

# Threshold sigmoid bidding: 1/( 1+exp^(-z) )

In [76]:
sigmoid = [ 1/( 1 + math.exp(-(z-0.5))) for z in pCTR ]
bid_price = [300*s for s in sigmoid]

In [86]:
# search the best max_bid_price
base_prices =  np.arange(620,623, 1)
spend = []
clicks = []

for base in base_prices:
    
    bid_price = [base*s for s in sigmoid]
    s,c,_,bid_prices = EvaluateClick( bid_price, pCTR, 6250000,valid_payprice, valid_click)
    spend.append(s)
    clicks.append(c)
    if c == 202:
        print('best base:',base)
        print(bid_prices)
        break

overspend
spend: 6249806.33884  click: 202 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### The best constant is 620. 

In [91]:
### prediction
test_prediction = model.predict_proba(test)[:,1]
sigmoid = [ 1/( 1 + math.exp(-(z-0.5))) for z in test_prediction ]

budget = 6250000
base_price = 620
bid_price = [base_price*s for s in sigmoid]
sorted_pCTR_index = sorted(range(len(bid_price)), key=lambda k: bid_price[k])
spend = sum(bid_price)

if spend > budget:
    print('overspend')
    for bid_id in sorted_pCTR_index:

        spend -= bid_price[bid_id]
        bid_price[bid_id] = 0
        
        if spend <= budget:
            break

print('spend:', spend)

overspend
spend: 6249699.0989470435


In [94]:
submission = pd.read_csv('Group_xx.csv')
submission['bidprice'] = bid_price
submission.to_csv('Group_9.csv')

In [96]:
max(submission['bidprice'])

385.6756695093367

# Save prediction

In [13]:
submission = pd.read_csv('Group_xx.csv')
submission['bidprice'] = bid_price
submission.to_csv('Group_9.csv')