# Keyphrase Performance Prediction for Search Engine Advertisers 

Search engine adversitsers bid for keyphrases based on keyword performance feedback and budget allocation for advertising. Keyword performance is defined by the rank, impressions and clicks received by the advertisement. This performance depends on several hidden factors such as Yahoo's evaluation of the advertisement, competitor bidding strategies and relevance of advertisement.In this project, we have designed a prediction model to estimate keyphrase performance for an advertiser based on 77 Million historical keyphrase performance data release by Yahoo! Webscope for 4-month window.

In [108]:
import gzip
import tarfile
import pickle
import operator
import warnings
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import (Ridge, TheilSenRegressor, RANSACRegressor, HuberRegressor)

from sklearn.ensemble import (RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor, AdaBoostRegressor, 
                              GradientBoostingRegressor)

from sklearn.neighbors import (KNeighborsRegressor)
from sklearn.tree import DecisionTreeRegressor

from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error

%matplotlib inline

In [4]:
# Disable warnings
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore", category=DeprecationWarning) 

## Data Scrapping and Formatting

In [85]:
ydata_df = pd.read_pickle('data8Keyphase.p')
ydata_df['day'] = ydata_df['day'].astype('int') 
ydata_df['rank'] = ydata_df['rank'].astype('int')
ydata_df['avg_bid'] = ydata_df['avg_bid'].astype('float')
ydata_df['impr'] = ydata_df['impr'].astype('float')
ydata_df['clicks'] = ydata_df['clicks'].astype('float')
ydata_df['keyphrase'] = ydata_df.keyphrase.astype('str')

=========================================================================================================================

## Data Pruning Functions

In [10]:
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(3415)
    perm = np.random.permutation(df.index)
    m = len(df)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.ix[perm[:train_end]]
    validate = df.ix[perm[train_end:validate_end]]
    test = df.ix[perm[validate_end:]]
    return train, validate, test

In [11]:
# Remove datapoints with keyphrase having less than thresh competitors
def filter_competitive_keyph(ydata_df, thresh):
    keyph_dict = ydata_df['keyphrase'].value_counts().to_dict()
    comp_keyph = []
    for key in keyph_dict.keys():
        if keyph_dict[key] > thresh:
            comp_keyph.append(key)
    ydata_df = ydata_df[ydata_df['keyphrase'].isin(comp_keyph)]
    return ydata_df

In [12]:
# Remove datapoints with advertisers whose bid value remains fixed for a keyphrase
def filter_fixed_bidders(ydata_df, thresh):
    bidders = ydata_df['account_id'].unique()
    pulse_bidder = pd.DataFrame()
    for b in bidders:
        df = ydata_df[ydata_df['account_id'] == b]
        if df.avg_bid.std(axis=0) > thresh:
            pulse_bidder = pulse_bidder.append(df)
    return pulse_bidder

============================================================================================

# Filter Dataset

In [87]:
competitive_ydata_df = filter_competitive_keyph(ydata_df, 20)
ydata = filter_fixed_bidders(competitive_ydata_df, 0.01)
ydata_acc_kp = ydata.groupby(['account_id','keyphrase'])
ydata = ydata.sort_values(by='day')

=======================================================================================================

# Data Manipulation

In [None]:
Ydata = ydata
ygrouped = Ydata.groupby(['account_id','keyphrase', 'day'])
days = Ydata['day'].unique()
accounts = Ydata['account_id'].unique()
account_kp = defaultdict(list)
for adv in accounts:
    account_kp[adv] = Ydata[Ydata['account_id']==adv]['keyphrase'].unique()

In [None]:
ysqueeze = pd.DataFrame()
olist = []
for day in days:
    y_day = Ydata[(Ydata['day']==day)]
    for adv in account_kp.keys():
        y_adv = y_day[(y_day['account_id']==adv)]
        for kp in account_kp[adv]:
            data_pt = []
            y_ackp = y_adv[(y_adv['keyphrase']==kp)]
            if y_ackp.empty:
                continue
            data_pt.append(day)  # day
            data_pt.append(adv) # account_id
            data_pt.append(kp) # key_phrase
            data_pt.append(y_ackp['avg_bid'].mean()) # avg_bid mean
            data_pt.append(y_ackp['avg_bid'].max()) # avg_bid max
            data_pt.append(y_ackp['avg_bid'].min()) # avg_bid min
            data_pt.append(y_ackp['avg_bid'].std()) # avg_bid std
            data_pt.append(y_ackp['impr'].sum()) # impr sum
            data_pt.append(y_ackp['impr'].max()) # impr max
            data_pt.append(y_ackp['impr'].min()) # impr min
            data_pt.append(y_ackp['impr'].std()) # impr std
            data_pt.append(y_ackp['clicks'].sum()) # clicks sum
            data_pt.append(y_ackp['clicks'].max()) # clicks max
            data_pt.append(y_ackp['clicks'].min()) # clicks min
            data_pt.append(y_ackp['clicks'].std()) # clicks std
            data_pt.append((y_ackp['impr'].sum()) * (1.0/(y_ackp['impr']/y_ackp['rank']).sum())) # Weighted (inv) mean of rank
            data_pt.append(y_ackp['rank'].max()) # Max rank
            data_pt.append(y_ackp['rank'].min()) # Min rank
            data_pt.append(y_ackp['rank'].std()) # Rank std
            olist.append(data_pt)
ysqueeze = pd.DataFrame(olist, columns=['day', 'account_id', 'keyphrase', 'avg_bid', 'avg_bid_max', 
                                        'avg_bid_min', 'avg_bid_std','impr', 'impr_max', 'impr_min', 
                                        'impr_std', 'clicks', 'clicks_max', 'clicks_min','clicks_std', 
                                        'rank', 'rank_max', 'rank_min', 'rank_std'])


In [None]:
ysqueeze.to_pickle('ysqueezed_topKp.pkl')

=======================================================================================================================

## Feature Functions 

In [8]:
def avg_bid_lastmonth(adv, keyph, day, ydata_df):
    hist_adv = ydata_df[(ydata_df['account_id']==adv) & (ydata_df['keyphrase']==keyph) & (ydata_df['day']<day)]
    return hist_adv

In [9]:
def mean_adv_rank(row):
    if row['account_id'] in countAccKey and row['keyphrase'] in countAccKey[row['account_id']]:
        prevCount = countAccKey[row['account_id']][row['keyphrase']]
        prevMeanRank = meanRank[row['account_id']][row['keyphrase']]
    else:
        prevCount = 0
        countAccKey[row['account_id']][row['keyphrase']] = 0
        meanRank[row['account_id']][row['keyphrase']] = row['rank']
        prevMeanRank = 0
    countAccKey[row['account_id']][row['keyphrase']]+=1
    meanRank[row['account_id']][row['keyphrase']] = \
    (meanRank[row['account_id']][row['keyphrase']]*prevCount+row['rank'])/countAccKey[row['account_id']][row['keyphrase']]

    return prevMeanRank

In [10]:
def mean_adv_impr(row):
    if row['account_id'] in countAccKey and row['keyphrase'] in countAccKey[row['account_id']]:
        prevCount = countAccKey[row['account_id']][row['keyphrase']]
        countAccKey[row['account_id']][row['keyphrase']]+=1
    else:
        prevCount = 0
        countAccKey[row['account_id']][row['keyphrase']] = 1
        meanImpr[row['account_id']][row['keyphrase']] = row['impr']
    meanImpr[row['account_id']][row['keyphrase']] = \
    (meanImpr[row['account_id']][row['keyphrase']]*prevCount+row['impr'])/countAccKey[row['account_id']][row['keyphrase']]

    return meanImpr[row['account_id']][row['keyphrase']]

In [11]:
def mean_adv_click(row):
    if row['account_id'] in countAccKey and row['keyphrase'] in countAccKey[row['account_id']]:
        prevCount = countAccKey[row['account_id']][row['keyphrase']]
        countAccKey[row['account_id']][row['keyphrase']]+=1
    else:
        prevCount = 0
        countAccKey[row['account_id']][row['keyphrase']] = 1
        meanClick[row['account_id']][row['keyphrase']] = row['clicks']
    meanClick[row['account_id']][row['keyphrase']] = \
    (meanClick[row['account_id']][row['keyphrase']]*prevCount+row['clicks'])/countAccKey[row['account_id']][row['keyphrase']]

    return meanClick[row['account_id']][row['keyphrase']]

In [12]:
def mean_adv_bid(row):
    if row['account_id'] in countAccKey and row['keyphrase'] in countAccKey[row['account_id']]:
        prevCount = countAccKey[row['account_id']][row['keyphrase']]
        countAccKey[row['account_id']][row['keyphrase']]+=1
    else:
        prevCount = 0
        countAccKey[row['account_id']][row['keyphrase']] = 1
        meanBid[row['account_id']][row['keyphrase']] = row['avg_bid']
    meanBid[row['account_id']][row['keyphrase']] = \
    (meanBid[row['account_id']][row['keyphrase']]*prevCount+row['avg_bid'])/countAccKey[row['account_id']][row['keyphrase']]

    return meanBid[row['account_id']][row['keyphrase']]

In [13]:
def std_adv_rank(row):
    if row['account_id'] in countAccKey and row['keyphrase'] in countAccKey[row['account_id']]:
        prevCount = countAccKey[row['account_id']][row['keyphrase']]
        prevMeanRank = meanRank[row['account_id']][row['keyphrase']]
    else:
        prevCount = 0
        countAccKey[row['account_id']][row['keyphrase']] = 0
        meanRank[row['account_id']][row['keyphrase']] = row['rank']
        prevMeanRank = 0
    countAccKey[row['account_id']][row['keyphrase']]+=1
    sdRank = (((row['rank'] - prevMeanRank)**2)/countAccKey[row['account_id']][row['keyphrase']])**(1/2)
    meanRank[row['account_id']][row['keyphrase']] = \
    (meanRank[row['account_id']][row['keyphrase']]*prevCount+row['rank'])/countAccKey[row['account_id']][row['keyphrase']]
    return sdRank

In [14]:
def std_adv_impr(row):
    if row['account_id'] in countAccKey and row['keyphrase'] in countAccKey[row['account_id']]:
        prevCount = countAccKey[row['account_id']][row['keyphrase']]
        countAccKey[row['account_id']][row['keyphrase']]+=1
    else:
        prevCount = 0
        countAccKey[row['account_id']][row['keyphrase']] = 1
        meanImpr[row['account_id']][row['keyphrase']] = row['impr']
    meanImpr[row['account_id']][row['keyphrase']] = \
    (meanImpr[row['account_id']][row['keyphrase']]*prevCount+row['impr'])/countAccKey[row['account_id']][row['keyphrase']]
    sdImpr = (((row['impr'] - meanImpr[row['account_id']][row['keyphrase']])**2)/countAccKey[row['account_id']][row['keyphrase']])**(1/2)
    return sdImpr

In [15]:
def std_adv_click(row):
    if row['account_id'] in countAccKey and row['keyphrase'] in countAccKey[row['account_id']]:
        prevCount = countAccKey[row['account_id']][row['keyphrase']]
        countAccKey[row['account_id']][row['keyphrase']]+=1
    else:
        prevCount = 0
        countAccKey[row['account_id']][row['keyphrase']] = 1
        meanClick[row['account_id']][row['keyphrase']] = row['clicks']
    meanClick[row['account_id']][row['keyphrase']] = \
    (meanClick[row['account_id']][row['keyphrase']]*prevCount+row['clicks'])/countAccKey[row['account_id']][row['keyphrase']]
    sdClicks = (((row['clicks'] - meanClick[row['account_id']][row['keyphrase']])**2)/countAccKey[row['account_id']][row['keyphrase']])**(1/2)
    return sdClicks

In [16]:
def pulsing_strength(row):
    if row['account_id'] in countAccKey and row['keyphrase'] in countAccKey[row['account_id']]:
        prevCount = countAccKey[row['account_id']][row['keyphrase']]
        countAccKey[row['account_id']][row['keyphrase']]+=1
        prevBidMean = meanBid[row['account_id']][row['keyphrase']]
        prevBid = prevBidDict[row['account_id']][row['keyphrase']]
    else:
        prevCount = 0
        countAccKey[row['account_id']][row['keyphrase']] = 1
        meanBid[row['account_id']][row['keyphrase']] = row['avg_bid']
        prevBidMean = 0
        prevBid = 0
        
    prevBidDict[row['account_id']][row['keyphrase']] = row['avg_bid']
    meanBid[row['account_id']][row['keyphrase']] = \
    (meanBid[row['account_id']][row['keyphrase']]*prevCount+row['avg_bid'])/countAccKey[row['account_id']][row['keyphrase']]
    diff = row['avg_bid']- prevBid

    return (diff)*(1./meanBid[row['account_id']][row['keyphrase']])

In [17]:
def prev_rank(row):
    if row['account_id'] in prevRank and row['keyphrase'] in prevRank[row['account_id']]:
        prev_rank = prevRank[row['account_id']][row['keyphrase']]
    else:
        prev_rank = 0
    prevRank[row['account_id']][row['keyphrase']] = row['rank']
    return prev_rank

=======================================================================================================================

# Load Modified DataFrame

In [18]:
ydata = pd.read_pickle('ysqueezed_topKp_prot2.pkl')

# Extract Features 

In [19]:
# Mean rank of advertiser/keyphrase pair in history
from collections import defaultdict
meanRank = defaultdict(dict)
countAccKey = defaultdict(dict)
ydata['mean_rank'] = ydata.apply(lambda row: mean_adv_rank(row), axis=1)

In [20]:
# Mean impression of advertiser/keyphrase pair in history
meanImpr = defaultdict(dict)
countAccKey = defaultdict(dict)
ydata['mean_impr'] = ydata.apply(lambda row: mean_adv_impr(row), axis=1)

In [21]:
# Mean click count of advertiser/keyphrase pair in history
meanClick = defaultdict(dict)
countAccKey = defaultdict(dict)
ydata['mean_click'] = ydata.apply(lambda row: mean_adv_click(row), axis=1)

In [22]:
# Standard deviation in rank of advertiser/keyphrase pair in history
meanRank = defaultdict(dict)
countAccKey = defaultdict(dict)
ydata['std_rank'] = ydata.apply(lambda row: std_adv_rank(row), axis=1)

In [23]:
# Standard deviation in impression of advertiser/keyphrase pair in history
meanImpr = defaultdict(dict)
countAccKey = defaultdict(dict)
ydata['std_impr'] = ydata.apply(lambda row: std_adv_impr(row), axis=1)

In [24]:
# Standard deviation in click of advertiser/keyphrase pair in history
meanClick = defaultdict(dict)
countAccKey = defaultdict(dict)
ydata['std_click'] = ydata.apply(lambda row: std_adv_click(row), axis=1)

In [25]:
# Standard deviation in click of advertiser/keyphrase pair in history
meanBid = defaultdict(dict)
countAccKey = defaultdict(dict)
ydata['mean_bid'] = ydata.apply(lambda row: mean_adv_bid(row), axis=1)

In [26]:
# Pulsing Strength
prevBidDict = defaultdict(dict)
meanBid = defaultdict(dict)
countAccKey = defaultdict(dict)
ydata['pulsing_strength'] = ydata.apply(lambda row: pulsing_strength(row), axis=1)

In [27]:
# Prev Day Rank
prevRank = defaultdict(dict)
ydata['prev_rank'] = ydata.apply(lambda row: prev_rank(row), axis=1)

In [28]:
# Compute Click through rate
ydata['CTR'] = ydata.apply(lambda row: row['clicks']/row['impr'], axis=1)

In [224]:
# Number of keywords in keyphrase
ydata['num_keywords'] = ydata.apply(lambda row: len(row['keyphrase'].split()), axis=1)

In [225]:
# Number of competitors for keyphrase
kp_comp_count = ydata.groupby('keyphrase').count()
ydata['num_keyphrase_competitors'] = ydata.apply(lambda row: kp_comp_count['day'][row['keyphrase']], axis=1)

In [298]:
# Deviation from average bid for keyphrase
kp_comp_mean = ydata.groupby('keyphrase').mean()
ydata['keyphrase_bid_deviation'] = ydata.apply(lambda row: (row['avg_bid'] - kp_comp_mean['avg_bid'][row['keyphrase']])**2, 
                                               axis=1)

In [299]:
# Fill NaN with mean values
ydata['mean_rank'].fillna((ydata['mean_rank'].mean()), inplace=True)
ydata['mean_impr'].fillna((ydata['mean_impr'].mean()), inplace=True)
ydata['mean_click'].fillna((ydata['mean_click'].mean()), inplace=True)
ydata['std_rank'].fillna((ydata['std_rank'].mean()), inplace=True)
ydata['mean_bid'].fillna((ydata['mean_bid'].mean()), inplace=True)
ydata['pulsing_strength'].fillna((ydata['pulsing_strength'].mean()), inplace=True)
ydata['avg_bid_std'].fillna((ydata['avg_bid_std'].mean()), inplace=True)
ydata['impr_std'].fillna((ydata['impr_std'].mean()), inplace=True)
ydata['clicks_std'].fillna((ydata['clicks_std'].mean()), inplace=True)
ydata['rank_std'].fillna((ydata['rank_std'].mean()), inplace=True)

In [300]:
# Store Feature Model to pickle
ydata.to_pickle('kp_performance_df.pkl')

In [230]:
ydata = pd.read_pickle('kp_performance_df.pkl')

# Data Categorization

In [269]:
ydata['keyphrase'].unique()

aef4ee042bea9c6b fc4f04e287746c48                     34165
aef4ee042bea9c6b fc4f04e287746c48 324ac734097de5e4    20737
aef4ee042bea9c6b dbeabaf7b12face3 fc4f04e287746c48    13425
aef4ee042bea9c6b ac26bfe54a8a8f38 fc4f04e287746c48    12892
ef28801548f374f9 3db691494440189b                      7369
3db691494440189b                                       7127
88137a89999fc4c6 3db691494440189b                      6312
3db691494440189b a6a0665e425a2fbb                      5851
Name: keyphrase, dtype: int64

# Build Feature Vector

In [301]:
predictors = ["avg_bid", 
              "mean_rank", 
              "mean_impr", 
              "mean_click",
              "mean_bid",
              "pulsing_strength",
              "avg_bid_max",
              "avg_bid_min",
              "avg_bid_std",
              "prev_rank",
              "num_keywords",
              "num_keyphrase_competitors",
              "keyphrase_bid_deviation"
             ]

# Split Train, Validation and Test

In [302]:
ydata['bias'] = 1
Ydata = ydata.copy(deep=True)
Ydata['keyphrase_bid_deviation'] = np.log(Ydata['keyphrase_bid_deviation'])

In [303]:
ydata_train = Ydata[ydata['day'] < 100]
ydata_valid = Ydata[ydata['day'] >= 100]

# Build Model and Evaluate

In [284]:
alg_rank_arr = pd.DataFrame()
alg_rank_arr['index'] = np.arange(8)
alg_rank_arr.index = ydata['keyphrase'].unique()
rank_model = [LinearRegression()]*8

### Rank Prediction

In [304]:
alg_rank = LinearRegression()
# alg_rank = Lasso(alpha=0.1)
# alg_rank = ElasticNet(alpha=0.09, l1_ratio=0.7)
# alg_rank = TheilSenRegressor(random_state=42)
# alg_rank = HuberRegressor()
# alg_rank = RANSACRegressor(random_state=42)
# alg_rank = Ridge(alpha=1.0)
# alg_rank = RandomForestRegressor(random_state=0, n_estimators=100)
# alg_rank = ExtraTreesRegressor(n_estimators=10, max_features=10,random_state=0)
# alg_rank = KNeighborsRegressor()
# alg_rank = DecisionTreeRegressor()
# alg_rank = BaggingRegressor(DecisionTreeRegressor())
# rng = np.random.RandomState(1)
# alg_rank = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
# alg_rank = GradientBoostingRegressor(n_estimators=500, max_depth=4, min_samples_split=2, learning_rate=0.01, loss= 'ls')

In [305]:
alg_rank.fit(ydata_train[predictors], ydata_train['rank'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [306]:
predictions_rank = alg_rank.predict(ydata_valid[predictors])

In [308]:
mae_rank = mean_absolute_error(ydata_valid['rank'], predictions_rank)
print ("Mean Absolute Error : " + str(mae_rank))

Mean Absolute Error : 2.64387138959


In [313]:
# for kp in ydata['keyphrase'].unique():
#     ykp_train =  ydata_train[ydata_train['keyphrase'] == kp]
#     ykp_valid =  ydata_valid[ydata_valid['keyphrase'] == kp]
#     alg_rank = GradientBoostingRegressor(n_estimators=500, max_depth=4, min_samples_split=2, learning_rate=0.01, loss= 'ls')
#     alg_rank.fit(ykp_train[predictors], ykp_train['rank'])
#     predictions_rank = alg_rank.predict(ykp_valid[predictors])
#     mae_rank = mean_absolute_error(ykp_valid['rank'], predictions_rank)
#     print ("Keyphrase : " + str(kp))
#     print ("Mean Absolute Error : " + str(mae_rank))
#     print "\n\n"

### Impression Prediction

In [206]:
# alg_impr = LinearRegression()
# alg_impr = Lasso(alpha=0.07)
# alg_impr = ElasticNet(alpha=0.09, l1_ratio=0.7)
# alg_impr = TheilSenRegressor(random_state=42)
# alg_impr = HuberRegressor()
# alg_impr = RANSACRegressor(random_state=42)
# alg_impr = Ridge(alpha=1.0)
# alg_impr = RandomForestRegressor(random_state=0, n_estimators=100)
# alg_impr = ExtraTreesRegressor(n_estimators=10, max_features=10,random_state=0)
# alg_impr = KNeighborsRegressor()
# alg_impr = DecisionTreeRegressor()
# alg_impr = BaggingRegressor(DecisionTreeRegressor())
# alg_impr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
alg_impr = GradientBoostingRegressor(n_estimators=500, max_depth=4, min_samples_split=2, learning_rate=0.01, loss= 'ls')

In [207]:
alg_impr.fit(ydata_train[predictors], ydata_train['impr'])

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=500,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [267]:
predictions_impr = alg_impr.predict(ydata_valid[predictors])

In [None]:
mae_impr = mean_absolute_error(ydata_valid['impr'], predictions_impr)
print ("Mean Absolute Error : " + str(mae_impr))

### Click Prediction

In [210]:
# alg_click = LinearRegression()
# alg_click = Lasso(alpha=0.5)
# alg_click = ElasticNet(alpha=0.1, l1_ratio=0.7)
# alg_click = TheilSenRegressor(random_state=42)
# alg_click = HuberRegressor()
# alg_click = RANSACRegressor(random_state=42)
# alg_click = Ridge(alpha=1.0)
# alg_click = RandomForestRegressor(random_state=0, n_estimators=100)
# alg_click = ExtraTreesRegressor(n_estimators=10, max_features=10,random_state=0)
# alg_click = KNeighborsRegressor()
# alg_click = DecisionTreeRegressor()
# alg_click = BaggingRegressor(DecisionTreeRegressor())
# alg_click = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
alg_click = GradientBoostingRegressor(n_estimators=500, max_depth=4, min_samples_split=2, learning_rate=0.01, loss= 'ls')

In [211]:
alg_click.fit(ydata_train[predictors], ydata_train['clicks'])

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=500,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [212]:
predictions_click = alg_click.predict(ydata_valid[predictors])

In [213]:
mae_click = mean_absolute_error(ydata_valid['clicks'], predictions_click)
print ("Mean Absolute Error : " + str(mae_click))

Mean Absolute Error : 5.07067341783


### CTR Prediction

In [309]:
alg_ctr = LinearRegression()
# alg_ctr = Lasso(alpha=0.25)
# alg_ctr = ElasticNet(alpha=0.08, l1_ratio=0.9)
# algt_ctr = TheilSenRegressor(random_state=42)
# alg_ctr = HuberRegressor()
# alg_ctr = RANSACRegressor(random_state=42)
# alg_ctr = Ridge(alpha=1.0)
# alg_ctr = RandomForestRegressor(random_state=0, n_estimators=100)
# alg_ctr = ExtraTreesRegressor(n_estimators=10, max_features=10,random_state=0)
# alg_ctr = KNeighborsRegressor()
# alg_ctr = DecisionTreeRegressor()
# alg_ctr = BaggingRegressor(DecisionTreeRegressor())
# alg_ctr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
# alg_ctr = GradientBoostingRegressor(n_estimators=500, max_depth=4, min_samples_split=2, learning_rate=0.01, loss= 'ls')

In [310]:
alg_ctr.fit(ydata_train[predictors], ydata_train['CTR'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [311]:
predictions_ctr = alg_ctr.predict(ydata_valid[predictors])

In [312]:
mae_ctr = mean_absolute_error(ydata_valid['CTR'], predictions_ctr)
print ("Mean Absolute Error : " + str(mae_ctr))

Mean Absolute Error : 0.010934250553


======================================================================================================