# CTR Prediction

	https://www.kaggle.com/c/avazu-ctr-prediction/data

## File descriptions
**train** - Training set. 10 days of click-through data, ordered chronologically. Non-clicks and clicks are subsampled according to different strategies. 

	https://www.kaggle.com/c/avazu-ctr-prediction/download/train.gz

**test** - Test set. 1 day of ads for testing your model predictions. 

	https://www.kaggle.com/c/avazu-ctr-prediction/download/test.gz

**sampleSubmission.csv** - Sample submission file in the correct format, corresponds to the All-0.5 Benchmark. 

	https://www.kaggle.com/c/avazu-ctr-prediction/download/sampleSubmission.gz

## Data fields
- **id**: ad identifier
- **click**: 0/1 for non-click/click
- **hour**: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.
- **C1**: anonymized categorical variable
- **banner_pos**, **site_id**, **site_domain**, **site_category**, **app_id**, **app_domain**, **app_category**, **device_id**, **device_ip**, **device_model**, **device_type**, **device_conn_type**
- **C14-C21**: anonymized categorical variables

# Load Data

In [1]:
import pandas as pd


# File locations used by this notebook (training input, test input, submission output).
train_filename, test_filename, submission_filename = (
    "../../data/avazu_train.csv",
    "test.csv",
    "submit.csv",
)

# Read the training CSV into a DataFrame for the exploration cells below.
training_set = pd.read_csv(train_filename)

# Explore Data

In [2]:
# Preview the first ten rows of the training frame.
training_set.head(n=10)

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1000009418151094273,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,10020838610941394934,0,14102100,1005,1,43d6df75,27e3c518,28905ebd,ecad2386,7801e8d9,...,1,0,15708,320,50,1722,0,35,100084,79
2,10045110368483144711,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,...,1,0,19743,320,50,2264,3,427,100000,61
3,10064943621928487558,0,14102100,1005,0,543a539e,c7ca3108,3e814130,ecad2386,7801e8d9,...,1,0,20366,320,50,2333,0,39,-1,157
4,10086824143525178922,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,e2fcccd2,5c5a694b,...,1,0,20633,320,50,2374,3,39,-1,23
5,10105704546041534334,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15703,320,50,1722,0,35,100083,79
6,10128683006009564891,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,2f6efcf2,813f3323,...,1,0,21611,320,50,2480,3,297,100111,61
7,10150890087768749283,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15708,320,50,1722,0,35,-1,79
8,10173911685056615297,0,14102100,1005,0,3a66a5a5,9e328a4d,f028772b,ecad2386,7801e8d9,...,1,0,19666,300,250,2253,2,303,100026,52
9,10196468059344865330,1,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,3c4b944d,2347f47a,...,1,0,20751,320,50,1895,0,681,100028,101


In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
training_set.describe()

Unnamed: 0,id,click,hour,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
count,216692.0,216692.0,216692.0,216692.0,216692.0,216692.0,216692.0,216692.0,216692.0,216692.0,216692.0,216692.0,216692.0,216692.0,216692.0
mean,9.221064e+18,0.171515,14102460.0,1004.96957,0.287699,1.016415,0.328291,18406.131283,318.785557,60.349805,2058.21495,1.421414,222.876405,53566.933592,80.919383
std,5.330037e+18,0.37696,242.5355,1.106036,0.510004,0.532424,0.855927,4995.659636,21.973595,48.003972,610.473761,1.322261,351.320225,49933.040119,66.224921
min,31200260000000.0,0.0,14102100.0,1001.0,0.0,0.0,0.0,375.0,120.0,20.0,112.0,0.0,33.0,-1.0,13.0
25%,4.603496e+18,0.0,14102220.0,1005.0,0.0,1.0,0.0,16615.0,320.0,50.0,1800.0,0.0,35.0,-1.0,23.0
50%,9.217328e+18,0.0,14102420.0,1005.0,0.0,1.0,0.0,20108.0,320.0,50.0,2281.0,2.0,39.0,100051.0,61.0
75%,1.383939e+19,0.0,14102700.0,1005.0,1.0,1.0,0.0,21767.0,320.0,50.0,2502.0,3.0,167.0,100084.0,95.0
max,1.844674e+19,1.0,14102900.0,1012.0,7.0,5.0,5.0,23507.0,1024.0,1024.0,2688.0,3.0,1839.0,100248.0,221.0


In [5]:
# id: ad identifier
# click: 0/1 for non-click/click
# hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.
# C1 -- anonymized categorical variable
# banner_pos
# site_id
# site_domain
# site_category
# app_id
# app_domain
# app_category
# device_id
# device_ip
# device_model
# device_type
# device_conn_type
# C14-C21 -- anonymized categorical variables
from sklearn.externals import joblib
from sklearn.model_selection import cross_validate
#from sklearn import module_selection
from matplotlib import pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from utils import load_df

In [6]:
# Evaluation metrics
def print_metrics(true_values, predicted_values):
    """Print accuracy, ROC AUC, the confusion matrix and a classification report.

    Args:
        true_values: Ground-truth labels.
        predicted_values: Predictions in the same order as ``true_values``.
            The same array is handed to every metric below.

    Returns:
        None. Everything is written to standard output.
    """
    print("Accuracy: ", metrics.accuracy_score(true_values, predicted_values))
    # NOTE(review): roc_auc_score is fed the same values as the other metrics.
    # When those are hard 0/1 labels rather than scores/probabilities the AUC
    # carries little ranking information -- confirm this is intended.
    print("AUC: ", metrics.roc_auc_score(true_values, predicted_values))
    # Pass the matrix as its own print() argument; no unary "+" is needed.
    print("Confusion Matrix: ", metrics.confusion_matrix(true_values, predicted_values))
    print(metrics.classification_report(true_values, predicted_values))

# Fit a classifier
def classify(classifier_class, train_input, train_targets, **classifier_params):
    """Instantiate ``classifier_class`` and fit it on the training data.

    Args:
        classifier_class: Callable (typically an estimator class) that returns
            an object exposing ``fit(X, y)``.
        train_input: Training features, passed as the first argument to ``fit``.
        train_targets: Training targets, passed as the second argument to ``fit``.
        **classifier_params: Optional keyword arguments forwarded to
            ``classifier_class``. When omitted, the estimator is built with its
            defaults, exactly as before.

    Returns:
        The fitted estimator instance.
    """
    classifier_object = classifier_class(**classifier_params)
    classifier_object.fit(train_input, train_targets)
    return classifier_object

# Persist a model
def save_model(clf, filename='classifier.pkl'):
    """Serialize ``clf`` to disk with joblib.

    Args:
        clf: The object to persist (here, a fitted classifier).
        filename: Destination path. Defaults to ``'classifier.pkl'``, the
            location this function has always written to.
    """
    joblib.dump(clf, filename)

In [7]:
# Load the training data through the project-local `load_df` helper and keep
# only the underlying array. Reuses `train_filename` from the setup cell
# instead of repeating the path literal, so the two cannot drift apart.
# NOTE(review): the displayed array suggests column 0 is `click` -- confirm
# against utils.load_df.
train_data = load_df(train_filename).values

In [8]:
# Display the loaded array (full slice over both axes).
train_data[:, :]

array([[       0, 14102100,     1005, ...,       35,       -1,       79],
       [       0, 14102100,     1005, ...,       35,   100084,       79],
       [       0, 14102100,     1005, ...,      427,   100000,       61],
       ...,
       [       0, 14102901,     1005, ...,      167,       -1,      221],
       [       1, 14102901,     1005, ...,       35,   100084,       79],
       [       0, 14102901,     1002, ...,     1327,   100188,       52]])

In [9]:
# Train the model and persist it
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_data[:, 1:],  # features: every column after the first
    train_data[:, 0],   # target: the first column
    test_size=0.3,
    random_state=0,
)

# NOTE(review): the recorded output for this cell shows every held-out row
# predicted as class 0 (AUC 0.5). The raw feature columns probably need
# encoding/scaling before logistic regression -- TODO confirm.
classifier = classify(LogisticRegression, X_train, y_train)
predictions = classifier.predict(X_test)
print_metrics(y_test, predictions)
save_model(classifier)



Accuracy:  0.8296363524489294
AUC:  0.5
Confusion Matrix:  [[53933     0]
 [11075     0]]
              precision    recall  f1-score   support

           0       0.83      1.00      0.91     53933
           1       0.00      0.00      0.00     11075

   micro avg       0.83      0.83      0.83     65008
   macro avg       0.41      0.50      0.45     65008
weighted avg       0.69      0.83      0.75     65008



  'precision', 'predicted', average, warn_for)


In [10]:
# Write the results in the required submission format
def create_submission(ids, predictions, filename='submission.csv'):
    """Write an ``id,click`` CSV pairing each id with its prediction.

    The two columns are kept as separate DataFrame columns rather than being
    concatenated into one 2-D array. Concatenation forces a single common
    dtype, which promotes unsigned 64-bit ids combined with signed integer
    predictions to float64 and silently corrupts the ids.

    Args:
        ids: Array-like of row identifiers (1-D, or shape ``(n, 1)``).
        predictions: Array-like of predictions, one per id.
        filename: Destination CSV path. Defaults to ``'submission.csv'``.

    Raises:
        ValueError: If ``ids`` and ``predictions`` differ in length.
    """
    id_column = np.asarray(ids).ravel()
    click_column = np.asarray(predictions).ravel()
    if len(id_column) != len(click_column):
        raise ValueError(
            "ids and predictions must have the same length "
            "(got %d and %d)" % (len(id_column), len(click_column))
        )
    # Dict insertion order fixes the column order: id first, then click.
    df = DataFrame({'id': id_column, 'click': click_column})
    df.to_csv(filename, index=False)

In [11]:
import numpy as np
from pandas import DataFrame

# Reload the classifier from 'classifier.pkl' (the default path written by save_model).
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
classifier = joblib.load('classifier.pkl')
# Submission pipeline, currently disabled: load the test set, predict, and
# write the submission file with create_submission.
#test_data_df = load_df('test.csv', training=False)
#ids = test_data_df.values[0:, 0]
#predictions = classifier.predict(test_data_df.values[0:, 1:])
#create_submission(ids, predictions)