In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import os 
from sklearn.metrics import confusion_matrix, auc
from sklearn import metrics
from xgboost import plot_importance




In [2]:
PATH = "input"

In [3]:
!ls {PATH}

sample_submission.csv	   test_supplement.csv	    train_sample.csv
sample_submission.csv.zip  test_supplement.csv.zip  train_sample.csv.zip
test.csv		   train.csv
test.csv.zip		   train.csv.zip


In [4]:
def dataPreProcessTime(df):
    df['click_time'] = pd.to_datetime(df['click_time']).dt.date
    df['click_time'] = df['click_time'].apply(lambda x: x.strftime('%d%H%M')).astype(int)
    
    return df

In [5]:
start_time = time.time()

print("reading data")
train = pd.read_csv(PATH+"/train.csv", skiprows=147403891, nrows=37500000) 
# nrows in "train.csv" is: 184903891
train.columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time', 'is_attributed']
test = pd.read_csv(PATH+"/test.csv")



reading data


  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
print('[{}] Finished to load data'.format(time.time() - start_time))

train = dataPreProcessTime(train)
test = dataPreProcessTime(test)

[92.60397005081177] Finished to load data


In [7]:
!ls

input		sub_xgb_hist_pos_weight.csv.gz	xgboost.ipynb
lightgbm.ipynb	tree.ipynb			xgboost_outout.ipynb


In [8]:
y = train['is_attributed']
train.drop(['ip', 'is_attributed', 'attributed_time'], axis=1, inplace=True)

sub = pd.DataFrame()
sub['click_id'] = test['click_id']
test.drop(['ip', 'click_id'], axis=1, inplace=True)

In [9]:
print("training set | data types: ")
print(train.info())
print("train size: ", len(train))
print("test size: ", len(test))

print('[{}] Start XGBoost Training'.format(time.time() - start_time))

training set | data types: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37499999 entries, 0 to 37499998
Data columns (total 5 columns):
app           int64
device        int64
os            int64
channel       int64
click_time    int64
dtypes: int64(5)
memory usage: 1.4 GB
None
train size:  37499999
test size:  18790469
[267.90895986557007] Start XGBoost Training


In [10]:
params = {'eta': 0.6, 
          'tree_method': "hist",      # Fast histogram optimized approximate greedy algorithm. 
          'grow_policy': "lossguide", # split at nodes with highest loss change
          'max_leaves': 1400,         # Maximum number of nodes to be added. (for lossguide grow policy | custom number: 2^max_depth(~10.5))
          'max_depth': 0,             # 0 means no limit (useful only for depth wise grow policy)
          'subsample': 0.9,           
          'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,
          'min_child_weight':0,       # The larger, the more conservative the algorithm will be
          'alpha':4,
          'objective': 'binary:logistic', 
      #    'scale_pos_weight':9,       # because training data is extremely unbalanced 
          'eval_metric': 'auc', 
          'nthread':8,
          'random_state': 84, 
          'silent': True}

          
x1, x2, y1, y2 = train_test_split(train, y, test_size=0.1, random_state=84)

# watch list to observe the change in error in training and validation data
watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]



In [11]:
x1.head()

Unnamed: 0,app,device,os,channel,click_time
3876180,3,1,6,280,90000
36413389,18,1,10,107,90000
18997479,9,1,36,127,90000
35793788,3,1,53,130,90000
926462,15,1,13,480,90000


In [14]:
params = {
          'objective': 'binary:logistic', 
          'eval_metric': 'auc', 
          'random_state': 84, 
          'silent': True}



In [33]:
?xgb.train

In [15]:
model = xgb.train(params, xgb.DMatrix(x1, y1), num_boost_round=50, evals=watchlist, maximize=True, 
                  early_stopping_rounds = 10, verbose_eval=1)

print('[{}] Finish XGBoost Training'.format(time.time() - start_time))



[00:06:44] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
[0]	train-auc:0.907593	valid-auc:0.90671
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 10 rounds.
[1]	train-auc:0.934969	valid-auc:0.935963
[2]	train-auc:0.935024	valid-auc:0.936117
[3]	train-auc:0.935174	valid-auc:0.936267
[4]	train-auc:0.935246	valid-auc:0.936346
[5]	train-auc:0.935386	valid-auc:0.936472
[6]	train-auc:0.935421	valid-auc:0.936492
[7]	train-auc:0.93574	valid-auc:0.936735
[8]	train-auc:0.935745	valid-auc:0.936732
[9]	train-auc:0.935862	valid-auc:0.936811
[10]	train-auc:0.936059	valid-auc:0.936956
[11]	train-auc:0.936202	valid-auc:0.937222
[12]	train-auc:0.949378	valid-auc:0.949504
[13]	train-auc:0.950767	valid-auc:0.950844
[14]	train-auc:0.951316	valid-auc:0.951342
[15]	train-auc:0.951931	valid-auc:0.951996
[16]	trai

In [16]:
sub['is_attributed'] = model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit)
sub.to_csv('sub_xgb_hist_pos_weight.csv.gz',index=False,compression="gzip")

In [17]:
len(train)

37499999

In [18]:
len(train.columns)

5

In [19]:
train.shape

(37499999, 5)

In [20]:
train.head()

Unnamed: 0,app,device,os,channel,click_time
0,3,1,18,130,90000
1,10,1,9,377,90000
2,18,1,17,121,90000
3,12,1,17,497,90000
4,3,1,13,280,90000


In [21]:
y.value_counts()

0    37405507
1       94492
Name: is_attributed, dtype: int64

In [22]:
?model.predict

In [23]:
y_pred = model.predict(xgb.DMatrix(x2), ntree_limit=model.best_ntree_limit)

In [24]:
y_pred

array([ 0.00021744,  0.13953386,  0.00043834, ...,  0.00065274,
        0.00036475,  0.00021646], dtype=float32)

In [25]:
cm = confusion_matrix(y2, (y_pred>0.5))
cm

array([[3739954,     688],
       [   7336,    2022]])

In [26]:
(3721139+6527)/ (len(y2))

0.9940442666666667

In [27]:
(3721139)/ (len(y2))

0.9923037333333333

In [28]:
baseline_predictions = np.zeros(len(y2))

In [29]:
fpr, tpr, thresholds = metrics.roc_curve(y2, baseline_predictions)
metrics.auc(fpr, tpr)


0.5

In [30]:
fpr, tpr, thresholds = metrics.roc_curve(y2, y_pred)
metrics.auc(fpr, tpr)

0.95988285729143019

In [None]:
?metrics.roc_curve

In [None]:
plot_importance(model)

In [None]:
?plot_importance

In [None]:
model.get_split_value_histogram()

In [None]:
import matplotlib.pyplot as plt


In [None]:
fig, ax = plt.subplots(figsize=(800, 800))
xgb.plot_tree(model, num_trees=4, ax=ax)
plt.show()