In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import os 
from sklearn.metrics import confusion_matrix, auc
from sklearn import metrics
from xgboost import plot_importance




In [2]:
PATH = "input"

In [3]:
!ls {PATH}

sample_submission.csv	   test_supplement.csv	    train_sample.csv
sample_submission.csv.zip  test_supplement.csv.zip  train_sample.csv.zip
test.csv		   train.csv
test.csv.zip		   train.csv.zip


In [4]:
def dataPreProcessTime(df):
    df['click_time'] = pd.to_datetime(df['click_time']).dt.date
    df['click_time'] = df['click_time'].apply(lambda x: x.strftime('%d%H%M')).astype(int)
    
    return df

In [5]:
start_time = time.time()

print("reading data")
train = pd.read_csv(PATH+"/train.csv", skiprows=147403891, nrows=37500000) 
# nrows in "train.csv" is: 184903891
train.columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time', 'is_attributed']
test = pd.read_csv(PATH+"/test.csv")



reading data


  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
print('[{}] Finished to load data'.format(time.time() - start_time))

train = dataPreProcessTime(train)
test = dataPreProcessTime(test)

[88.64120650291443] Finished to load data


In [7]:
!ls

input		sub_xgb_hist_pos_weight.csv.gz	xgboost.ipynb
lightgbm.ipynb	tree.ipynb			xgboost_outout.ipynb


In [8]:
y = train['is_attributed']
train.drop(['ip', 'is_attributed', 'attributed_time'], axis=1, inplace=True)

sub = pd.DataFrame()
sub['click_id'] = test['click_id']
test.drop(['ip', 'click_id'], axis=1, inplace=True)

In [9]:
print("training set | data types: ")
print(train.info())
print("train size: ", len(train))
print("test size: ", len(test))

print('[{}] Start XGBoost Training'.format(time.time() - start_time))

training set | data types: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37499999 entries, 0 to 37499998
Data columns (total 5 columns):
app           int64
device        int64
os            int64
channel       int64
click_time    int64
dtypes: int64(5)
memory usage: 1.4 GB
None
train size:  37499999
test size:  18790469
[262.17932391166687] Start XGBoost Training


In [None]:
params = {'eta': 0.6, 
          'tree_method': "hist",      # Fast histogram optimized approximate greedy algorithm. 
          'grow_policy': "lossguide", # split at nodes with highest loss change
          'max_leaves': 1400,         # Maximum number of nodes to be added. (for lossguide grow policy | custom number: 2^max_depth(~10.5))
          'max_depth': 0,             # 0 means no limit (useful only for depth wise grow policy)
          'subsample': 0.9,           
          'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,
          'min_child_weight':0,       # The larger, the more conservative the algorithm will be
          'alpha':4,
          'objective': 'binary:logistic', 
      #    'scale_pos_weight':9,       # because training data is extremely unbalanced 
          'eval_metric': 'auc', 
          'nthread':8,
          'random_state': 84, 
          'silent': True}

          
x1, x2, y1, y2 = train_test_split(train, y, test_size=0.1, random_state=84)

# watch list to observe the change in error in training and validation data
watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]



In [None]:
x1.head()

In [None]:
params = {
          'objective': 'binary:logistic', 
          'eval_metric': 'auc', 
          'random_state': 84, 
          'silent': True}



In [None]:
?xgb.train

In [None]:
model = xgb.train(params, xgb.DMatrix(x1, y1), num_boost_round=50, evals=watchlist, maximize=True, 
                  early_stopping_rounds = 10, verbose_eval=1)

print('[{}] Finish XGBoost Training'.format(time.time() - start_time))



In [None]:
sub['is_attributed'] = model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit)
sub.to_csv('sub_xgb_hist_pos_weight.csv.gz',index=False,compression="gzip")

In [None]:
len(train)

In [None]:
len(train.columns)

In [None]:
train.shape

In [None]:
train.head()

In [None]:
y.value_counts()

In [None]:
?model.predict

In [None]:
y_pred = model.predict(xgb.DMatrix(x2), ntree_limit=model.best_ntree_limit)

In [None]:
y_pred

In [None]:
cm = confusion_matrix(y2, (y_pred>0.5))
cm

In [None]:
(3721139+6527)/ (len(y2))

In [None]:
(3721139)/ (len(y2))

In [None]:
baseline_predictions = np.zeros(len(y2))

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y2, baseline_predictions)
metrics.auc(fpr, tpr)


In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y2, y_pred)
metrics.auc(fpr, tpr)

In [None]:
?metrics.roc_curve

In [None]:
plot_importance(model)

In [None]:
?plot_importance

In [None]:
model.get_split_value_histogram()

In [None]:
import matplotlib.pyplot as plt


In [None]:
fig, ax = plt.subplots(figsize=(800, 800))
xgb.plot_tree(model, num_trees=4, ax=ax)
plt.show()