In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import datetime as dt

from sklearn import preprocessing, model_selection, metrics

import lightgbm as lgb

import matplotlib.pyplot as plt
%matplotlib inline

from pathlib import Path
import os
# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort
# that can occur when several native libraries are loaded - confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
# Show (up to) 1000 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 1000)

In [2]:
# Directory (relative to the notebook's working directory) holding train.csv / test.csv.
DATA_PATH = Path("data")

In [3]:
# Load both splits the same way; impression_time is parsed to datetime on read.
train, test = (
    pd.read_csv(DATA_PATH / csv_name, parse_dates=["impression_time"])
    for csv_name in ("train.csv", "test.csv")
)

In [4]:
# Tag every row with its origin (1 = train, 0 = test) so the two splits can be
# stacked for joint feature engineering and separated again afterwards.
for split_frame, split_flag in ((train, 1), (test, 0)):
    split_frame["is_train"] = split_flag

panel = pd.concat(objs=[train, test], ignore_index=True, sort=False)

In [5]:
# Preview the first rows of the stacked train + test frame.
panel.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click,is_train
0,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,old,0,0.0,1
1,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1.0,1
2,70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,intermediate,1,0.0,1
3,8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,latest,1,0.0,1
4,182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,latest,0,0.0,1


In [6]:
# Ordinal encoding of the OS version label: old -> 0, intermediate -> 1, latest -> 2.
# Labels missing from the mapper become NaN (Series.map semantics).
os_version_mapper = {
    label: rank for rank, label in enumerate(("old", "intermediate", "latest"))
}
panel["os_version"] = panel["os_version"].map(os_version_mapper)

In [7]:
# Calendar features derived from the impression timestamp.
impression_dt = panel["impression_time"].dt
panel["day"] = impression_dt.day
panel["dayofweek"] = impression_dt.dayofweek
panel["hour"] = impression_dt.hour

In [8]:
# Frequency features: for each key (or key combination) store how many
# impressions share that key. `panel` holds train and test rows together, so
# the counts are taken over both splits.
#
# groupby(...).transform("count") writes the per-group count straight back onto
# each row. Compared with the previous groupby -> merge round trip this yields
# the same values, avoids copying the frame, and makes the cell safe to re-run:
# the column is overwritten instead of being merged in again with _x/_y suffixes.
for col in ["user_id", "app_code",
            ["user_id", "app_code"], ["app_code", "os_version"]]:
    keys = col if isinstance(col, list) else [col]
    col_name = "_".join(keys)
    panel[col_name + "_count"] = (
        panel.groupby(keys)["impression_id"].transform("count")
    )

In [9]:
# Preview the frame again, now including the calendar and count features.
panel.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click,is_train,day,dayofweek,hour,user_id_count,app_code_count,user_id_app_code_count,app_code_os_version_count
0,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,0,0,0.0,1,15,3,0,3,535,3,228
1,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,2,1,1.0,1,15,3,0,61,538,50,226
2,70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,1,1,0.0,1,15,3,0,13,418,13,145
3,8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,2,1,0.0,1,15,3,0,4,5529,4,2671
4,182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,2,0,0.0,1,15,3,0,16,1294,16,734


In [10]:
# Every column except identifiers, the raw timestamp, the target and the split
# flag is a model feature. Sorting keeps the feature order deterministic.
non_feature_columns = {"impression_id", "impression_time", "is_click", "is_train"}
columns_for_model = sorted(set(panel.columns) - non_feature_columns)

# Undo the train/test stacking using the is_train flag.
train_rows = panel.loc[panel["is_train"] == 1].reset_index(drop=True)
test_rows = panel.loc[panel["is_train"] == 0].reset_index(drop=True)

train_y = train_rows["is_click"].values
test_ids = test_rows["impression_id"].values

train_X = train_rows[columns_for_model]
test_X = test_rows[columns_for_model]
print(train_X.shape, test_X.shape)

(237609, 11) (90675, 11)


In [11]:
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, dep=8, seed=0, data_leaf=511, hessian_leaf=50):
    """Train a LightGBM binary classifier and score up to two feature sets.

    Parameters
    ----------
    train_X, train_y
        Training features and binary labels.
    test_X
        Features to predict on. When ``test_y`` is given they also serve as the
        validation set that drives early stopping.
    test_y : optional
        Labels for ``test_X``. If provided, training monitors AUC on
        ``(test_X, test_y)`` and stops after 200 rounds without improvement;
        the ROC AUC of the final predictions is printed and returned.
        If omitted, the model is trained for the full ``num_rounds`` rounds.
    test_X2 : optional
        A second feature set to score with the same model (e.g. the hold-out
        test set while ``test_X`` is a validation fold).
    dep
        Currently unused: the ``max_depth`` setting it fed is disabled. Kept so
        existing callers that pass it keep working.
    seed
        Seed used for both feature sub-sampling and bagging.
    data_leaf, hessian_leaf
        Values for ``min_data_in_leaf`` and ``min_sum_hessian_in_leaf``.

    Returns
    -------
    tuple
        ``(model, loss, pred_test_y, pred_test_y2)`` where ``loss`` is the ROC
        AUC on ``test_X`` (``0`` when ``test_y`` is None) and ``pred_test_y2``
        is ``None`` when ``test_X2`` is not supplied.
    """
    params = {
        "objective": "binary",
        "metric": "auc",
        # "max_depth": dep,  # intentionally disabled; tree size is bounded by num_leaves
        "num_leaves": 31,
        "min_data_in_leaf": data_leaf,
        "min_sum_hessian_in_leaf": hessian_leaf,
        "learning_rate": 0.01,
        "bagging_fraction": 0.8,
        "feature_fraction": 0.8,
        "feature_fraction_seed": seed,
        "bagging_freq": 5,
        "bagging_seed": seed,
        "lambda_l2": 0.95,
        "lambda_l1": 0.95,
        "verbosity": -1,
    }
    # Upper bound on boosting rounds; early stopping normally ends training sooner.
    num_rounds = 20000

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
        # the LightGBM version this notebook was run with; newer releases may expect
        # callbacks instead - confirm against the installed version.
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=200, verbose_eval=500)
    else:
        # No labels -> no validation set. (The former `lgb.DMatrix(test_X)` call was
        # removed: DMatrix is not part of LightGBM and raised AttributeError here.)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    # Only score the second set when one was actually passed in.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)
        print(loss)
    return model, loss, pred_test_y, pred_test_y2

In [12]:
print("Building model..")
n_splits = 3
model_name = "lgb"
kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=7988)

# One entry per LightGBM configuration blended inside every fold; each dict is
# passed to runLGB as keyword arguments. Uncomment entries to average more models.
lgb_configs = [
    {"seed": 2019},
    # {"data_leaf": 450, "hessian_leaf": 30, "seed": 9873},
    # {"data_leaf": 600, "hessian_leaf": 70, "seed": 4568},
]

cv_scores = []
pred_train = np.zeros(train_X.shape[0])  # out-of-fold predictions for the train rows
pred_test_full = 0                       # test predictions averaged over the folds
for dev_index, val_index in kf.split(train_X, train_y):
    dev_X, dev_y = train_X.iloc[dev_index, :], train_y[dev_index]
    val_X, val_y = train_X.iloc[val_index, :], train_y[val_index]

    # Accumulate the predictions of every configured model for this fold.
    pred_val = 0
    pred_test = 0
    n_models = 0.
    for lgb_kwargs in lgb_configs:
        model, loss, pred_v, pred_t = runLGB(dev_X, dev_y, val_X, val_y, test_X, **lgb_kwargs)
        pred_val += pred_v
        pred_test += pred_t
        n_models += 1

    pred_val /= n_models
    pred_test /= n_models

    # Score the blended validation prediction rather than a single model's.
    loss = metrics.roc_auc_score(val_y, pred_val)
    cv_scores.append(loss)

    pred_train[val_index] = pred_val
    pred_test_full += pred_test / n_splits
print(np.mean(cv_scores))

Building model..
Training until validation scores don't improve for 200 rounds.
[500]	valid_0's auc: 0.698
[1000]	valid_0's auc: 0.700417
Early stopping, best iteration is:
[974]	valid_0's auc: 0.700604
0.7006043899702853
Training until validation scores don't improve for 200 rounds.
[500]	valid_0's auc: 0.695455
[1000]	valid_0's auc: 0.697236
[1500]	valid_0's auc: 0.698673
[2000]	valid_0's auc: 0.699716
Early stopping, best iteration is:
[2185]	valid_0's auc: 0.700109
0.7001091295303701
Training until validation scores don't improve for 200 rounds.
[500]	valid_0's auc: 0.69372
[1000]	valid_0's auc: 0.695909
[1500]	valid_0's auc: 0.697625
[2000]	valid_0's auc: 0.698348
Early stopping, best iteration is:
[1985]	valid_0's auc: 0.698418
0.6984176119415777
0.699710377147411
