In [7]:

import numpy as np
import pandas as pd
from sklearn import preprocessing, metrics
import lightgbm as lgb

# Load the click data from the 'baseline_data.pqt' parquet file; every
# train/valid/test split further down is derived from this frame.
clicks = pd.read_parquet('baseline_data.pqt')

def get_data_splits(dataframe, valid_fraction=0.1):
    """Split a dataframe chronologically into train, validation, and test sets.

    The rows are first ordered by the 'click_time' column. The last
    ``valid_fraction`` of the ordered rows becomes the test set, the
    ``valid_fraction`` before that becomes the validation set, and
    everything earlier is the training set.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to split; must contain a 'click_time' column.
    valid_fraction : float, default 0.1
        Fraction of the rows placed in the validation set and, separately,
        in the test set (validation size == test size).

    Returns
    -------
    tuple of (train, valid, test) DataFrames, in 'click_time' order.
        If ``int(len(dataframe) * valid_fraction)`` is 0, validation and
        test are empty and train holds every row.
    """
    dataframe = dataframe.sort_values('click_time')
    n_rows = len(dataframe)
    valid_rows = int(n_rows * valid_fraction)

    # Use explicit split points rather than negative slices: when valid_rows
    # is 0, ``[:-0]`` is an empty slice and ``[-0:]`` is the whole frame,
    # which would hand back an empty train set and put every row in test.
    valid_start = max(n_rows - 2 * valid_rows, 0)
    test_start = max(n_rows - valid_rows, 0)

    train = dataframe.iloc[:valid_start]
    # valid size == test size, last two sections of the data
    valid = dataframe.iloc[valid_start:test_start]
    test = dataframe.iloc[test_start:]

    return train, valid, test

def train_model(train, valid, test=None, feature_cols=None):
    """Train a LightGBM binary classifier and report its validation AUC.

    Parameters
    ----------
    train, valid : pandas.DataFrame
        Training and validation data. Both must contain the label column
        'is_attributed' and every column in ``feature_cols``.
    test : pandas.DataFrame, optional
        If given, the model is also scored on this set.
    feature_cols : list-like of column names, optional
        Columns used as model inputs. Defaults to every column of ``train``
        except 'click_time', 'attributed_time' and 'is_attributed'.

    Returns
    -------
    (bst, valid_score) when ``test`` is None, otherwise
    (bst, valid_score, test_score). ``bst`` is the trained booster and the
    scores are ROC AUC values.

    Side effects
    ------------
    Prints the validation AUC. The test AUC is returned but not printed.
    """
    if feature_cols is None:
        feature_cols = train.columns.drop(['click_time', 'attributed_time',
                                           'is_attributed'])
    dtrain = lgb.Dataset(train[feature_cols], label=train['is_attributed'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['is_attributed'])

    param = {'num_leaves': 64, 'objective': 'binary',
             'metric': 'auc', 'seed': 7}
    num_round = 1000
    # Early stopping is requested through the callback API. The
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments were
    # deprecated in LightGBM 3.3 and removed from lgb.train() in 4.0, so
    # passing them raises TypeError on current releases. verbose=False keeps
    # the callback quiet, and no evaluation-logging callback is registered,
    # so per-iteration metrics are not printed on current releases.
    bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid],
                    callbacks=[lgb.early_stopping(stopping_rounds=20,
                                                  verbose=False)])

    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid['is_attributed'], valid_pred)
    print(f"Validation AUC score: {valid_score}")

    if test is not None:
        test_pred = bst.predict(test[feature_cols])
        test_score = metrics.roc_auc_score(test['is_attributed'], test_pred)
        return bst, valid_score, test_score
    else:
        return bst, valid_score
    
# Baseline run: split the raw clicks and train with no encoded features
# added, giving a reference validation AUC for the encoder experiments.
print("Baseline model")
train, valid, test = get_data_splits(clicks)
# The test split is not passed in, so only the validation AUC is computed;
# the returned (booster, score) tuple is discarded.
_ = train_model(train, valid)

Baseline model
[LightGBM] [Info] Number of positive: 363974, number of negative: 1476475
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1039
[LightGBM] [Info] Number of data points in the train set: 1840449, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197764 -> initscore=-1.400330
[LightGBM] [Info] Start training from score -1.400330
Validation AUC score: 0.9621485832146418


In [9]:
import category_encoders as ce

# Categorical columns that each get an additional count-encoded feature.
cat_features = ['ip', 'app', 'device', 'os', 'channel']
train, valid, test = get_data_splits(clicks)

# Fit the encoder on the training split only; the validation split is
# transformed with what was learned from train.
count_enc = ce.CountEncoder(cols=cat_features)
count_enc.fit(train[cat_features])

# Join the encoded columns (suffixed '_count') onto each split, keeping the
# original categorical columns alongside them.
train_encoded, valid_encoded = (
    split.join(count_enc.transform(split[cat_features]).add_suffix('_count'))
    for split in (train, valid)
)

_ = train_model(train_encoded, valid_encoded)

  elif pd.api.types.is_categorical(cols):


[LightGBM] [Info] Number of positive: 363974, number of negative: 1476475
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1878
[LightGBM] [Info] Number of data points in the train set: 1840449, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197764 -> initscore=-1.400330
[LightGBM] [Info] Start training from score -1.400330
Validation AUC score: 0.9651119813956208


In [10]:
# NOTE: this cell defines none of `train`, `valid` or `cat_features`; it
# relies on the values left in the kernel by an earlier cell.
target_enc = ce.TargetEncoder(cols=cat_features)

# Fit against the training labels only, then transform both splits with
# the fitted encoder.
target_enc.fit(train[cat_features], train['is_attributed'])

# Join the encoded columns (suffixed '_target') onto each split.
train_encoded, valid_encoded = (
    split.join(target_enc.transform(split[cat_features]).add_suffix('_target'))
    for split in (train, valid)
)

  elif pd.api.types.is_categorical(cols):


In [12]:
# Categorical columns to encode in this experiment ('ip' is not included).
cat_features = ['app', 'device', 'os', 'channel']

train, valid, test = get_data_splits(clicks)

# Fit the CatBoost encoder on the training split and its labels only;
# random_state is fixed so the run is repeatable.
cb_enc = ce.CatBoostEncoder(cols=cat_features, random_state=7)
cb_enc.fit(train[cat_features], train['is_attributed'])

# Join the encoded columns (suffixed '_cb') onto each split.
train_encoded, valid_encoded = (
    split.join(cb_enc.transform(split[cat_features]).add_suffix('_cb'))
    for split in (train, valid)
)
_ = train_model(train_encoded, valid_encoded)

  elif pd.api.types.is_categorical(cols):


[LightGBM] [Info] Number of positive: 363974, number of negative: 1476475
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1731
[LightGBM] [Info] Number of data points in the train set: 1840449, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197764 -> initscore=-1.400330
[LightGBM] [Info] Start training from score -1.400330
Validation AUC score: 0.9628259061746094
