In [20]:
import sys
import os
import warnings
os.environ['OPENBLAS_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore')
import pickle
import datetime

import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq
from scipy.sparse import csr_matrix, coo_matrix, vstack, load_npz
import implicit
import bisect
import sklearn.metrics as m
from lightgbm import LGBMClassifier, early_stopping
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
import optuna
from optuna.integration import LightGBMPruningCallback

#import vaex

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
LOCAL_DATA_PATH = 'context_data'
SPLIT_SEED = 42
DATA_FILE = 'competition_data_final_pqt'
TARGET_FILE = 'public_train.pqt'
SUBMISSION_FILE = 'submit_2.pqt'

In [4]:
id_to_submit = pq.read_table(f'../{LOCAL_DATA_PATH}/{SUBMISSION_FILE}').to_pandas()
tgt = pq.read_table(f'../{LOCAL_DATA_PATH}/{TARGET_FILE}').to_pandas()

In [5]:
#train_df = pd.DataFrame(index=tgt['is_male'][(tgt.is_male == '0') | (tgt.is_male == '1')].index)
y_true = tgt['is_male'][(tgt.is_male == '0') | (tgt.is_male == '1')].values.astype(np.int8)

In [6]:
train_df = pd.read_feather('../utils/v121_gender_train.feather', index_col='user_id')
test_df = pd.read_feather('../utils/v121_gender_test.feather', index_col='user_id')
feat_df = pd.read_csv('../utils/feat_gen_df3.csv', index_col='user_id')

In [7]:
train_df['os'] = train_df['os'].map({'iOS': 0, 'Android': 1})
test_df['os'] = test_df['os'].map({'iOS': 0, 'Android': 1})

cat_df = feat_df[['region_name', 'city_name', 'company', 'model']]
cat_feat = np.stack([cat_df[col].astype('category').cat.codes.values for col in cat_df]).T
train_df[['region_name', 'city_name', 'company', 'model']] = cat_feat[train_df.index]
test_df[['region_name', 'city_name', 'company', 'model']] = cat_feat[test_df.index]

In [15]:
%%time

X = train_df
y = y_true

models_0 = []
params = {'boosting_type': 'dart',
          'drop_rate': 0.1,
          'max_drop': 50,
          'skip_drop': 0.5,
          'objective': 'binary',
          'num_iterations': 12000,
          'max_depth': 2,
          'min_data_in_leaf': 20,
          'bagging_fraction': 1.0,
          'feature_fraction': 1.0,
          'feature_fraction_bynode': 1.0,
          'lambda_l1': 0.0,
          'lambda_l2': 0.5,
          'learning_rate': 0.2,
          'num_leaves': 31,
          'device_type': "gpu",
          'num_threads': 12,
          'early_stopping_rounds': 500,
          'verbose': 200, # output to stdout info about training process every 200 iterations
          'seed': 722
         }

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for n, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    print(f'______fold {n+1}______')
    lgb_train = lgb.Dataset(X.iloc[train_idx], y[train_idx])
    lgb_eval = lgb.Dataset(X.iloc[test_idx], y[test_idx], reference=lgb_train)
    clf = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval
                )
    models_0.append(clf)
    

______fold 1______
[LightGBM] [Info] Number of positive: 108265, number of negative: 103195
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.992672
[LightGBM] [Info] Total Bins 121355
[LightGBM] [Info] Number of data points in the train set: 211460, number of used features: 2524
[LightGBM] [Info] Using GPU Device: gfx902, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 355 dense feature groups (71.79 MB) transferred to GPU in 0.023215 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511988 -> initscore=0.047962
[LightGBM] [Info] Start training from score 0.047962
[LightGBM] [Debug] Trained a tree with leaves = 4 and depth = 2
[1]	valid_0's binary_logloss: 0.679768
[LightGBM] [Debug] Trained a tree with leaves = 4 and depth = 2
[2

[LightGBM] [Info] Number of positive: 108266, number of negative: 103195
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.992655
[LightGBM] [Info] Total Bins 121522
[LightGBM] [Info] Number of data points in the train set: 211461, number of used features: 2524
[LightGBM] [Info] Using GPU Device: gfx902, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 355 dense feature groups (71.79 MB) transferred to GPU in 0.025610 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511990 -> initscore=0.047971
[LightGBM] [Info] Start training from score 0.047971
[LightGBM] [Debug] Trained a tree with leaves = 4 and depth = 2
[1]	valid_0's binary_logloss: 0.67983
[LightGBM] [Debug] Trained a tree with leaves = 4 and depth = 2
[2]	valid_0's binary_l

In [18]:
clf.predict(X.iloc[test_idx])

array([0.48258722, 0.62648725, 0.48258722, ..., 0.64951426, 0.79862766,
       0.53881244])

In [26]:
#dtrain = lgb.Dataset(train_df, label=y_true)


def objective(trial, X, y):
    params = {'boosting_type': 'dart',
              'drop_rate': trial.suggest_float('drop_rate', 0.03, 0.3),
              'max_drop': trial.suggest_int('max_drop', 5, 100),
              'skip_drop': trial.suggest_float('skip_drop', 0.2, 0.8),
              'objective': 'binary',
              'num_iterations': 10000,
              'max_depth': trial.suggest_int('max_depth', 2, 5),
              'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
              'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
              'bagging_freq': trial.suggest_int('bagging_freq', 0, 15),
              'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
              'feature_fraction_bynode': trial.suggest_uniform('feature_fraction_bynode', 0.1, 1.0),
              'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
              'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
              'learning_rate': trial.suggest_loguniform('learning_rate', 0.02, 0.1),
              'num_leaves': trial.suggest_int('num_leaves', 2, 512),
              'device_type': "gpu",
              'num_threads': 12,
              'verbose': -1, # output to stdout info about training process every 200 iterations
              'seed': 722
         }
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    cv_scores = np.empty(5)
    for n, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        lgb_train = lgb.Dataset(X.iloc[train_idx], y[train_idx])
        lgb_eval = lgb.Dataset(X.iloc[test_idx], y[test_idx], reference=lgb_train)
        clf = lgb.train(params,
                    lgb_train,
                    valid_sets=lgb_eval,
                    callbacks=[
                    LightGBMPruningCallback(trial, "binary_logloss")
            ]
                    )
        preds = clf.predict(X.iloc[test_idx])
        cv_scores[n] = m.log_loss(y[test_idx], preds)
    

    # Return metric of interest
    return np.mean(cv_scores)    

In [27]:
%%time

optuna.logging.set_verbosity(optuna.logging.INFO) 

study = optuna.create_study(direction='minimize')  
func = lambda trial: objective(trial, train_df, y_true)
study.optimize(func, n_trials=1000) 

[32m[I 2023-03-19 20:17:18,756][0m A new study created in memory with name: no-name-865fbc1e-0564-4fae-a75f-e9692aec9de3[0m
[32m[I 2023-03-19 21:01:32,195][0m Trial 0 finished with value: 0.47197195530033087 and parameters: {'drop_rate': 0.23222124808984007, 'max_drop': 57, 'skip_drop': 0.33076957815092567, 'max_depth': 2, 'min_child_samples': 71, 'bagging_fraction': 0.302425202243268, 'bagging_freq': 11, 'feature_fraction': 0.25239724278280634, 'feature_fraction_bynode': 0.5299935456013781, 'lambda_l1': 0.07174265127061853, 'lambda_l2': 3.1819444005716135e-06, 'learning_rate': 0.05426499478705572, 'num_leaves': 20}. Best is trial 0 with value: 0.47197195530033087.[0m
[32m[I 2023-03-19 21:46:53,443][0m Trial 1 finished with value: 0.4522203071278927 and parameters: {'drop_rate': 0.0741641090884101, 'max_drop': 6, 'skip_drop': 0.37946362239344056, 'max_depth': 5, 'min_child_samples': 23, 'bagging_fraction': 0.7118530880323163, 'bagging_freq': 7, 'feature_fraction': 0.84702715246

[32m[I 2023-03-20 08:04:22,692][0m Trial 32 finished with value: 0.44389251237472305 and parameters: {'drop_rate': 0.1440673701858478, 'max_drop': 66, 'skip_drop': 0.6622209230987822, 'max_depth': 4, 'min_child_samples': 16, 'bagging_fraction': 0.86590874727972, 'bagging_freq': 0, 'feature_fraction': 0.7805127795308324, 'feature_fraction_bynode': 0.48220313238835577, 'lambda_l1': 3.5791642225370223e-06, 'lambda_l2': 1.2247924731252938e-05, 'learning_rate': 0.08173341850235179, 'num_leaves': 114}. Best is trial 8 with value: 0.4434747376662068.[0m
[32m[I 2023-03-20 09:00:27,198][0m Trial 33 finished with value: 0.4447545855311624 and parameters: {'drop_rate': 0.14174915926352907, 'max_drop': 64, 'skip_drop': 0.724769585742489, 'max_depth': 4, 'min_child_samples': 17, 'bagging_fraction': 0.8604899215822286, 'bagging_freq': 0, 'feature_fraction': 0.7841155843619241, 'feature_fraction_bynode': 0.5856414683402598, 'lambda_l1': 3.937461138712841e-06, 'lambda_l2': 6.551701775406122e-05, 

[32m[I 2023-03-20 17:23:36,748][0m Trial 82 pruned. Trial was pruned at iteration 4250.[0m
[32m[I 2023-03-20 17:23:49,410][0m Trial 83 pruned. Trial was pruned at iteration 13.[0m
[32m[I 2023-03-20 18:20:50,323][0m Trial 84 finished with value: 0.4436394078041911 and parameters: {'drop_rate': 0.048255846591648714, 'max_drop': 95, 'skip_drop': 0.6702823806497216, 'max_depth': 4, 'min_child_samples': 66, 'bagging_fraction': 0.9080695222862849, 'bagging_freq': 11, 'feature_fraction': 0.6689573953951344, 'feature_fraction_bynode': 0.4929108700285945, 'lambda_l1': 0.1098053462108551, 'lambda_l2': 0.0004503009357987117, 'learning_rate': 0.08377740123144017, 'num_leaves': 214}. Best is trial 81 with value: 0.4432953622815359.[0m
[32m[I 2023-03-20 19:24:13,602][0m Trial 85 finished with value: 0.44390589444963025 and parameters: {'drop_rate': 0.08010107182284039, 'max_drop': 90, 'skip_drop': 0.6776322579650669, 'max_depth': 4, 'min_child_samples': 69, 'bagging_fraction': 0.914368770

[32m[I 2023-03-21 01:00:34,345][0m Trial 131 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 01:00:48,822][0m Trial 132 pruned. Trial was pruned at iteration 2.[0m
[32m[I 2023-03-21 01:01:02,789][0m Trial 133 pruned. Trial was pruned at iteration 2.[0m
[32m[I 2023-03-21 01:07:31,750][0m Trial 134 pruned. Trial was pruned at iteration 7941.[0m
[32m[I 2023-03-21 01:07:46,557][0m Trial 135 pruned. Trial was pruned at iteration 2.[0m
[32m[I 2023-03-21 01:11:18,844][0m Trial 136 pruned. Trial was pruned at iteration 4518.[0m
[32m[I 2023-03-21 01:11:39,732][0m Trial 137 pruned. Trial was pruned at iteration 197.[0m
[32m[I 2023-03-21 01:11:53,167][0m Trial 138 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 01:12:06,811][0m Trial 139 pruned. Trial was pruned at iteration 8.[0m
[32m[I 2023-03-21 01:12:19,939][0m Trial 140 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 01:18:00,277][0m Trial 141 pruned. Trial was pruned 

[32m[I 2023-03-21 05:31:37,328][0m Trial 198 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 05:31:49,991][0m Trial 199 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 05:32:03,092][0m Trial 200 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 05:32:15,999][0m Trial 201 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2023-03-21 05:32:29,205][0m Trial 202 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 05:32:42,232][0m Trial 203 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2023-03-21 06:17:52,759][0m Trial 204 finished with value: 0.4449208373486937 and parameters: {'drop_rate': 0.044255776208326485, 'max_drop': 64, 'skip_drop': 0.7365418515953888, 'max_depth': 4, 'min_child_samples': 10, 'bagging_fraction': 0.856502417367178, 'bagging_freq': 0, 'feature_fraction': 0.6967291566240559, 'feature_fraction_bynode': 0.6275613411853981, 'lambda_l1': 1.182994778589365e-06, 'lambda_l2': 5.484023928291786e-05, 'l

[32m[I 2023-03-21 13:02:18,161][0m Trial 242 pruned. Trial was pruned at iteration 4033.[0m
[32m[I 2023-03-21 13:02:32,370][0m Trial 243 pruned. Trial was pruned at iteration 13.[0m
[32m[I 2023-03-21 13:02:45,886][0m Trial 244 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 13:02:59,539][0m Trial 245 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 13:03:13,061][0m Trial 246 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2023-03-21 13:03:26,590][0m Trial 247 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2023-03-21 13:03:40,109][0m Trial 248 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 13:03:53,877][0m Trial 249 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 13:04:07,747][0m Trial 250 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 13:04:22,283][0m Trial 251 pruned. Trial was pruned at iteration 14.[0m
[32m[I 2023-03-21 13:04:37,878][0m Trial 252 pruned. Trial was pruned at 

[32m[I 2023-03-21 14:44:52,014][0m Trial 321 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 14:45:05,821][0m Trial 322 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 14:45:19,474][0m Trial 323 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 14:45:33,129][0m Trial 324 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 14:45:46,491][0m Trial 325 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 14:45:59,966][0m Trial 326 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 14:46:13,823][0m Trial 327 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 14:51:58,097][0m Trial 328 pruned. Trial was pruned at iteration 6183.[0m
[32m[I 2023-03-21 14:52:11,710][0m Trial 329 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 14:52:25,912][0m Trial 330 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 14:52:39,665][0m Trial 331 pruned. Trial was pruned at it

[32m[I 2023-03-21 15:53:47,690][0m Trial 405 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 15:54:01,610][0m Trial 406 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2023-03-21 15:54:15,968][0m Trial 407 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 15:54:29,475][0m Trial 408 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 15:54:43,502][0m Trial 409 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 15:54:57,284][0m Trial 410 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2023-03-21 15:55:11,275][0m Trial 411 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 15:55:24,848][0m Trial 412 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 15:55:39,197][0m Trial 413 pruned. Trial was pruned at iteration 9.[0m
[32m[I 2023-03-21 15:55:53,701][0m Trial 414 pruned. Trial was pruned at iteration 25.[0m
[32m[I 2023-03-21 15:56:08,089][0m Trial 415 pruned. Trial was pruned at iter

[32m[I 2023-03-21 19:08:35,214][0m Trial 487 finished with value: 0.4451287360001096 and parameters: {'drop_rate': 0.05684083160162887, 'max_drop': 76, 'skip_drop': 0.752408321771864, 'max_depth': 4, 'min_child_samples': 20, 'bagging_fraction': 0.9476733178340816, 'bagging_freq': 9, 'feature_fraction': 0.7578228375046713, 'feature_fraction_bynode': 0.3907093285618804, 'lambda_l1': 0.2971483296763664, 'lambda_l2': 0.00031384649678679253, 'learning_rate': 0.09355569910364583, 'num_leaves': 328}. Best is trial 81 with value: 0.4432953622815359.[0m
[32m[I 2023-03-21 19:08:50,461][0m Trial 488 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 19:12:45,468][0m Trial 489 pruned. Trial was pruned at iteration 3431.[0m
[32m[I 2023-03-21 19:12:59,265][0m Trial 490 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-03-21 19:13:12,792][0m Trial 491 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2023-03-21 19:13:27,357][0m Trial 492 pruned. Trial was pruned at 

KeyboardInterrupt: 

In [31]:
%%time

X = train_df
y = y_true

params = {'boosting_type': 'dart',
          'drop_rate': 0.04181725177738285,
          'max_drop': 90,
          'skip_drop': 0.681433217444748,
          'objective': 'binary',
          'num_iterations': 10000,
          'max_depth': 4,
          'min_child_samples': 10,
          'bagging_fraction': 0.879149476519461,
          'bagging_freq': 10,
          'feature_fraction': 0.6338351071617777,
          'feature_fraction_bynode': 0.4578052622858927,
          'lambda_l1': 0.09329820302670715,
          'lambda_l2': 0.00038402177706728873,
          'learning_rate': 0.08343032559963369,
          'num_leaves': 264,
          #'device_type': "gpu",
          'num_threads': 12,
          'verbose': -1, # output to stdout info about training process every 200 iterations
          'seed': 722
         }


lgb_train = lgb.Dataset(X, y)
clf = lgb.train(params,
                lgb_train,
                )    

CPU times: total: 2h 27min 55s
Wall time: 12min 34s


In [42]:
test_df['is_male'] = clf.predict(test_df)
test_df[['is_male']].to_csv('v124.1/lgbm_age_single_model10000.csv')

In [43]:
### 12500

In [44]:
%%time

X = train_df
y = y_true

params = {'boosting_type': 'dart',
          'drop_rate': 0.04181725177738285,
          'max_drop': 90,
          'skip_drop': 0.681433217444748,
          'objective': 'binary',
          'num_iterations': 12500,
          'max_depth': 4,
          'min_child_samples': 10,
          'bagging_fraction': 0.879149476519461,
          'bagging_freq': 10,
          'feature_fraction': 0.6338351071617777,
          'feature_fraction_bynode': 0.4578052622858927,
          'lambda_l1': 0.09329820302670715,
          'lambda_l2': 0.00038402177706728873,
          'learning_rate': 0.08343032559963369,
          'num_leaves': 264,
          #'device_type': "gpu",
          'num_threads': 12,
          'verbose': -1, # output to stdout info about training process every 200 iterations
          'seed': 722
         }


lgb_train = lgb.Dataset(X, y)
clf = lgb.train(params,
                lgb_train,
                )    

CPU times: total: 3h 30min 13s
Wall time: 18min 1s


In [45]:
del test_df['is_male']
test_df['is_male'] = clf.predict(test_df)
test_df[['is_male']].to_csv('v124.1/lgbm_age_single_model12500.csv')

In [30]:
### 5 folds

In [48]:
%%time

X = train_df
y = y_true

params = {'boosting_type': 'dart',
          'drop_rate': 0.04181725177738285,
          'max_drop': 90,
          'skip_drop': 0.681433217444748,
          'objective': 'binary',
          'num_iterations': 10000,
          'max_depth': 4,
          'min_child_samples': 10,
          'bagging_fraction': 0.879149476519461,
          'bagging_freq': 10,
          'feature_fraction': 0.6338351071617777,
          'feature_fraction_bynode': 0.4578052622858927,
          'lambda_l1': 0.09329820302670715,
          'lambda_l2': 0.00038402177706728873,
          'learning_rate': 0.08343032559963369,
          'num_leaves': 264,
          #'device_type': "gpu",
          'num_threads': 11,
          'verbose': -1, # output to stdout info about training process every 200 iterations
          'seed': 722
         }

preds = []

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for n, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    lgb_train = lgb.Dataset(X.iloc[train_idx], y[train_idx])
    clf = lgb.train(params,
                lgb_train,
                )
    preds.append(clf.predict(test_df))

CPU times: total: 10h 55min 30s
Wall time: 1h 1min 54s


In [58]:
test_df['is_male'] = np.mean(preds, axis=0)
test_df[['is_male']].to_csv('v124.1/lgbm_age_5fold.csv')