In [None]:
import os
import gc
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns

from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score as ras
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import lightgbm as lgb
import xgboost as xgb

from xgboost import XGBClassifier

In [None]:
# Directory holding the preprocessed SisFall CSV files on Google Drive.
# Anchored at the absolute '/content/drive' mount point passed to `drive.mount`,
# so the location does not depend on the kernel's current working directory.
PATH = '/content/drive/MyDrive/SisFall_Preprocessed/'
# Test-mode switch; it is not read by any of the cells visible in this notebook.
TEST = False

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# through the regular filesystem. `google.colab` is only importable inside a
# Colab runtime, so this cell is Colab-specific.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the per-user metadata table and the compressed feature table from PATH.
users, feats = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ('user_details.csv', 'user_compressed_feats.csv')
)

# `sub` collects the sample identifier, the integer label and (later) the
# model prediction for every row of the feature table.
sub = pd.DataFrame(
    feats[['user_test', 'target']].to_numpy(),
    columns=['user_test', 'target'],
)
sub = sub.assign(target=sub['target'].astype(int), prediction=0)

print(f'Data Shape : {feats.shape}')

Data Shape : (4447, 29)


In [None]:
# The user id is the second '_'-separated token of the `user_test` string.
feats['user_id'] = [sample_name.split('_')[1] for sample_name in feats['user_test']]

# Attach the per-user details to every sample row (left join keeps all samples).
feats = pd.merge(feats, users, on='user_id', how='left')

# Encode gender numerically; any value other than 'M'/'F' becomes NaN.
gender_codes = {'M': 0, 'F': 1}
feats['gender'] = feats['gender'].map(gender_codes)

# Every column except the identifiers and the label is used as a model input.
non_feature_cols = ('user_test', 'user_id', 'target')
use_cols = [col for col in feats.columns if col not in non_feature_cols]

In [None]:
# Baseline XGBoost settings used to build the estimator handed to the grid search.
# The original literal listed 'eval_metric' twice ('auc' and ['auc']); Python keeps
# only the last value for a repeated key, so the single surviving entry is spelled
# out here at the position the key first appeared.
# NOTE(review): 'binary:logistic' is the objective XGBoost documents for binary
# classification; 'reg:logistic' is kept unchanged here -- confirm it is intended.
init_params = {
    'colsample_bytree': 0.2,
    'eval_metric': ['auc'],
    'learning_rate': 0.25,
    'max_depth': 7,
    'n_estimators': 50,
    'objective': 'reg:logistic',
    'tree_method': "gpu_hist",
}

# Search space for the grid search. Single-element lists pin a setting while
# still passing it through the search so it shows up in `best_params_`.
grid_params = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.01, 0.025, 0.05],
    'max_depth': range(7, 11),                           # 7, 8, 9, 10
    'colsample_bytree': [i/10.0 for i in range(1, 3)],   # 0.1, 0.2
    'eval_metric': ['auc'],
    'objective': ['reg:logistic'],
    'tree_method': ["gpu_hist"],
}

In [None]:
# Stratified 5-fold splitter; the same object is reused by the out-of-fold loop
# further down the notebook.
N_SPLITS = 5
folds = StratifiedKFold(n_splits=N_SPLITS)

# Exhaustive search over `grid_params`, scored by ROC-AUC on the stratified folds.
model = xgb.XGBClassifier(**init_params)
search_options = dict(scoring='roc_auc', n_jobs=-1, cv=folds, verbose=3)
grid = GridSearchCV(model, grid_params, **search_options)
grid.fit(X=feats[use_cols], y=feats['target'])

In [None]:
# Start from the baseline settings, take the grid-search winner wherever the
# search tuned that key, then force the logging level, objective and tree method.
best_params = {
    **{name: grid.best_params_.get(name, default)
       for name, default in init_params.items()},
    'verbosity': 3,
    'objective': 'reg:logistic',
    'tree_method': "gpu_hist",
}
best_params

In [None]:
# Stratified 90/10 hold-out split of the full feature set. The seed is fixed so
# the split -- and the feature importances read from the model fitted on it --
# are the same on every run.
SPLIT_SEED = 42
train_df, test_df, train_target, test_target = train_test_split(
    feats[use_cols],
    feats['target'],
    test_size=0.1,
    stratify=feats['target'],
    random_state=SPLIT_SEED,
)

# DMatrix versions of both parts. They are bound to the names that `watchlist`
# refers to; previously they were assigned to `lgb_*` names, which left
# `xgb_train` / `xgb_val` undefined and made this cell raise NameError on a
# fresh kernel.
xgb_train = xgb.DMatrix(train_df, label=train_target)
xgb_val = xgb.DMatrix(test_df, label=test_target)
watchlist = [(xgb_train, 'train'), (xgb_val, 'valid')]

# Fit the tuned classifier through the sklearn API on the training part only;
# the fit call takes the DataFrames directly, not the DMatrix objects above.
model = xgb.XGBClassifier(**best_params)
model.fit(train_df, train_target)

In [None]:
# Importance score of each input feature of the fitted classifier, in the
# column order of `use_cols` (the columns the model was trained on).
model.feature_importances_

array([0.01139338, 0.01048101, 0.01457033, 0.0371764 , 0.14890754,
       0.09507807, 0.024088  , 0.01335162, 0.0219824 , 0.01017205,
       0.02999194, 0.01597654, 0.01167662, 0.01275533, 0.02510568,
       0.04281436, 0.02601257, 0.01079643, 0.03223642, 0.00999278,
       0.01038349, 0.04496682, 0.14574675, 0.08886883, 0.02038886,
       0.01478647, 0.03793853, 0.02338902, 0.00317183, 0.00309162,
       0.00270833], dtype=float32)

In [None]:
# Pair each importance score with its column name and rank the pairs from the
# most to the least important feature.
feat_imp = sorted(
    zip(model.feature_importances_, use_cols),
    key=lambda pair: pair[0],
    reverse=True,
)
feat_imp

In [None]:
# Number of top-ranked features kept for the cross-validated model.
kfeat = 3
# `feat_imp` holds (importance, column name) tuples sorted by importance in
# descending order; keep only the names of the first `kfeat` entries.
# (The earlier intermediate assignment of the raw tuples was dead code,
# immediately overwritten, and has been dropped.)
use_feat_imp = [entry[-1] for entry in feat_imp[:kfeat]]
print(f'Features Using : {use_feat_imp}')

In [None]:
# Best boosting round recorded on the classifier's underlying Booster.
# NOTE(review): `model` was fitted without an eval set or early stopping --
# confirm this attribute is populated in the installed XGBoost version.
model.get_booster().best_iteration

In [None]:
# Out-of-fold evaluation: for every stratified fold, train a booster on the
# selected top features and store its predictions for the held-out rows in `sub`.

# `n_estimators` belongs to the sklearn wrapper; the native `xgb.train` takes the
# number of rounds as its own argument, so the key is dropped from the params.
train_params = {key: value for key, value in best_params.items()
                if key != 'n_estimators'}

# `sub` is filled by position below, so it must have one row per feature row.
assert len(sub) == len(feats), 'sub and feats must be row-aligned'
# The column starts out as integer zeros; make it float before storing probabilities.
sub['prediction'] = sub['prediction'].astype(float)

for fold, (trn_idx, val_idx) in tqdm(enumerate(folds.split(feats, feats['target'])), total=N_SPLITS):
    train_df, test_df = feats[use_feat_imp].iloc[trn_idx], feats[use_feat_imp].iloc[val_idx]
    train_target, test_target = feats['target'].iloc[trn_idx], feats['target'].iloc[val_idx]

    xgb_train = xgb.DMatrix(train_df, label=train_target)
    xgb_val = xgb.DMatrix(test_df, label=test_target)
    watchlist = [(xgb_train, 'train'), (xgb_val, 'valid')]

    # Up to 10000 rounds; stop once the metric on the last watchlist entry
    # ('valid') has not improved for 500 rounds.
    model = xgb.train(train_params, xgb_train,
                      10000, watchlist, early_stopping_rounds=500,
                      maximize=True,
                      verbose_eval=100)

    # NOTE(review): `ntree_limit` / `best_ntree_limit` are deprecated in newer
    # XGBoost releases in favour of `iteration_range` -- confirm against the
    # installed version before upgrading.
    val_preds = model.predict(xgb_val, ntree_limit=model.best_ntree_limit)

    # Every row belongs to exactly one validation fold, so its prediction is
    # stored as is. Dividing by N_SPLITS (as when averaging test-set predictions)
    # would shrink every probability, and accumulating with `+=` would corrupt
    # the column if the cell is run twice.
    sub.loc[sub.index[val_idx], 'prediction'] = val_preds

    # Round to 4 decimals; rounding to 0 decimals only ever printed 0 or 1.
    fold_auc = ras(test_target, val_preds)
    print(f'fold {fold}, Validation Predicition ROC-AUC : {round(fold_auc, 4)}')
    print('==============================================')

# ROC-AUC over all out-of-fold predictions at once.
oof_score = ras(sub['target'], sub['prediction'])
print(f'OOF ROC-AUC Score : {(oof_score)}')

In [None]:
# Write the identifier / label / prediction table into the data directory,
# without the DataFrame index.
submission_path = os.path.join(PATH, 'sub_xgb.csv')
sub.to_csv(submission_path, index=False)