In [2]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.2.0-py3-none-win_amd64.whl (1.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.2.0


In [4]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import mode
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss, roc_auc_score, confusion_matrix
import gc
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 20)
np.random.seed(1729)

In [5]:
ROOTPATH = r'C:\\Users\Natalie Davis\Documents\BIPOC_Kaggle\tabular-playground-series-mar-2021\\'
DATAPATH = r'C:\\Users\Natalie Davis\Documents\BIPOC_Kaggle\tabular-playground-series-mar-2021\\'

TRAIN   = pd.read_csv(DATAPATH+'train.csv')
TEST    = pd.read_csv(DATAPATH+'test.csv')
cat_cols = [f for f in TRAIN.columns if 'cat' in f]
num_cols = [f for f in TRAIN.columns if 'cont' in f]
ID = 'id'
TARGET = 'target'

In [7]:
def st_dev(datafile):
    mu  = data.feature_value.mean()
    std = np.sqrt(np.mean( (datafile.feature_value - mu)**2 ) )
    return std

def id_dup_cols(datafile):
    remove = []
    cols = datafile.columns.tolist()
    for c in range(len(cols)-1):
        v = datafile[cols[c]].values
        for j in range(c+1,len(cols)):
            if np.array_equal(v,datafile[cols[j]].values):
                remove = remove + [cols[j], cols[c]]
    remove = pd.DataFrame(remove, columns=['feature','is_dup_of'])    
    return remove

def feature_stats(datafile):
    out=[]
    features = list( set(datafile.columns.tolist()) - set([ID,TARGET]) )
    for f in features:
        feature_type    = datafile[f].dtype
        trn_nonNA_rows  = datafile[(pd.isnull(datafile[f])==False) & (pd.isnull(datafile[TARGET])==False)].shape[0]
        tst_nonNA_rows  = datafile[(pd.isnull(datafile[f])==False) & (pd.isnull(datafile[TARGET])==True)].shape[0]
        level_count     = len(datafile[f].value_counts())
        if feature_type == np.object:
            datafile[f] = datafile[f].factorize(sort=True)[0]
        min_value       = datafile[f].min()
        per_10          = datafile[f].quantile(.10)
        per_25          = datafile[f].quantile(.25)
        med_value       = datafile[f].median()
        mean_value      = datafile[f].mean()
        mode_value      = mode(datafile[f])[0][0]
        mode_count      = mode(datafile[f])[1][0]
        per75           = datafile[f].quantile(.75)
        per_90          = datafile[f].quantile(.90)
        max_value       = datafile[f].max()
        val_range       = max_value - min_value
        val_stdev       = datafile[f].std()
        mean_nonNA_Resp = datafile[TARGET][np.isnan(datafile[f])==False].mean()
        corr            = datafile[[TARGET,f]].corr(method='pearson')
        target_corr     = corr[TARGET][f]
        
        out = out + [(f,feature_type,trn_nonNA_rows,tst_nonNA_rows,level_count,min_value,per_10,per_25,med_value,mean_value,mode_value,
                      per75,per_90,max_value,mode_count,val_range,val_stdev,mean_nonNA_Resp,target_corr)]
    
    outfile = pd.DataFrame(out,columns=['feature','feature_type','trn_nonNA_rows','tst_nonNA_rows','level_count',\
                                         'min_value','per_10','per_25','med_value','mean_value','mode_value','per75','per_90','max_value',\
                                         'mode_count','val_range','val_stdev','mean_nonNA_Resp','Response_corr'])
    return outfile

In [8]:
data = TRAIN.append(TEST)
data.reset_index(drop=True, inplace=True)

feature_stats = feature_stats(data)
dup_cols = id_dup_cols(data)
feature_stats = pd.merge(feature_stats, dup_cols, on='feature', how='left')

feature_stats.to_csv(DATAPATH+'Feature_Statistics.csv', index=False)
cols = feature_stats.feature.tolist()
data['dup_rows'] = data.duplicated(cols, keep=False)
print ('Duplicated Rows: ', data['dup_rows'].sum())

Duplicated Rows:  0


In [9]:
def add_folds(trn, numfolds):
    temp0 = trn[trn['target']==0]
    temp0['fold'] = np.random.randint(numfolds, size=temp0.shape[0])
    temp1 = trn[trn['target']==1]
    temp1['fold'] = np.random.randint(numfolds, size=temp1.shape[0])
    trn = temp0.append(temp1, ignore_index=True)
    trn.sort_values(['id'], inplace=True)
    trn.reset_index(drop=True, inplace=True)
    return trn

In [10]:
#Target Encode Categorical features
k_stat = 5
f_stat = 20

TRAIN = add_folds(TRAIN, 5)

def get_prob_tgt(trn, tst, Id, feature, tgt, prior, k, f):
    grouped = trn.groupby(feature, as_index=False)[tgt].agg({'size':np.size,'mean':np.mean})
    grouped['lambda'] = grouped['size'].apply(lambda n: 1.0/(1 + np.exp( (k-n)/f )))
    grouped[feature+'_tgtenc'] = (grouped['lambda'] * grouped['mean']) + ((1-grouped['lambda'])*prior)
    tst = tst.merge(grouped, on=feature, how='left')
    return tst[[Id, feature+'_tgtenc']]

mean_y  = TRAIN['target'].mean()
for col in cat_cols:
    tr_values = pd.DataFrame()
    for fld in range(5):
        tr = TRAIN[TRAIN['fold'] != fld]
        te = TRAIN[TRAIN['fold'] == fld]
        lvl_prob  = get_prob_tgt(tr[['id', col, 'target']].copy(), te[['id', col]].copy(), 'id', col, 'target', mean_y, k_stat, f_stat)
        tr_values = tr_values.append(lvl_prob)
    te_values = get_prob_tgt(TRAIN[['id', col, 'target']].copy(), TEST[['id', col]].copy(), 'id', col, 'target', mean_y, k_stat, f_stat)
    tr_values.replace([np.inf, -np.inf], mean_y, inplace=True)
    tr_values.fillna(mean_y, inplace=True)
    te_values.replace([np.inf, -np.inf], mean_y, inplace=True)
    te_values.fillna(mean_y, inplace=True)
    TRAIN = TRAIN.merge(tr_values, on=['id'], how='left')
    TEST = TEST.merge(te_values, on=['id'], how='left')


In [11]:
#LightGBM Model
param_dict = {}
param_dict['boosting_type']    = 'gbdt'
param_dict['objective']        = 'binary'
param_dict['metric']           = 'auc'
param_dict['num_leaves']       = 51
param_dict['learning_rate']    = 0.02
param_dict['colsample_bytree'] = 0.8
param_dict['subsample']        = 0.6
param_dict['max_depth']        = 7
param_dict['subsample_freq']   = 1
#param_dict['reg_alpha']        = 0.23
#param_dict['reg_lambda']       = 0.16
#param_dict['weight']           = 'name:wgt'
param_dict['bagging_seed']     = 351
param_dict['verbosity']        = -1
MAX_ROUNDS = 30000
STOP_ROUNDS = 50
VERBOSE_EVAL = 500

features = [f for f in TRAIN.columns if 'tgtenc' in f] + num_cols
TRAIN = add_folds(TRAIN, 5)
mean_score = 0.0
test_preds = np.zeros(TEST.shape[0])
for fold in range(5):
    trn_X = np.array(TRAIN.loc[TRAIN['fold']!=fold, features])
    trn_y = np.array(TRAIN.loc[TRAIN['fold']!=fold, 'target'])
    val_X = np.array(TRAIN.loc[TRAIN['fold']==fold, features])
    val_y = np.array(TRAIN.loc[TRAIN['fold']==fold, 'target'])
    model = lgb.LGBMRegressor(**param_dict, n_estimators=MAX_ROUNDS, n_jobs=-1)
    
    model.fit(trn_X, trn_y, eval_set=(val_X, val_y), verbose=VERBOSE_EVAL, early_stopping_rounds=STOP_ROUNDS)
    val_preds = model.predict(val_X)
    score = roc_auc_score(val_y, val_preds)
    mean_score += score/5
    print('Fold Score: ', score)
    
    test_preds += model.predict(np.array(TEST[features]))/5
    
print('CV Score: ', mean_score)

SUB = pd.DataFrame({'id':TEST['id'], 'target':test_preds})
SUB.to_csv(ROOTPATH+'submission1.csv', index=False)

Training until validation scores don't improve for 50 rounds
[500]	valid_0's auc: 0.891882
[1000]	valid_0's auc: 0.893744
[1500]	valid_0's auc: 0.894168
Early stopping, best iteration is:
[1628]	valid_0's auc: 0.894284
Fold Score:  0.8942838132795825
Training until validation scores don't improve for 50 rounds
[500]	valid_0's auc: 0.890483
[1000]	valid_0's auc: 0.892237
Early stopping, best iteration is:
[1423]	valid_0's auc: 0.892896
Fold Score:  0.892895708207069
Training until validation scores don't improve for 50 rounds
[500]	valid_0's auc: 0.890594
[1000]	valid_0's auc: 0.892295
[1500]	valid_0's auc: 0.892807
Early stopping, best iteration is:
[1516]	valid_0's auc: 0.892824
Fold Score:  0.8928243790600016
Training until validation scores don't improve for 50 rounds
[500]	valid_0's auc: 0.890393
[1000]	valid_0's auc: 0.89217
[1500]	valid_0's auc: 0.892722
Early stopping, best iteration is:
[1738]	valid_0's auc: 0.892889
Fold Score:  0.8928892674873078
Training until validation sco

In [None]:
#XGBoost Model
param_dict = {}
param_dict['silent']           = True
param_dict['objective']        = 'binary:logistic'
param_dict['booster']          = 'gbtree'
param_dict['eval_metric']      = 'auc'
#param_dict['tree_method']      = 'exact'
param_dict['eta']              = 0.02
param_dict['colsample_bytree'] = 0.8
param_dict['subsample']        = 0.8
param_dict['max_depth']        = 7
param_dict['min_child_weight'] = 51
# param_dict['gamma']            = 1.0
# param_dict['lambda']           = 0.5
# param_dict['alpha']            = 0.0
param_dict['base_score']       = 0.26
param_dict['seed']             = 1729
MAX_ROUNDS = 30000
STOP_ROUNDS = 50
VERBOSE_EVAL = 500

features = [f for f in TRAIN.columns if 'tgtenc' in f] + num_cols
TRAIN = add_folds(TRAIN, 5)
mean_score = 0.0
test_preds = np.zeros(TEST.shape[0])
d_test = xgb.DMatrix(TEST[features], missing=np.nan)
for fold in range(5):
    d_train = xgb.DMatrix(TRAIN.loc[TRAIN['fold']!=fold, features], label=TRAIN.loc[TRAIN['fold']!=fold, 'target'], missing=np.nan)
    d_valid = xgb.DMatrix(TRAIN.loc[TRAIN['fold']==fold, features], label=TRAIN.loc[TRAIN['fold']==fold, 'target'], missing=np.nan)
    val_y = np.array(TRAIN.loc[TRAIN['fold']==fold, 'target'])
    watchlist = [(d_train, 'train'), (d_valid, 'eval')]
    model = xgb.train(param_dict, d_train, MAX_ROUNDS, watchlist, early_stopping_rounds=STOP_ROUNDS,verbose_eval= VERBOSE_EVAL)
    val_preds = model.predict(xgb.DMatrix(TRAIN.loc[TRAIN['fold']==fold, features], missing=np.nan))
    score = roc_auc_score(val_y, val_preds)
    mean_score += score/5
    print('Fold Score: ', score)
    
    test_preds += model.predict(d_test)/5
    
print('CV Score: ', mean_score)

SUB = pd.DataFrame({'id':TEST['id'], 'target':test_preds})
SUB.to_csv(ROOTPATH+'submission2.csv', index=False)


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.86503	eval-auc:0.86204
[500]	train-auc:0.90109	eval-auc:0.89077
[1000]	train-auc:0.90919	eval-auc:0.89289
[1500]	train-auc:0.91466	eval-auc:0.89350
[1679]	train-auc:0.91631	eval-auc:0.89357
Fold Score:  0.8935695306797514
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.86589	eval-auc:0.86168
[500]	train-auc:0.90172	eval-auc:0.89039
[1000]	train-auc:0.90956	eval-auc:0.89245
[1500]	train-auc:0.91498	eval-auc:0.89315
[1617]	train-auc:0.91602	eval-auc: