In [None]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np 
import os

In [None]:
# Directory holding the competition CSV files.
base_path = '/kaggle/input/cat-in-the-dat/'

# The submission template keeps its default positional index;
# train and test are re-indexed by their `id` column right after loading.
submission_df = pd.read_csv(os.path.join(base_path, "sample_submission.csv"))
train_df = pd.read_csv(os.path.join(base_path, "train.csv")).set_index('id')
test_df = pd.read_csv(os.path.join(base_path, "test.csv")).set_index('id')

print(train_df.shape, test_df.shape)

In [None]:
# Split the `target` column off the training frame: `labels` keeps the target,
# `train_df` keeps only the feature columns.
# The guard makes the cell safe to re-run: once `target` has been removed, a second
# execution leaves the existing `labels` / `train_df` untouched instead of raising KeyError.
if 'target' in train_df.columns:
    labels = train_df['target']
    train_df = train_df.drop(columns=['target'])

In [None]:
# Stack the train rows on top of the test rows so that both are encoded together
# (train first, test second -- later cells rely on this order to split them again).
frames_to_stack = [train_df, test_df]
concat_df = pd.concat(frames_to_stack, axis=0)
print(concat_df.shape)
concat_df.head()

In [None]:
%%time
dummy_data = pd.get_dummies(concat_df, columns=concat_df.columns, drop_first=True, sparse=True).sparse.to_coo().tocsr()
print(dummy_data.shape)

In [None]:
%%time
train_dummy = dummy_data[:train_df.shape[0],:]
test_dummy = dummy_data[train_df.shape[0]:,:]
print(train_dummy.shape, test_dummy.shape)

In [None]:
# Weight of a class = (count of the most frequent class) / (count of that class),
# so the most frequent class gets weight 1 and rarer classes get larger weights.
# Counts are looked up by the integers 0..k-1 -- assumes the target values are exactly
# those integers (TODO confirm against the data).
val_counts = labels.value_counts()
largest_count = val_counts.max()
class_weights = [largest_count / val_counts[cls] for cls in range(len(val_counts))]
class_weights

# ML Models

In [None]:
# Empty containers for the stacking features; each model section below adds one column:
# out-of-fold predictions on the train side, averaged fold-model predictions on the test side.
train_oof_df, test_oof_df = pd.DataFrame(), pd.DataFrame()

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# 5-fold cross-validation of a class-weighted logistic regression on the one-hot features.
# Produces:
#   lr_oof             - out-of-fold predict_proba[:, 1] for every training row
#   lr_models_list     - the fitted model of each fold (used later to score the test set)
#   lr_fold_preds_list - the validation predictions of each fold, in fold order

# Fixed seed: drawing random_state at random produced different folds, and therefore
# different scores, on every run of the notebook.
folds = KFold(n_splits=5, shuffle=True, random_state=42)

# The label -> weight mapping is identical for every fold, so it is built once.
lr_class_weight = dict(zip(np.arange(len(class_weights)), class_weights))

lr_oof = np.zeros(train_dummy.shape[0])
lr_models_list = []
lr_fold_preds_list = []
for i, (train_idx, val_idx) in enumerate(folds.split(train_dummy, labels)):
    print('Fold {} starting...'.format(i))
    x_train, y_train = train_dummy[train_idx, :], labels.iloc[train_idx]
    x_val, y_val = train_dummy[val_idx, :], labels.iloc[val_idx]

    model = LogisticRegression(C=0.1, solver="lbfgs", max_iter=500,
                               class_weight=lr_class_weight)
    model.fit(x_train, y_train)

    # Column 1 of predict_proba is the score used for AUC.
    preds = model.predict_proba(x_val)[:, 1]
    score = roc_auc_score(y_val, preds)

    # Each row is written exactly once, by the fold in which it was held out.
    lr_oof[val_idx] = preds
    lr_models_list.append(model)
    lr_fold_preds_list.append(preds)

    print("Fold score: ", score)
    print('Fold {} complete!'.format(i))

In [None]:
# AUC of the logistic-regression out-of-fold predictions over the full training set.
roc_auc_score(y_true=labels, y_score=lr_oof)

In [None]:
# Score the test matrix with every fold's logistic regression (column 1 of predict_proba);
# the result is one prediction vector per fold model.
lr_preds_list = [
    fold_model.predict_proba(test_dummy)[:, 1]
    for fold_model in lr_models_list
]

In [None]:
# Add the logistic-regression column to both stacking tables:
# out-of-fold predictions for train, the mean over the fold models for test.
lr_test_mean = np.stack(lr_preds_list).mean(axis=0)
train_oof_df['logistic_regression'] = lr_oof
test_oof_df['logistic_regression'] = lr_test_mean
print(train_oof_df.shape, test_oof_df.shape)

# CatBoost

In [None]:
from catboost import CatBoostClassifier, Pool

In [None]:
# 5-fold cross-validation of a CatBoost classifier on the one-hot features.
# Produces:
#   catboost_oof             - out-of-fold predict_proba[:, 1] for every training row
#   catboost_models_list     - the fitted model of each fold (used later to score the test set)
#   catboost_fold_preds_list - the validation predictions of each fold, in fold order
# Unlike the other models in this notebook, no class weights are passed here.

# Fixed seeds (fold split and model): randomly drawn seeds made every run produce
# different folds and different scores.
folds = KFold(n_splits=5, shuffle=True, random_state=42)

catboost_oof = np.zeros(train_dummy.shape[0])
catboost_models_list = []
catboost_fold_preds_list = []
for i, (train_idx, val_idx) in enumerate(folds.split(train_dummy, labels)):
    print('Fold {} starting...'.format(i))
    x_train, y_train = train_dummy[train_idx, :], labels.iloc[train_idx]
    x_val, y_val = train_dummy[val_idx, :], labels.iloc[val_idx]

    train_pool = Pool(x_train, y_train)
    eval_pool = Pool(x_val, y_val)

    # `iterations` is an integer count (previously passed as the float 1e6). It only acts
    # as an upper bound: fit() below is given early_stopping_rounds, and use_best_model
    # is enabled.
    model = CatBoostClassifier(loss_function='CrossEntropy', eval_metric='AUC',
                               iterations=1000000, learning_rate=0.1,
                               random_seed=42, use_best_model=True,
                               depth=8)
    # NOTE(review): the validation fold is used both for early stopping and for the fold
    # score below, so the reported fold AUC is likely somewhat optimistic.
    model.fit(train_pool, eval_set=eval_pool, verbose_eval=250, early_stopping_rounds=500)

    # Column 1 of predict_proba is the score used for AUC.
    preds = model.predict_proba(x_val)[:, 1]
    score = roc_auc_score(y_val, preds)

    catboost_oof[val_idx] = preds
    catboost_models_list.append(model)
    catboost_fold_preds_list.append(preds)

    print("Fold score: ", score)
    print('Fold {} complete!'.format(i))

In [None]:
# AUC of the CatBoost out-of-fold predictions over the full training set.
roc_auc_score(y_true=labels, y_score=catboost_oof)

In [None]:
# Score the test matrix with every fold's CatBoost model (column 1 of predict_proba);
# the result is one prediction vector per fold model.
catboost_preds_list = [
    fold_model.predict_proba(test_dummy)[:, 1]
    for fold_model in catboost_models_list
]

In [None]:
# Add the CatBoost column to both stacking tables:
# out-of-fold predictions for train, the mean over the fold models for test.
catboost_test_mean = np.stack(catboost_preds_list).mean(axis=0)
train_oof_df['catboost'] = catboost_oof
test_oof_df['catboost'] = catboost_test_mean
print(train_oof_df.shape, test_oof_df.shape)
# Pairwise correlation between the out-of-fold predictions collected so far.
train_oof_df.corr()

# AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# 5-fold cross-validation of AdaBoost over class-weighted depth-1 decision trees.
# Produces:
#   adaboost_oof             - out-of-fold predict_proba[:, 1] for every training row
#   adaboost_models_list     - the fitted model of each fold (used later to score the test set)
#   adaboost_fold_preds_list - the validation predictions of each fold, in fold order

# Fixed seeds (fold split, base tree, booster): with a randomly drawn fold seed and
# unseeded estimators the scores changed on every run.
folds = KFold(n_splits=5, shuffle=True, random_state=42)

# The label -> weight mapping is identical for every fold, so it is built once.
adaboost_class_weight = dict(zip(np.arange(len(class_weights)), class_weights))

adaboost_oof = np.zeros(train_dummy.shape[0])
adaboost_models_list = []
adaboost_fold_preds_list = []
for i, (train_idx, val_idx) in enumerate(folds.split(train_dummy, labels)):
    print('Fold {} starting...'.format(i))
    x_train, y_train = train_dummy[train_idx, :], labels.iloc[train_idx]
    x_val, y_val = train_dummy[val_idx, :], labels.iloc[val_idx]

    base_estimator = DecisionTreeClassifier(max_depth=1, class_weight=adaboost_class_weight,
                                            random_state=42)
    # NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator`;
    # the keyword is kept as written -- confirm against the installed version.
    model = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=1000,
                               learning_rate=1, random_state=42)
    model.fit(x_train, y_train)

    # Column 1 of predict_proba is the score used for AUC.
    preds = model.predict_proba(x_val)[:, 1]
    score = roc_auc_score(y_val, preds)

    adaboost_oof[val_idx] = preds
    adaboost_models_list.append(model)
    adaboost_fold_preds_list.append(preds)

    print("Fold score: ", score)
    print('Fold {} complete!'.format(i))

In [None]:
# AUC of the AdaBoost out-of-fold predictions over the full training set.
roc_auc_score(y_true=labels, y_score=adaboost_oof)

In [None]:
# Score the test matrix with every fold's AdaBoost model (column 1 of predict_proba);
# the result is one prediction vector per fold model.
adaboost_preds_list = [
    fold_model.predict_proba(test_dummy)[:, 1]
    for fold_model in adaboost_models_list
]

In [None]:
# Add the AdaBoost column to both stacking tables:
# out-of-fold predictions for train, the mean over the fold models for test.
adaboost_test_mean = np.stack(adaboost_preds_list).mean(axis=0)
train_oof_df['adaboost'] = adaboost_oof
test_oof_df['adaboost'] = adaboost_test_mean
print(train_oof_df.shape, test_oof_df.shape)
# Pairwise correlation between the out-of-fold predictions collected so far.
train_oof_df.corr()

# LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# 5-fold cross-validation of a LightGBM booster (depth-1 trees) on the one-hot features.
# Produces:
#   lgb_oof             - out-of-fold predictions for every training row
#   lgb_models_list     - the trained booster of each fold (used later to score the test set)
#   lgb_fold_preds_list - the validation predictions of each fold, in fold order

# Fixed seed: a randomly drawn random_state gave different folds, and therefore
# different scores, on every run.
folds = KFold(n_splits=5, shuffle=True, random_state=42)

# Training parameters do not depend on the fold, so they are defined once.
# The weight computed for class 1 is passed as scale_pos_weight.
lgb_params = {
    'objective': 'binary',
    'learning_rate': 0.3,
    'max_depth': 1,
    'scale_pos_weight': class_weights[1],
    'metric': 'auc'
}

lgb_oof = np.zeros(train_dummy.shape[0])
lgb_models_list = []
lgb_fold_preds_list = []
for i, (train_idx, val_idx) in enumerate(folds.split(train_dummy, labels)):
    print('Fold {} starting...'.format(i))
    # The sparse slices are cast to float32 before being handed to LightGBM.
    x_train = train_dummy[train_idx, :].astype('float32')
    y_train = labels.iloc[train_idx]
    x_val = train_dummy[val_idx, :].astype('float32')
    y_val = labels.iloc[val_idx]

    train_dataset = lgb.Dataset(x_train, label=y_train)
    eval_dataset = lgb.Dataset(x_val, label=y_val)

    # A fresh copy of the parameters is passed per fold so the shared dict is never mutated.
    # 1000000 rounds is only an upper bound; early_stopping_rounds is set.
    model = lgb.train(dict(lgb_params), train_dataset, 1000000,
                      valid_sets=[train_dataset, eval_dataset],
                      early_stopping_rounds=500, verbose_eval=250)

    preds = model.predict(x_val)
    score = roc_auc_score(y_val, preds)

    lgb_oof[val_idx] = preds
    lgb_models_list.append(model)
    lgb_fold_preds_list.append(preds)

    print('Fold auc score:', score)
    print('Fold {} complete!'.format(i))

In [None]:
# AUC of the LightGBM out-of-fold predictions over the full training set.
roc_auc_score(y_true=labels, y_score=lgb_oof)

In [None]:
# Score the test matrix with every fold's booster; one prediction vector per fold model.
# The float32 cast of the test matrix is the same for every model, so it is done once
# instead of creating a new converted copy on each loop iteration.
test_dummy_f32 = test_dummy.astype('float32')
lgb_preds_list = [booster.predict(test_dummy_f32) for booster in lgb_models_list]

In [None]:
# Add the LightGBM column to both stacking tables:
# out-of-fold predictions for train, the mean over the fold models for test.
lgb_test_mean = np.stack(lgb_preds_list).mean(axis=0)
train_oof_df['lgb'] = lgb_oof
test_oof_df['lgb'] = lgb_test_mean
print(train_oof_df.shape, test_oof_df.shape)
# Pairwise correlation between the out-of-fold predictions collected so far.
train_oof_df.corr()

# Extra Trees

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# 5-fold cross-validation of a class-weighted Extra Trees classifier on the one-hot features.
# Produces (generic names, read by the Extra Trees cells that follow):
#   oof             - out-of-fold predict_proba[:, 1] for every training row
#   models_list     - the fitted model of each fold (used later to score the test set)
#   fold_preds_list - the validation predictions of each fold, in fold order

# Fixed seeds (fold split and forest): with a randomly drawn fold seed and an unseeded
# forest the scores changed on every run.
folds = KFold(n_splits=5, shuffle=True, random_state=42)

# The label -> weight mapping is identical for every fold, so it is built once.
extra_class_weight = dict(zip(np.arange(len(class_weights)), class_weights))

oof = np.zeros(train_dummy.shape[0])
models_list = []
fold_preds_list = []
for i, (train_idx, val_idx) in enumerate(folds.split(train_dummy, labels)):
    print('Fold {} starting...'.format(i))
    x_train, y_train = train_dummy[train_idx, :], labels.iloc[train_idx]
    x_val, y_val = train_dummy[val_idx, :], labels.iloc[val_idx]

    model = ExtraTreesClassifier(n_estimators=2000, max_depth=4,
                                 class_weight=extra_class_weight,
                                 random_state=42)
    model.fit(x_train, y_train)

    # Column 1 of predict_proba is the score used for AUC.
    preds = model.predict_proba(x_val)[:, 1]
    score = roc_auc_score(y_val, preds)

    oof[val_idx] = preds
    models_list.append(model)
    fold_preds_list.append(preds)

    print('Fold auc score:', score)
    print('Fold {} complete!'.format(i))

In [None]:
# AUC of the Extra Trees out-of-fold predictions over the full training set.
roc_auc_score(y_true=labels, y_score=oof)

In [None]:
# Score the test matrix with every fold's Extra Trees model (column 1 of predict_proba);
# the result is one prediction vector per fold model.
preds_list = [
    fold_model.predict_proba(test_dummy)[:, 1]
    for fold_model in models_list
]

In [None]:
# Add the Extra Trees column to both stacking tables:
# out-of-fold predictions for train, the mean over the fold models for test.
extra_test_mean = np.stack(preds_list).mean(axis=0)
train_oof_df['extra'] = oof
test_oof_df['extra'] = extra_test_mean
print(train_oof_df.shape, test_oof_df.shape)
# Here the predictions are rounded to 0/1 before correlating, unlike the raw-score
# correlations shown for the earlier models -- presumably to compare hard decisions
# (TODO confirm this is intended).
train_oof_df.round().corr()

# Blending - logistic regression

In [None]:
# Second-level model: 5-fold cross-validated logistic regression trained on the
# out-of-fold predictions of the base models (the columns of train_oof_df).
# Produces (generic names, overwriting the Extra Trees variables of the same name):
#   oof             - out-of-fold blended predict_proba[:, 1] for every training row
#   models_list     - the fitted blender of each fold (used later to score the test set)
#   fold_preds_list - the validation predictions of each fold, in fold order

# Fixed seed: a randomly drawn random_state gave different folds, and therefore
# different scores, on every run.
folds = KFold(n_splits=5, shuffle=True, random_state=42)

# The label -> weight mapping is identical for every fold, so it is built once.
blend_class_weight = dict(zip(np.arange(len(class_weights)), class_weights))

oof = np.zeros(train_oof_df.shape[0])
models_list = []
fold_preds_list = []
for i, (train_idx, val_idx) in enumerate(folds.split(train_oof_df, labels)):
    print('Fold {} starting...'.format(i))
    x_train, y_train = train_oof_df.iloc[train_idx, :], labels.iloc[train_idx]
    x_val, y_val = train_oof_df.iloc[val_idx, :], labels.iloc[val_idx]

    model = LogisticRegression(C=0.1, solver="lbfgs", max_iter=500,
                               class_weight=blend_class_weight)
    model.fit(x_train, y_train)

    # Column 1 of predict_proba is the score used for AUC.
    preds = model.predict_proba(x_val)[:, 1]
    score = roc_auc_score(y_val, preds)

    oof[val_idx] = preds
    models_list.append(model)
    fold_preds_list.append(preds)

    print('Fold auc score:', score)
    print('Fold {} complete!'.format(i))

In [None]:
# AUC of the blended out-of-fold predictions over the full training set.
roc_auc_score(y_true=labels, y_score=oof)

In [None]:
# Score the test-side stacking table with every fold's blender (column 1 of predict_proba).
# Only the columns the blenders were trained on are passed, in training order. This keeps
# the cell re-runnable: once `final_preds` has been added to test_oof_df, passing the whole
# frame would hand the models one feature too many.
blend_features = test_oof_df[train_oof_df.columns]
preds_list = [blender.predict_proba(blend_features)[:, 1] for blender in models_list]

In [None]:
# Final test prediction = mean of the per-fold blender predictions,
# stored alongside the base-model columns.
blended_test_mean = np.stack(preds_list).mean(axis=0)
test_oof_df['final_preds'] = blended_test_mean
print(test_oof_df.shape)
test_oof_df.head()

# Submission

In [None]:
# Put the blended test predictions into the `target` column of the submission template
# and write it out without the index. The assignment aligns on index; both frames are
# expected to carry a default positional index at this point.
submission_df = submission_df.assign(target=test_oof_df['final_preds'])
submission_df.to_csv('submission.csv', index=False)