In [2]:
import time
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Helper functions

In [2]:
def change_dtypes(df):
    """
    Downcast column dtypes to reduce the memory footprint of a dataframe.

    The dataframe is modified in place and also returned. Per column:

    - object columns with fewer distinct values than rows become "category"
    - columns whose dtype equals Python ``float`` (numpy float64) become float32
    - columns whose dtype equals Python ``int`` become int32, but only if every
      value fits into the int32 range; otherwise the column is left unchanged

    Memory usage (as reported by ``df.memory_usage()``) is printed in MB before
    and after the conversion.

    :param df: dataframe
    :return df: dataframe (the same object that was passed in)
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    int32_limits = np.iinfo(np.int32)

    for col in df.columns:
        col_dtype = df[col].dtype

        if (col_dtype == "object") and (df[col].nunique() < df.shape[0]):
            df[col] = df[col].astype("category")

        elif col_dtype == float:
            df[col] = df[col].astype(np.float32)

        elif col_dtype == int:
            # A blind astype(int32) would wrap around (or fail) for values such
            # as large ids, so only downcast when the whole column fits.
            # An empty column has no min/max and is always safe to convert.
            fits_int32 = df.shape[0] == 0 or (
                df[col].min() >= int32_limits.min
                and df[col].max() <= int32_limits.max
            )
            if fits_int32:
                df[col] = df[col].astype(np.int32)

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    """
    Read a csv file and downcast its column dtypes with change_dtypes().

    :param filename: path of the csv file, handed to pd.read_csv
    :return: dataframe returned by change_dtypes()
    """
    return change_dtypes(pd.read_csv(filename))

In [3]:
class Standardizer(BaseEstimator, TransformerMixin):
    """
    Standardize the numeric columns of a dataframe.

    fit() records the mean and std (pandas default, ddof=1) of every numeric
    column; transform() replaces each of those columns by (x - mean) / std
    cast to float32. Columns whose std is not strictly positive are left
    unchanged and a warning is printed.

    :param to_array: if True, transform() returns ``df.values`` instead of a
        dataframe
    :param copy: if True (default), transform() works on a copy so the caller's
        dataframe is not modified; if False, the input is modified in place
    """

    def __init__(self, to_array=False, copy=True):
        # BaseEstimator.get_params() / clone() / repr() look the constructor
        # arguments up by name, so they must be stored under exactly that name.
        self.to_array = to_array
        self.copy = copy

    def fit(self, df_train, y=None):
        """
        Learn per-column mean and std from the numeric columns of df_train.

        :param df_train: dataframe
        :param y: ignored; accepted so that fit_transform(X, y) and sklearn
            pipelines can call fit(X, y)
        :return: self
        """
        num_cols = df_train.select_dtypes(["number"]).columns.to_list()
        self._mean = {col: df_train[col].mean() for col in num_cols}
        self._std = {col: df_train[col].std() for col in num_cols}
        return self

    def transform(self, df):
        """
        Standardize the columns seen in fit().

        :param df: dataframe containing every column recorded by fit()
        :return: dataframe, or ``df.values`` when to_array is True
        """
        if self.copy:
            # Avoid silently rescaling the caller's frame (calling transform()
            # twice on the same object would otherwise standardize it twice).
            df = df.copy()

        for col in self._mean:
            if self._std[col] > 0:
                df[col] = (df[col] - self._mean[col]) / self._std[col]
                df[col] = df[col].astype("float32")
            else:
                print("WARNING: " + col + " has zero std.")

        if self.to_array:
            return df.values
        else:
            return df

In [4]:
def roc_auc(estimator, X_eval, y_eval):
    """
    ROC AUC of an estimator, scored on the second column of predict_proba().

    :param estimator: fitted sklearn-style estimator exposing predict_proba()
    :param X_eval: features to score
    :param y_eval: true labels belonging to X_eval
    :return: float, the value returned by roc_auc_score
    """
    scores = estimator.predict_proba(X_eval)[:, 1]
    return roc_auc_score(y_eval, scores)


def write_submit_csv(estimator, X_test, id_test, out):
    """
    Write a submission csv: the columns of id_test plus a "TARGET" column
    holding the second column of estimator.predict_proba(X_test).

    The caller's id_test dataframe is not modified.

    :param estimator: a sklearn estimator that has predict_proba() method
    :param X_test: df or array
    :param id_test: dataframe containing column "SK_ID_CURR"
    :param out: str, csv output file name
    :return: None
    """
    prob_test = estimator.predict_proba(X_test)[:, 1]
    # assign() builds a new frame; writing into id_test directly would add a
    # TARGET column to the caller's dataframe as a hidden side effect.
    submit = id_test.assign(TARGET=prob_test)
    submit.to_csv(out, index=False)
    return None

In [None]:
def objective_lr(params=None, **kwargs):
    """
    Hyperopt objective for logistic regression: negative AUC on the validation set.

    hyperopt's fmin() calls the objective with a single positional argument
    (the sampled point, a dict when the search space is a dict), so the
    hyper-parameters are accepted as a dict. Keyword arguments are still
    accepted, e.g. objective_lr(C=1.0), and override entries of ``params``.

    Reads the module-level X_train, y_train, X_val and y_val.

    :param params: dict of LogisticRegression keyword arguments, or None
    :param kwargs: additional LogisticRegression keyword arguments
    :return: dict with "loss" (minus the validation AUC) and "status"
    """
    params = {**(params or {}), **kwargs}
    estimator = LogisticRegression(max_iter=100, n_jobs=16, random_state=21083,
                                   **params)
    estimator.fit(X_train, y_train)

    auc = roc_auc(estimator, X_val, y_val)
    # fmin minimizes the loss, hence the sign flip.
    return {"loss": -auc, "status": STATUS_OK}
    

def objective_rf(params=None, **kwargs):
    """
    Hyperopt objective for a random forest: negative AUC on the validation set.

    hyperopt's fmin() calls the objective with a single positional argument
    (the sampled point, a dict when the search space is a dict), so the
    hyper-parameters are accepted as a dict. Keyword arguments are still
    accepted and override entries of ``params``.

    Reads the module-level X_train, y_train, X_val and y_val.

    :param params: dict of RandomForestClassifier keyword arguments, or None
    :param kwargs: additional RandomForestClassifier keyword arguments
    :return: dict with "loss" (minus the validation AUC) and "status"
    """
    params = {**(params or {}), **kwargs}
    estimator = RandomForestClassifier(n_estimators=500, n_jobs=16, random_state=21083,
                                       **params)
    estimator.fit(X_train, y_train)

    auc = roc_auc(estimator, X_val, y_val)
    # fmin minimizes the loss, hence the sign flip.
    return {"loss": -auc, "status": STATUS_OK}


def objective_xgb(params=None, **kwargs):
    """
    Hyperopt objective for XGBoost: negative AUC on the validation set.

    hyperopt's fmin() calls the objective with a single positional argument
    (the sampled point, a dict when the search space is a dict), so the
    hyper-parameters are accepted as a dict. Keyword arguments are still
    accepted and override entries of ``params``.

    Reads the module-level X_train, y_train, X_val and y_val.

    :param params: dict of XGBClassifier keyword arguments, or None
    :param kwargs: additional XGBClassifier keyword arguments
    :return: dict with "loss" (minus the validation AUC) and "status"
    """
    params = {**(params or {}), **kwargs}
    estimator = XGBClassifier(n_jobs=16, random_state=21083, **params)
    estimator.fit(X_train, y_train)

    auc = roc_auc(estimator, X_val, y_val)
    # fmin minimizes the loss, hence the sign flip.
    return {"loss": -auc, "status": STATUS_OK}


def hyperopt(objective, params, X_train, y_train, X_val, y_val, num_eval, seed=42):
    """
    Run a TPE search with hyperopt's fmin() and print the elapsed wall-clock time.

    :param objective: callable handed to fmin as the function to minimize
    :param params: search space handed to fmin
    :param X_train: not used inside this function (kept for existing callers);
        the objective has to obtain its data on its own
    :param y_train: not used inside this function, see X_train
    :param X_val: not used inside this function, see X_train
    :param y_val: not used inside this function, see X_train
    :param num_eval: maximum number of evaluations (fmin's max_evals)
    :param seed: seed of the random state given to fmin; defaults to 42, the
        value that used to be hard-coded
    :return: (trials, best_param) - the Trials object and the value returned by fmin
    """
    time_start = time.time()

    trials = Trials()
    # NOTE(review): newer hyperopt releases appear to expect a
    # np.random.Generator (np.random.default_rng(seed)) for rstate - confirm
    # against the installed version before upgrading.
    best_param = fmin(objective,
                      params,
                      algo=tpe.suggest,
                      max_evals=num_eval,
                      trials=trials,
                      rstate=np.random.RandomState(seed))

    time_elapse = time.time() - time_start
    print("Time elapsed: %0.5f s" % time_elapse)
    return trials, best_param

# Load data

In [7]:
# Directory that the input csv files below are read from (relative path).
IN_DIR = "data/data1_"

In [5]:
# Wall-clock timing of the whole load step.
time_start = time.time()

# load_csv() also downcasts dtypes via change_dtypes(). The train file still
# contains the target column "APPL_TARGET" at this point.
X_train = load_csv(os.path.join(IN_DIR, "X_y_sel_xgb_train.csv"))
X_test = load_csv(os.path.join(IN_DIR, "X_sel_xgb_test.csv"))

# Sanity output: shapes and the total number of missing values per frame.
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)
print("X_train.isnull().sum().sum:", X_train.isnull().sum().sum())
print("X_test.isnull().sum().sum:", X_test.isnull().sum().sum())

# Separate the target (as a numpy array) from the feature columns.
y_train = X_train["APPL_TARGET"].values
X_train = X_train.drop(["APPL_TARGET"], axis="columns")
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# Passed later to write_submit_csv() as id_test.
sk_id_test = load_csv(os.path.join(IN_DIR, "sk_id_test.csv"))

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

Memory usage before changing types 2691.34 MB
Memory usage after changing types 1346.90 MB
Memory usage before changing types 426.22 MB
Memory usage after changing types 213.30 MB
X_train.shape (307511, 1101)
X_test.shape (48744, 1100)
X_train.isnull().sum().sum: 0
X_test.isnull().sum().sum: 0
X_train.shape (307511, 1100)
X_test.shape (48744, 1100)
Memory usage before changing types 0.39 MB
Memory usage after changing types 0.20 MB
Elapsed Time 444.14280462265015


# Preprocessing

# Standardization

In [10]:
# Fit the scaler on the training frame and apply it to train and test.
# to_array=True makes transform() return df.values, so X_train and X_test are
# rebound to arrays here and column names are no longer attached to them.
# NOTE(review): the scaler is fit before the train/validation split further
# down, so validation rows contribute to the mean/std - consider splitting first.
scaler = Standardizer(to_array=True)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

X_train.shape (307511, 1100)
X_test.shape (48744, 1100)


# Split into train and validation sets for model selection

In [11]:
# Stratified 80/20 split. X_train / y_train are rebound to the 80% part, so
# re-running this cell alone would split the already reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=21083)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((246008, 1100), (246008,), (61503, 1100), (61503,))

# Logistic regression

## Baseline (untuned) model

In [12]:
# Baseline logistic regression with default settings apart from max_iter.
# NOTE(review): the captured output below reports that the iteration limit was
# reached and shows a repr with max_iter=500, i.e. it comes from a different
# version of this cell - re-run to refresh the output.
lr = LogisticRegression(max_iter=100)
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=500)

In [14]:
# AUC on the data the model was fit on (optimistic; compare with the validation AUC).
auc_lr_train = roc_auc(lr, X_train, y_train)
print("AUC of Logistic regression model on the train set: %0.5f" % auc_lr_train)

AUC of Logistic regression model on the train set: 0.78844


In [15]:
# AUC on the held-out validation split.
auc_lr_val = roc_auc(lr, X_val, y_val)
print("AUC of Logistic regression model on the evaluation set: %0.5f" % auc_lr_val)

AUC of Logistic regression model on the evaluation set: 0.77399


In [16]:
# Refit the same estimator object on train + validation and write the test
# predictions. After this cell `lr` has seen the validation rows, so the
# evaluation cells above must not be re-run against it.
lr.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))
write_submit_csv(lr, X_test, sk_id_test, "data/submit_/lr_sel_xgb_baseline.csv")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [5]:
# Search space for logistic regression: C drawn log-uniformly from [1e-4, 10].
params_lr = {"C": hp.loguniform('C', np.log(0.0001), np.log(10))}

In [6]:
# See help(LogisticRegression) for the parameters that params_lr may tune.

# Random forest

In [17]:
# Untuned random forest: 2000 trees, at least 40 samples per leaf, fixed seed.
rf = RandomForestClassifier(n_estimators=2000, min_samples_leaf=40, n_jobs=16, random_state=21083)
rf.fit(X_train, y_train)

RandomForestClassifier(min_samples_leaf=40, n_estimators=2000, n_jobs=16,
                       random_state=21083)

In [18]:
# AUC on the data the forest was fit on.
auc_rf_train = roc_auc(rf, X_train, y_train)
print("AUC of Random Forest model on the train set: %0.5f" % auc_rf_train)

AUC of Random Forest model on the train set: 0.93263


In [19]:
# AUC on the held-out validation split.
auc_rf_val = roc_auc(rf, X_val, y_val)
print("AUC of Random Forest model on the evaluation set: %0.5f" % auc_rf_val)

AUC of Random Forest model on the evaluation set: 0.75228


# XGBoost

In [20]:
# Untuned XGBoost classifier; every hyper-parameter except n_jobs is left at
# the library default.
xgb = XGBClassifier(n_jobs=16)
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [21]:
# AUC on the data the booster was fit on.
auc_xgb_train = roc_auc(xgb, X_train, y_train)
print("AUC of XGBOOST model on the train set: %0.5f" % auc_xgb_train)

AUC of XGBOOST model on the train set: 0.89709


In [22]:
# AUC on the held-out validation split (the message previously said "train set").
auc_xgb_val = roc_auc(xgb, X_val, y_val)
print("AUC of XGBOOST model on the evaluation set: %0.5f" % auc_xgb_val)

AUC of XGBOOST model on the train set: 0.77105


In [None]:
# NOTE(review): neither feature_importance_df nor features is defined in the
# visible part of this notebook, and the cell carries no execution count -
# confirm both exist before running. The column names of X_train are no longer
# available at this point because the scaler returned plain arrays.
feature_importance = feature_importance_df(xgb, features)
feature_importance.head(20)

In [None]:
# Cumulative importance over the rows of feature_importance, in their current order.
feature_importance["importance_cumsum"] = feature_importance["importance"].cumsum()

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(feature_importance["importance_cumsum"].values)
ax.set(xlabel="# of features", ylabel="Cumulative feature importance")