In [118]:
import time

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

# Helper functions

In [2]:
def change_dtypes(df):
    """
    change types of columns to reduce memory size
    :param df: dataframe
    :return df: dataframe
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    for col in df.columns:
        if (df[col].dtype == "object") and (df[col].nunique() < df.shape[0]):
            df[col] = df[col].astype("category")

        elif df[col].dtype == float:
            df[col] = df[col].astype(np.float32)

        elif df[col].dtype == int:
            df[col] = df[col].astype(np.int32)

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    df = pd.read_csv(filename)
    df = change_dtypes(df)
    return df

In [17]:
def train_test_col_align(df_train, df_test, exclude_cols=None):
    if exclude_cols is None:
        exclude_cols = []
    cols_train = df_train.columns.to_list()
    
    for col in exclude_cols:
        assert col in cols_train, col + " is not in df_train"
        
    test_cols = [col for col in cols_train if col not in exclude_cols]
    return df_train[test_cols + exclude_cols], df_test[test_cols]
    

class OneHotEncoder(BaseEstimator, TransformerMixin):
    def fit(self, train_df):
        df_cat = train_df.select_dtypes(["object", "category"])
        self._cat_cols = df_cat.columns.to_list()
        
        if len(self._cat_cols) > 0:
            self._cat_cols_ohe = pd.get_dummies(df_cat).columns.to_list()
        else:
            self._cat_cols_ohe = []
        return self
    
    def transform(self, df):
        if len(self._cat_cols) == 0:
            print("No cat cols in df_train, so do nothing.")
            return df
        
        df_cat = df.select_dtypes(["object", "category"])
        cat_cols = df_cat.columns.to_list()
        assert set(cat_cols) == set(self._cat_cols), "df does not have the same categorical cols as train_df"
        
        # one-hot encode
        df_cat = pd.get_dummies(df_cat)
        # drop redundant classes which my be present in test_df
        for col in df_cat.columns:
            if col not in self._cat_cols_ohe:
                df_cat = df_cat.drop([col], axis="columns")
        
        # if some some colums are lacking in test but present in train, make them will all zero 
        cat_cols_ohe = df_cat.columns.to_list()
        for col in self._cat_cols_ohe:
            if col not in cat_cols_ohe:
                df_cat[col] = 0
                df_cat[col] = df_cat[col].astype(np.uint8)
        
        num_cols = [col for col in df.columns if col not in cat_cols]
        df_num = df[num_cols]
        
        return pd.concat([df_num, df_cat], axis="columns")

In [129]:
def roc_auc(estimator, X_eval, y_eval):
    """
    :param estimator: sklearn estimator that have predict_proba() method
    :param X_eval: test features
    :param y_eval: test target
    :return: float
    """
    proba = estimator.predict_proba(X_eval)
    return roc_auc_score(y_eval, proba[:, 1])


def feature_importance_df(estimator, features):
    """
    :param estimator: an estimator object that has feature_importances_ attribute
    :param features: list of str, list of feature names
    :return: feature_imp, dataframe
    """
    feature_imp = pd.DataFrame({"feature": features, "importance": estimator.feature_importances_})
    feature_imp = feature_imp.sort_values(by=["importance"], ascending=False)
    return feature_imp


def write_submit_csv(estimator, X_test, id_test, out):
    """
    :param estimator: a sklearn estimator that has predict_proba() method
    :param X_test: df or array
    :param id_test: dataframe containing column "SK_ID_CURR"
    :param out: str, csv output file name
    :return: None
    """
    prob_test = estimator.predict_proba(X_test)[:, 1]
    submit = id_test
    submit["TARGET"] = prob_test
    submit.to_csv(out, index=False)
    return None

# Load data

In [4]:
time_start = time.time()

df_train = load_csv("data/data_/X_y_train.csv")
df_test = load_csv("data/data_/X_test.csv")
print("df_train.shape", df_train.shape)
print("df_test.shape", df_test.shape)
print("df_train.isnull().sum().sum:", df_train.isnull().sum().sum())
print("df_test.isnull().sum().sum:", df_test.isnull().sum().sum())

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

Memory usage before changing types 3972.43 MB
Memory usage after changing types 1950.56 MB
Memory usage before changing types 629.29 MB
Memory usage after changing types 308.86 MB
df_train.shape (307511, 1648)
df_test.shape (48744, 1647)
df_train.isnull().sum().sum: 0
df_test.isnull().sum().sum: 0
Elapsed Time 809.6063630580902


In [131]:
X_train = df_train.copy()
X_test = df_test.copy()

y_train = X_train["APPL_TARGET"].values
X_train = X_train.drop(["SK_ID_CURR", "APPL_TARGET"], axis="columns")

sk_id_test = X_test[["SK_ID_CURR"]]
X_test = X_test.drop(["SK_ID_CURR"], axis="columns")
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# TODO go back and check feature extraction
# due to error
X_train = X_train.drop(["PRAP_AMT_DOWN_PAYMENT_IS_NONNEG_entropy"], axis="columns")
X_test = X_test.drop(["PRAP_AMT_DOWN_PAYMENT_IS_NONNEG_entropy"], axis="columns")
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

X_train.shape (307511, 1646)
X_test.shape (48744, 1646)
X_train.shape (307511, 1645)
X_test.shape (48744, 1645)


# Preprocessing

## One-hot encoding

In [132]:
# one-hot encode

ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)

# make sure that columns in train and test are aligned
X_train, X_test = train_test_col_align(X_train, X_test)
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

features = list(X_train.columns)

X_train.shape (307511, 1959)
X_test.shape (48744, 1959)


In [134]:
X_train.dtypes.value_counts()

float32    1524
uint8       359
int32        38
bool         38
dtype: int64

In [135]:
X_test.dtypes.value_counts()

float32    1524
uint8       359
int32        38
bool         38
dtype: int64

## standardization

In [136]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

X_train.shape (307511, 1959)
X_test.shape (48744, 1959)


## Split into train validation set for model selection

In [137]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                                  stratify=y_train, random_state=146)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((246008, 1959), (246008,), (61503, 1959), (61503,))

# Logistic regression

In [78]:
lr = LogisticRegression(max_iter=100)
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=1000)

In [91]:
auc_lr_train = roc_auc(lr, X_train, y_train)
print("AUC of Logistic regression model on the train set: %0.5f" % auc_lr_train)

AUC of Logistic regression model on an train set: 0.79025


In [92]:
auc_lr_val = roc_auc(lr, X_val, y_val)
print("AUC of Logistic regression model on the evaluation set: %0.5f" % auc_lr_val)

AUC of Logistic regression model on an evaluation set: 0.77404


In [98]:
lr.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [99]:
write_submit_csv(lr, X_test, sk_id_test, "data/submit_/baseline_lr.csv")

# Random forest

In [115]:
rf = RandomForestClassifier(n_estimators=5000, max_depth=10, min_samples_split=0.001, n_jobs=16, random_state=42)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, min_samples_split=0.001, n_estimators=5000,
                       n_jobs=16, random_state=42)

In [116]:
auc_rf_train = roc_auc(rf, X_train, y_train)
print("AUC of Random Forest model on the train set: %0.5f" % auc_rf_train)

AUC of Random Forest model on an train set: 0.78582


In [117]:
auc_rf_val = roc_auc(rf, X_val, y_val)
print("AUC of Random Forest model on the evaluation set: %0.5f" % auc_rf_val)

AUC of Random Forest model on the evaluation set: 0.73822


# XGBOOST

In [119]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [120]:
auc_xgb_train = roc_auc(xgb, X_train, y_train)
print("AUC of XGBOOST model on the train set: %0.5f" % auc_xgb_train)

AUC of XGBOOST model on the train set: 0.90337


In [121]:
auc_xgb_val = roc_auc(xgb, X_val, y_val)
print("AUC of XGBOOST model on the train set: %0.5f" % auc_xgb_val)

AUC of XGBOOST model on the train set: 0.77762


In [139]:
feature_importance = feature_importance_df(xgb, features)
feature_importance.head(20)

Unnamed: 0,feature,importance
29,APPL_EXT_SOURCE_3,0.010384
1626,APPL_NAME_EDUCATION_TYPE_Higher education,0.009935
28,APPL_EXT_SOURCE_2,0.007672
1603,APPL_CODE_GENDER_M,0.005268
1168,INPA_DAYS_ENTRY_PAYMENT_mean_range,0.005247
1398,CCBA_MAX_AMT_BALANCE_TO_CREDIT_LIMIT_3_NEAREST...,0.004592
1624,APPL_NAME_INCOME_TYPE_Working,0.004463
990,INPA_DAYS_INSTAL_PAY_DIFF_ISPOSITIVE_entropy_min,0.003997
928,POBA_MEAN_CNT_INSTALMENT_FUTURE_6_NEAREST_range,0.003944
50,APPL_FLAG_DOCUMENT_3,0.003803


In [143]:
list(feature_importance["feature"].iloc[:20])

['APPL_EXT_SOURCE_3',
 'APPL_NAME_EDUCATION_TYPE_Higher education',
 'APPL_EXT_SOURCE_2',
 'APPL_CODE_GENDER_M',
 'INPA_DAYS_ENTRY_PAYMENT_mean_range',
 'CCBA_MAX_AMT_BALANCE_TO_CREDIT_LIMIT_3_NEAREST_min',
 'APPL_NAME_INCOME_TYPE_Working',
 'INPA_DAYS_INSTAL_PAY_DIFF_ISPOSITIVE_entropy_min',
 'POBA_MEAN_CNT_INSTALMENT_FUTURE_6_NEAREST_range',
 'APPL_FLAG_DOCUMENT_3',
 'APPL_FLAG_OWN_CAR_N',
 'APPL_NAME_EDUCATION_TYPE_Secondary / secondary special',
 'PRAP_DAYS_LAST_DUE_1ST_VERSION_IS_NONNEG_mode',
 'PRAP_NAME_CONTRACT_STATUS_Refused_mean',
 'CCBA_MAX_AMT_BALANCE_TO_CREDIT_LIMIT_6_NEAREST_mean',
 'APPL_OCCUPATION_TYPE_Core staff',
 'PRAP_NAME_YIELD_GROUP_high_mean',
 'BURE_AMT_CREDIT_SUM_DEBT_TO_SUM_mean',
 'BURE_AMT_CREDIT_SUM_DEBT_TO_SUM_range',
 'APPL_DEF_30_CNT_SOCIAL_CIRCLE']

In [138]:
len(features)

1959

In [144]:
0.010384 / 0.009935

1.045193759436336