In [1]:
import pandas as pd
import numpy as np
import h5py

from sklearn import model_selection, linear_model, metrics, pipeline, preprocessing

# Set random seed
np.random.seed(42)

In [8]:
# Load the data and separate the id's
X_data  = pd.read_hdf("cat.hdf5", "train")
y_data  = pd.read_hdf("cat.hdf5", "train_target")
X_test  = pd.read_hdf("cat.hdf5", "test")

# Store data ID
data_id = X_data.loc[:, "id"]
test_id = X_test.loc[:, "id"]

X_data.drop(columns="id", inplace=True)
X_test.drop(columns="id", inplace=True)

In [9]:
# Load the dummy data and separate the id's
X_data_dmy  = pd.read_hdf("cat.hdf5", "train_dmy")
X_test_dmy  = pd.read_hdf("cat.hdf5", "test_dmy")

X_data_dmy.drop(columns="id", inplace=True)
X_test_dmy.drop(columns="id", inplace=True)

In [11]:
# Split the data with labels into training and validation.
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_data, y_data, test_size=0.3)

X_train_dmy, X_val_dmy, y_train_dmy, y_val_dmy = \
    model_selection.train_test_split(X_data_dmy, y_data, test_size=0.3)

In [12]:
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape)
print(X_train_dmy.shape, y_train_dmy.shape, X_val_dmy.shape, y_val_dmy.shape, X_test_dmy.shape)

(210000, 40) (210000,) (90000, 40) (90000,) (200000, 40)
(210000, 275) (210000,) (90000, 275) (90000,) (200000, 275)


## Logisitic Regression as baseline model

In [13]:
logistic = pipeline.Pipeline(
    [
        ("min_max_scaler", preprocessing.MinMaxScaler()),
        ("logistic_classifier", linear_model.LogisticRegressionCV(
            solver="lbfgs", max_iter=2000, cv=5, n_jobs=-1))
    ]
)

logistic.fit(X_train_dmy, y_train_dmy);

In [16]:
def get_score(model):
    y_val_pred = model.predict_proba(X_val_dmy)[:, 1]
    val_score = metrics.roc_auc_score(y_val_dmy, y_val_pred)

    y_train_pred = model.predict_proba(X_train_dmy)[:, 1]
    train_score = metrics.roc_auc_score(y_train_dmy, y_train_pred)

    return (train_score, val_score)

In [17]:
get_score(logistic)

(0.7660802466134071, 0.7640294373295451)

## Now use XGBoost

In [10]:
from xgboost import XGBClassifier

In [11]:
# We do not use GridSearchCV:
# See https://github.com/dmlc/xgboost/issues/2819
# Instead we implement our own grid search.

from collections.abc import Iterable
import itertools

# Takes a parameter grid similar to the format in
# sklearn GridSearchCV and returns a list of names every time.
def get_grid_iter(param_grid):
    names = []
    values = []
    
    for param_name, param_values in param_grid.items():
        names.append(param_name)
        
        if isinstance(param_values, list):
            values.append(list(param_values))
        else:
            values.append(list([param_values]))
    
    it = (dict(zip(names, param)) for param in itertools.product(*values))
    item_cnt = np.prod(np.array([len(v) for v in values]))
    return it, item_cnt

# Similar to GridSearchCV
def hyper_opt(model_base, param_grid, metric, X_train, y_train, X_val, y_val, bin_prob, verbose=False):
    best_param = None
    best_param_id = -1
    best_score = -np.inf
    
    it, item_cnt = get_grid_iter(param_grid)
    
    step = 0
    
    for param in it:
        
        model = model_base(**param)
        model.fit(X_train, y_train)
        
        if bin_prob:
            y_train_pred = model.predict_proba(X_train)[:, 1]
            y_val_pred = model.predict_proba(X_val)[:, 1]
        else:
            y_train_pred = model.predict(X_train)
            y_val_pred = model.predict(X_val)
        
        train_score = metric(y_train, y_train_pred)
        val_score = metric(y_val, y_val_pred)
        
        if val_score > best_score:
            best_score = val_score
            best_param = param
            best_param_id = step
        
        step += 1
        if verbose:
            print("[%d/%d] train:%f test:%f best:%f id:%d" %
                (step, item_cnt, train_score, val_score, best_score, best_param_id))
            
    return best_score, best_param, best_param_id

In [12]:
xgb_param_grid = {
    "n_estimators": [500, 1000, 1500, 2500, 3000],
    'max_depth':[1, 2, 3, 4, 5],
    'objective':'binary:logistic',
    'subsample':[0.6, 0.8, 1], 
    'colsample_bytree':[0.6, 0.8, 1],
    'learning_rate':[0.001, 0.01, 0.1],
    'tree_method':'gpu_hist',
    'evalmetric':'auc'
}


In [13]:
#%%time
#score, param, param_id = hyper_opt(XGBClassifier, xgb_param_grid, metrics.roc_auc_score,
#          X_train, y_train, X_val, y_val,
#          bin_prob=True, verbose=True)

In [14]:
param_id = 587
best_param = list(get_grid_iter(xgb_param_grid)[0])[param_id]
best_param

{'n_estimators': 3000,
 'max_depth': 2,
 'objective': 'binary:logistic',
 'subsample': 1,
 'colsample_bytree': 0.6,
 'learning_rate': 0.1,
 'tree_method': 'gpu_hist',
 'evalmetric': 'auc'}

In [15]:
xgb  = XGBClassifier(**best_param)
xgb.fit(X_data, y_data)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, evalmetric='auc',
              gamma=0, gpu_id=-1, learning_rate=0.1, max_delta_step=0,
              max_depth=2, min_child_weight=1, missing=None, n_estimators=3000,
              n_jobs=1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='gpu_hist', verbosity=1)

In [16]:
logistic_proba   = logistic.predict_proba(X_test)[:, 1]
xgb_proba        = xgb.predict_proba(X_test)[:,1]

In [17]:
logistic_score   = logistic.score(X_val,y_val)
xgb_score        = xgb.score(X_val,y_val)

logistic_weight  = logistic_score/(logistic_score + xgb_score)
xgb_weight       = xgb_score/(logistic_score + xgb_score)

target           = logistic_proba*logistic_weight + xgb_proba*xgb_weight
target

array([0.24827955, 0.5188349 , 0.18679945, ..., 0.18501229, 0.45643572,
       0.24577431])

In [18]:
submission = pd.DataFrame(
    {
        "id": test_id,
        "target": target
    }
)

In [19]:
submission.to_csv("submission.csv", index=False)

## check X_train AUC 

In [23]:
logistic_proba   = logistic.predict_proba(X_data)[:, 1]
xgb_proba        = xgb.predict_proba(X_data)[:,1]

target           = logistic_proba*logistic_weight + xgb_proba*xgb_weight
logistic_roc     = metrics.roc_auc_score(y_data, logistic_proba)
xgb_roc          = metrics.roc_auc_score(y_data, xgb_proba)
target_roc       = metrics.roc_auc_score(y_data, target)

print('logistic_roc: %f,  xgb_roc: %f,  target_roc: %f,'%(logistic_roc, xgb_roc, target_roc))

logistic_roc: 0.765422,  xgb_roc: 0.788937,  target_roc: 0.780851,


## From above we know that Logistics Regression is shit. Kick it off!

## Model Selection

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [20]:
#def get_roc(model, X_train, y_train, metric=metrics.roc_auc_score):
    #cv = model_selection.KFold(n_splits=5)
    #score = model_selection.cross_vaßl_score(model,
            #X_train,y_train,scoring=metrics.make_scorer(metric), cv=cv)
    #return score.mean()

In [21]:
def get_roc(model,X_train,y_train,X_val,y_val):
    model.fit(X_train, y_train)
    y_val_pred = model.predict_proba(X_val)[:, 1]
    val_score = metrics.roc_auc_score(y_val, y_val_pred)

    y_train_pred = model.predict_proba(X_train)[:, 1]
    train_score = metrics.roc_auc_score(y_train, y_train_pred)

    return (train_score, val_score)

In [None]:
svc = pipeline.Pipeline([
        ("min_max_scaler", preprocessing.MinMaxScaler()),
        ("logistic_classifier", SVC(gamma='auto',probability=True))
        ])

knn = pipeline.Pipeline([
        ("min_max_scaler", preprocessing.MinMaxScaler()),
        ("logistic_classifier", KNeighborsClassifier(n_neighbors = 3,n_jobs=-1))
        ])

logistic = pipeline.Pipeline([
        ("min_max_scaler", preprocessing.MinMaxScaler()),
        ("logistic_classifier", LogisticRegression(solver='lbfgs',n_jobs=-1))
        ])

rf = pipeline.Pipeline([
        ("min_max_scaler", preprocessing.MinMaxScaler()),
        ("logistic_classifier", RandomForestClassifier(n_estimators=100,n_jobs=-1))
        ])

nb = pipeline.Pipeline([
        ("min_max_scaler", preprocessing.MinMaxScaler()),
        ("logistic_classifier", GaussianNB())
        ])

perceptron = pipeline.Pipeline([
        ("min_max_scaler", preprocessing.MinMaxScaler()),
        ("logistic_classifier", Perceptron(n_jobs=-1))
        ])

sgd = pipeline.Pipeline([
        ("min_max_scaler", preprocessing.MinMaxScaler()),
        ("logistic_classifier", SGDClassifier(n_jobs=-1))
        ])

lsvc = pipeline.Pipeline([
        ("min_max_scaler", preprocessing.MinMaxScaler()),
        ("logistic_classifier", LinearSVC(max_iter=10000))
        ])

decision_tree = pipeline.Pipeline([
        ("min_max_scaler", preprocessing.MinMaxScaler()),
        ("logistic_classifier", DecisionTreeClassifier())
        ])

In [None]:
train_score_list = []
val_score_list   = []

model_dict = {
             'Random Forest':rf,
             'Decision Tree':decision_tree
             }

model_dict_dmy = {
             'Support Vector Machines':svc,
             'KNN':knn,
             'Logistic Regression':logistic,
             'Naive Bayes':nb,
             'Perceptron':perceptron,
             'Stochastic Gradient Descent':sgd,
             'Linear SVC':lsvc,
             }
for model_name,model in model_dict.items():
    train_score, val_score = get_roc(model,X_train,y_train,X_val,y_val)
    train_score_list.append(train_score)
    val_score_list.append(val_score)
    
for model_name,model in model_dict_dmy.items():
    train_score, val_score = get_roc(model,X_train_dmy,y_train_dmy,X_val_dmy,y_val_dmy)
    train_score_list.append(train_score)
    val_score_list.append(val_score)
    

score_pd = pd.DataFrame({'model':list(model_dict.keys()),'Train score':train_score_list, 
                         'Validation score':val_score_list}) 
score_pd.sort_values(by='Validation score', ascending=False)
score_pd

### Create Dummy Data