## Imports

In [1]:
DEV = True

import pandas as pd
import numpy as np

import os
import json

from os import listdir
from os.path import isfile
from tqdm import tqdm_notebook as tqdm

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
%matplotlib inline

import lightgbm as lgb
import catboost as cb

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score as auc
from sklearn.metrics import fbeta_score, cohen_kappa_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.externals import joblib
from sklearn.preprocessing import scale

from bayes_opt import BayesianOptimization
from bayes_opt.observer import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs

def lgb_f1_score(y_hat, data):
    """Custom LightGBM evaluation function reporting the F1 score.

    y_hat -- raw predictions handed over by LightGBM
    data  -- object exposing ``get_label()`` with the true labels

    Returns the ``(name, value, is_higher_better)`` triple.
    """
    labels = data.get_label()
    # f1_score needs hard class labels, so round the predictions first.
    hard_predictions = np.round(y_hat)
    score = f1_score(labels, hard_predictions)
    return 'f1', score, True

def ensure_dir(file_path):
    """Create the directory component of ``file_path`` if it does not exist.

    Only ``os.path.dirname(file_path)`` is created, so a path ending in a
    separator (e.g. ``"../models/"``) creates that directory itself, while
    ``"a/b/file.txt"`` creates ``a/b``.

    A path without any directory component (e.g. ``"file.txt"``) is a no-op;
    previously it crashed because ``os.makedirs("")`` raises.
    """
    directory = os.path.dirname(file_path)
    if directory:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(directory, exist_ok=True)
        
# Locations used by the notebook, relative to the working directory.
input_path = "../input/"
output_path = "../output/"
model_path = "../models/"

# Create the writable directories up front (input_path is only read from).
for _directory in (model_path, output_path):
    ensure_dir(_directory)

## Read data

In [2]:
# Training data: split the target off and drop the identifier column.
train_df = pd.read_csv(input_path+'train.csv.zip')

label = train_df["target"]
train = train_df.drop(columns=['ID_code','target'])

# Test data: only the identifier has to go.
test = pd.read_csv(input_path+'test.csv.zip').drop(columns=['ID_code'])

# Pre-computed subset of test rows, restricted to the training feature columns.
test_filtered = pd.read_pickle(input_path+'test_filtered.pkl')
test_filtered = test_filtered[train.columns]

# Train and filtered test stacked on a fresh 0..n-1 index.
train_test = pd.concat([train,test_filtered], ignore_index=True)

In [3]:
vcs_train = {}
vcs_test = {}

# Per-column value counts over train + filtered test, divided by a fixed 300000.
# NOTE(review): 300000 is presumably len(train_test) - confirm against the data.
vcs_train_test = {
    col: train_test[col].value_counts()/300000
    for col in tqdm(train.columns)
}

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [4]:
def feature_generator(df):
    """Add value-frequency features to ``df`` in place.

    For every column ``var_0`` ... ``var_199`` two columns are appended:

    * ``<col>_train_test_sum_vcs``      -- the entry of ``vcs_train_test[col]``
      for the row's value, or 0 when the value is not in that table
    * ``<col>_train_test_sum_vcs_prod`` -- the raw value times that entry

    Reads the module-level ``vcs_train_test`` dict. Returns None.
    """
    for i in tqdm(range(200)):
        col = "var_"+str(i)
        vtraintest = vcs_train_test[col]
        # Use map() rather than vtraintest[df[col]]: label-indexing with values
        # absent from the table raises KeyError in current pandas, so the
        # fillna(0) fallback for unseen values would never be reached.
        t = df[col].map(vtraintest).fillna(0).values

        df[col+'_train_test_sum_vcs'] = t
        df[col+'_train_test_sum_vcs_prod'] = df[col]*t

feature_generator(train)
feature_generator(test)

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [6]:
# Short aliases; these are the same objects as train/test, not copies.
df, dt = train, test

# Feature columns in sorted order, excluding the target column.
cols = sorted(set(df.columns).difference(["label"]))
print(len(cols))

# Attach the target only after the feature list has been fixed.
df["label"] = label

600


In [7]:
# Class balance of the target.
df["label"].value_counts()

0    179902
1     20098
Name: label, dtype: int64

## Scaling

In [None]:
# Standardise the features; drop-in alternatives: MinMaxScaler, RobustScaler.
# NOTE(review): the scaler is fitted on all training rows before the CV split.
scaler = StandardScaler()
scaler.fit(df[cols])

X = scaler.transform(df[cols])   # train matrix
Z = scaler.transform(dt[cols])   # test matrix, scaled with the train statistics
y = list(df["label"])

# Fold generator shared by the model cells below.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Logistic regression

In [None]:
%%time

df["clf"] = 0
dt["clf"] = 0

i = -1
for train_index, valid_index in skf.split(X, y):
    i+=1
        
    X_train = X[train_index, :]
    X_valid = X[valid_index, :]

    y_train = df.loc[train_index, "label"]
    y_valid = df.loc[valid_index, "label"]
    
    fname = model_path+'clf_'+str(i)+'.pkl'
    if DEV or not isfile(fname):
        c = 1
        clf = LogisticRegression(C=c,
                                 solver="newton-cg",
                                 penalty="l2", 
                                 n_jobs=-1, 
                                 max_iter=1000).fit(X_train, y_train) 
        joblib.dump(clf, fname)
    else:
        clf = joblib.load(fname)

    y_pred = clf.predict_proba(X_valid)[:,1] 
    df.loc[valid_index, "clf"] = y_pred
    print("ROC AUC:", round(auc(y_valid, y_pred), 4), 
          "\nF-score:", round(fbeta_score(y_valid, y_pred>0.5, beta=1), 4), i)
    
    dt["clf"] += clf.predict(Z)/5
    

print("\nROC AUC:", round(auc(df.label, df["clf"]), 4), 
      "\nF-score:", round(fbeta_score(df.label, df["clf"]>0.5, beta=1), 4))

## SVM

In [None]:
#cols+=["clf"]

# Re-derive the scaled matrices and the fold generator for the SVM section.
# Drop-in scaler alternatives: MinMaxScaler, RobustScaler.
scaler = StandardScaler()
scaler.fit(df[cols])

X = scaler.transform(df[cols])
Z = scaler.transform(dt[cols])
y = list(df["label"])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
%%time

df["svc"] = 0
dt["svc"] = 0

i = -1
for train_index, valid_index in skf.split(X, y):
    i+=1
        
    X_train = X[train_index, :]
    X_valid = X[valid_index, :]

    y_train = df.loc[train_index, "label"]
    y_valid = df.loc[valid_index, "label"]
    
    fname = model_path+'svc_'+str(i)+'.pkl'
    #print(fname)
    if DEV or not isfile(fname):
        c = 100
        svc = SVC(C=c,
                  probability=True).fit(X_train, y_train) 
        
        joblib.dump(svc, fname)
    else:
        svc = joblib.load(fname)

    y_pred = svc.predict_proba(X_valid)[:,1] 
    df.loc[valid_index, "svc"] = y_pred
    print("ROC AUC:", round(auc(y_valid, y_pred), 4), 
          "\nF-score:", round(fbeta_score(y_valid, y_pred>0.5, beta=1), 4), i)
    
    dt["svc"] += svc.predict(Z)/5


print("\nROC AUC:", round(auc(df.label, df["svc"]), 4), 
      "\nF-score:", round(fbeta_score(df.label, df["svc"]>0.5, beta=1), 4))

## Nearest Neighbors

In [None]:
#cols+=["svc"]

# Re-derive the scaled matrices and the fold generator for the nearest-neighbour section.
# Drop-in scaler alternatives: MinMaxScaler, RobustScaler.
scaler = StandardScaler()
scaler.fit(df[cols])

X = scaler.transform(df[cols])
Z = scaler.transform(dt[cols])
y = list(df["label"])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
%%time

df["nei"] = 0
dt["nei"] = 0

i = -1
for train_index, valid_index in skf.split(X, y):
    i+=1
        
    X_train = X[train_index, :]
    X_valid = X[valid_index, :]

    y_train = df.loc[train_index, "label"]
    y_valid = df.loc[valid_index, "label"]
    
    fname = model_path+'nei_'+str(i)+'.pkl'
    if DEV or not isfile(fname):
        nei = KNeighborsClassifier(n_neighbors=50,
                                   p=1, n_jobs=-1).fit(X_train, y_train) 
        joblib.dump(nei, fname)
    else:
        nei = joblib.load(fname)

    y_pred = nei.predict_proba(X_valid)[:,1] 
    df.loc[valid_index, "nei"] = y_pred
    print("ROC AUC:", round(auc(y_valid, y_pred), 4), 
          "\nF-score:", round(fbeta_score(y_valid, y_pred>0.5, beta=1), 4), i)
    
    dt["nei"] += nei.predict(Z)/5
    

print("\nROC AUC:", round(auc(df.label, df["nei"]), 4), 
      "\nF-score:", round(fbeta_score(df.label, df["nei"]>0.5, beta=1), 4))

## LightGBM

In [8]:
#cols+=["nei"]

# Re-derive the scaled matrices and the fold generator for the LightGBM section.
# Drop-in scaler alternatives: MinMaxScaler, RobustScaler.
scaler = StandardScaler()
scaler.fit(df[cols])

X = scaler.transform(df[cols])
Z = scaler.transform(dt[cols])
y = list(df["label"])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [9]:
# Upper bound on boosting rounds and the early-stopping patience passed to lgb.train.
rounds = 10000
early_stop_rounds = 200

# LightGBM training parameters.
# Disabled options kept for reference:
#   'min_data_in_leaf': 20, 'num_leaves': 2**5-1, 'weight': [1, 0.5],
#   'device': 'gpu', 'gpu_platform_id': '0', 'gpu_device_id': '0'
params = dict(
    lambda_l1=0,
    lambda_l2=10,
    feature_fraction=0.9,
    learning_rate=0.03,
    max_depth=4,
    boosting_type='gbrt',  # dart gbrt
    objective='binary',
    metric='auc',
    max_bin=1024,
    n_jobs=-1,
)

In [11]:
%%time

df["lgb"] = 0
dt["lgb"] = 0

i = -1
for train_index, valid_index in skf.split(X, y):
    i+=1
        
    X_train = X[train_index, :]
    X_valid = X[valid_index, :]

    y_train = df.loc[train_index, "label"]
    y_valid = df.loc[valid_index, "label"]

    d_train = lgb.Dataset(X_train, y_train, feature_name=cols)
    d_valid = lgb.Dataset(X_valid, y_valid, feature_name=cols)    
    
    fname = model_path+'lgb_'+str(i)+'.pkl'
    if DEV or not isfile(fname):
        model = lgb.train(params,
                            d_train,
                            num_boost_round=rounds,
                            valid_sets=[d_train, d_valid],
                            valid_names=['train','valid'],
                            #feval=lgb_f1_score,
                            early_stopping_rounds=early_stop_rounds,
                            verbose_eval=100)
        
        joblib.dump(model, fname)
    else:
        model = joblib.load(fname)
    
    
    y_pred = model.predict(X_valid)
    df.loc[valid_index, "lgb"] = y_pred
    print("ROC AUC:", round(auc(y_valid, y_pred), 4), 
      "\nF-score:", round(fbeta_score(y_valid, y_pred>0.5, beta=1), 4), i)
    
    dt["lgb"] += model.predict(Z)/5
    break
    
print("\nROC AUC:", round(auc(df.label, df["lgb"]), 4), 
      "\nF-score:", round(fbeta_score(df.label, df["lgb"]>0.5, beta=1), 4))

# [3992]	train's auc: 0.989264	valid's auc: 0.913532
# ROC AUC: 0.9135 
# F-score: 0.5195 0

Training until validation scores don't improve for 200 rounds.
[100]	train's auc: 0.836572	valid's auc: 0.808477
[200]	train's auc: 0.876608	valid's auc: 0.844941
[300]	train's auc: 0.898476	valid's auc: 0.862659
[400]	train's auc: 0.912644	valid's auc: 0.874368
[500]	train's auc: 0.922846	valid's auc: 0.881846
[600]	train's auc: 0.930758	valid's auc: 0.887439
[700]	train's auc: 0.936658	valid's auc: 0.891257
[800]	train's auc: 0.941571	valid's auc: 0.894522
[900]	train's auc: 0.945676	valid's auc: 0.897096
[1000]	train's auc: 0.949255	valid's auc: 0.899294
[1100]	train's auc: 0.952249	valid's auc: 0.901108
[1200]	train's auc: 0.954903	valid's auc: 0.902687
[1300]	train's auc: 0.957296	valid's auc: 0.903811
[1400]	train's auc: 0.959548	valid's auc: 0.90502
[1500]	train's auc: 0.961488	valid's auc: 0.905927
[1600]	train's auc: 0.963235	valid's auc: 0.906565
[1700]	train's auc: 0.964867	valid's auc: 0.907214
[1800]	train's auc: 0.966461	valid's auc: 0.907862
[1900]	train's auc: 0.967905	

In [None]:
# Weighted average of the logistic-regression, SVC and LightGBM predictions
# (the nearest-neighbour column is not part of the blend).
w = [1,2,5]
blend_columns = ["clf", "svc", "lgb"]

for frame in (df, dt):
    weighted_sum = sum(weight*frame[name] for weight, name in zip(w, blend_columns))
    frame["avg"] = weighted_sum/sum(w)

print("\nROC AUC:", round(auc(df.label, df["avg"]), 4),
      "\nF-score:", round(fbeta_score(df.label, df["avg"]>0.5, beta=1), 4))

In [None]:
# Share of rows with a blended score above 0.5: (test, train).
tuple(np.sum(frame["avg"] > 0.5) / len(frame) for frame in (dt, df))

In [None]:
# Positive rate among the first two fifths of the test rows.
predictions = list(dt["avg"] > 0.5)
cutoff = 2*len(predictions)//5
np.mean(predictions[:cutoff])

In [None]:
# dt has no "Image" column, so selecting it raised KeyError. The identifier
# column was dropped from `test` when it was loaded, so read it back here;
# dt still has the row order of test.csv.
# NOTE(review): the "ID_code"/"target" header is assumed from the train file's
# column names - confirm it matches the expected submission format.
submission = pd.read_csv(input_path+'test.csv.zip', usecols=['ID_code'])
submission["target"] = dt["avg"].values
submission.to_csv("test_prediction_top5.csv", index=False)

In [None]:
# Importance chart for `model`, i.e. whichever booster the LightGBM loop left behind.
# The figure is very tall so that every feature gets its own readable bar.
fig = plt.figure(figsize=(15, 300))
ax = fig.add_subplot(1, 1, 1)
lgb.plot_importance(model, max_num_features=len(cols), ax=ax)
plt.title("Light GBM Feature Importance")

Text(0.5,1,'Light GBM Feature Importance')

## False positive

In [None]:
# Rows scored above 0.5 by LightGBM whose label is not 1.
is_false_positive = (df["lgb"] > 0.5) & (df["label"] != 1)
df[is_false_positive]

## False negative

In [None]:
# Rows scored below 0.5 by LightGBM whose label is 1.
is_false_negative = (df["lgb"] < 0.5) & (df["label"] == 1)
df[is_false_negative]

## Correlation

In [None]:
# Correlation of every column with the label. corrwith computes only these
# pairs instead of building the full column-by-column matrix and reading
# a single column from it.
df.corrwith(df["label"]).rename("label")