In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
import gc

## Prepare the feature

In [2]:
stack_list = [7,11,12,14,16,17] # chosen single models from 1st layer
x_train = []
x_test = []
for i in stack_list:
    res = pd.read_csv('train%d.csv'%i)
    y_train = res['label']
    x_train.append(res['train'])
    x_test.append(pd.read_csv('test%d.csv'%i)['test'])
x_train = np.transpose(np.asarray(x_train))
x_test = np.transpose(np.asarray(x_test))
print (np.shape(x_train), np.shape(x_test), np.shape(y_train))

(254386, 6) (172956, 6) (254386,)


## Different single models

### logistic regression

In [3]:
kf = KFold(n_splits=8)
n_fold = 0
score = 0
val_pred_list = []
test_pred_list = []
y_val_list = []
print ("+++++++++++++++++++ Logistic Regression +++++++++++++++++++++++")
for train_index, valid_index in kf.split(x_train):
    n_fold += 1
    print ("========== Fold %d: =========="%n_fold)
    
    # split samples
    x_tra, x_val = x_train[train_index], x_train[valid_index]
    y_tra, y_val = y_train[train_index], y_train[valid_index]
    
    # train the model
    model = linear_model.LogisticRegression(solver='saga', n_jobs=-1)
    model.fit(x_tra, y_tra)
    
    # predict
    val_pred = model.predict_proba(x_val)[:,1]
    val_pred_list.append(val_pred)
    y_val_list.append(y_val)
    test_pred = model.predict_proba(x_test)[:,1]
    test_pred_list.append(test_pred)
    
    score += log_loss(y_val, val_pred)
    del x_tra, x_val, y_tra, y_val, val_pred, test_pred, model
    gc.collect()
    
# make the feature
val_pred_list = np.concatenate(val_pred_list)
test_pred_list = np.asarray(test_pred_list)
y_val_list = np.concatenate(y_val_list)
test_pred_list = np.mean(test_pred_list, axis=0)

val_pred_list = np.squeeze(val_pred_list)
test_pred_list = np.squeeze(test_pred_list)

#np.shape(val_pred_list), np.shape(y_val_list), np.shape(test_pred_list)
print (score/8)

train = pd.DataFrame({'lr':val_pred_list, 'label':y_val_list})
test = pd.DataFrame({'lr':test_pred_list})

+++++++++++++++++++ Logistic Regression +++++++++++++++++++++++
0.18580469117820506


### xgboost

In [4]:
kf = KFold(n_splits=8)
n_fold = 0
score = 0
val_pred_list = []
test_pred_list = []
print ("+++++++++++++++++++ XGBoost +++++++++++++++++++++++")
for train_index, valid_index in kf.split(x_train):
    n_fold += 1
    print ("========== Fold %d: =========="%n_fold)
    
    # split samples
    x_tra, x_val = x_train[train_index], x_train[valid_index]
    y_tra, y_val = y_train[train_index], y_train[valid_index]

    # train the model
    model = xgb.XGBClassifier(
        #learning_rate = 0.02,
        n_estimators= 2000,
        max_depth= 5,
        min_child_weight= 3,
        gamma=0.9,                        
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        n_jobs= -1,
        scale_pos_weight=1)
    #model.fit(x_tra, y_tra, eval_set=list(zip(x_val, y_val)), eval_metric='logloss', early_stopping_rounds=2)
    model.fit(x_tra, y_tra)
    
    # predict
    val_pred = model.predict_proba(x_val)[:,1]
    val_pred_list.append(val_pred)
    test_pred = model.predict_proba(x_test)[:,1]
    test_pred_list.append(test_pred)
    
    score += log_loss(y_val, val_pred)
    del x_tra, x_val, y_tra, model, val_pred, test_pred
    gc.collect()
    
# make the feature
val_pred_list = np.concatenate(val_pred_list)
test_pred_list = np.asarray(test_pred_list)
test_pred_list = np.mean(test_pred_list, axis=0)

val_pred_list = np.squeeze(val_pred_list)
test_pred_list = np.squeeze(test_pred_list)

#np.shape(val_pred_list), np.shape(test_pred_list)
print (score/8.0)
train.insert(1, 'xgb', val_pred_list)
test.insert(1, 'xgb', test_pred_list)

+++++++++++++++++++ XGBoost +++++++++++++++++++++++
0.17278295074179337


### SVM

In [10]:
kf = KFold(n_splits=8)
n_fold = 0
score = 0
val_pred_list = []
test_pred_list = []
print ("+++++++++++++++++++ SVM +++++++++++++++++++++++")
for train_index, valid_index in kf.split(x_train):
    n_fold += 1
    print ("========== Fold %d: =========="%n_fold)
    
    # split samples
    x_tra, x_val = x_train[train_index], x_train[valid_index]
    y_tra, y_val = y_train[train_index], y_train[valid_index]
    
    # train the model
    model = LinearSVC()
    model.fit(x_tra, y_tra)
    
    # predict
    val_pred = np.clip(model.predict(x_val), 0.08, 0.92)
    val_pred_list.append(val_pred)
    test_pred = np.clip(model.predict(x_test), 0.08, 0.92)
    test_pred_list.append(test_pred)
    
    score += log_loss(y_val, val_pred)
    del x_tra, x_val, y_tra, model, val_pred, test_pred
    gc.collect()
    
# make the feature
val_pred_list = np.concatenate(val_pred_list)
test_pred_list = np.asarray(test_pred_list)
test_pred_list = np.mean(test_pred_list, axis=0)

val_pred_list = np.squeeze(val_pred_list)
test_pred_list = np.squeeze(test_pred_list)

#np.shape(val_pred_list), np.shape(test_pred_list)
print (score/8)

train.insert(1, 'svc', val_pred_list)
test.insert(1, 'svc', test_pred_list)

+++++++++++++++++++ SVM +++++++++++++++++++++++
0.2841522483157444


ValueError: cannot insert svc, already exists

### random forest

In [6]:
kf = KFold(n_splits=8)
n_fold = 0
score = 0
val_pred_list = []
test_pred_list = []
print ("+++++++++++++++++++ Random Forest +++++++++++++++++++++++")
for train_index, valid_index in kf.split(x_train):
    n_fold += 1
    print ("========== Fold %d: =========="%n_fold)
    
    # split samples
    x_tra, x_val = x_train[train_index], x_train[valid_index]
    y_tra, y_val = y_train[train_index], y_train[valid_index]
    
    # train the model
    model = RandomForestClassifier(max_depth=8, n_jobs=-1)
    model.fit(x_tra, y_tra)
    
    # predict
    val_pred = model.predict_proba(x_val)[:,1]
    val_pred_list.append(val_pred)
    test_pred = model.predict_proba(x_test)[:,1]
    test_pred_list.append(test_pred)
    
    score += log_loss(y_val, val_pred)
    del x_tra, x_val, y_tra, model, val_pred, test_pred
    gc.collect()
    
# make the feature
val_pred_list = np.concatenate(val_pred_list)
test_pred_list = np.asarray(test_pred_list)
test_pred_list = np.mean(test_pred_list, axis=0)

val_pred_list = np.squeeze(val_pred_list)
test_pred_list = np.squeeze(test_pred_list)

#np.shape(val_pred_list), np.shape(test_pred_list)
print (score/8)
train.insert(1, 'rf', val_pred_list)
test.insert(1, 'rf', test_pred_list)

+++++++++++++++++++ Random Forest +++++++++++++++++++++++
0.16730529988876292


### K nearest neighbor

In [7]:
kf = KFold(n_splits=8)
n_fold = 0
score = 0
val_pred_list = []
test_pred_list = []
print ("+++++++++++++++++++ KNN +++++++++++++++++++++++")
for train_index, valid_index in kf.split(x_train):
    n_fold += 1
    print ("========== Fold %d: =========="%n_fold)
    
    # split samples
    x_tra, x_val = x_train[train_index], x_train[valid_index]
    y_tra, y_val = y_train[train_index], y_train[valid_index]
    
    # train the model
    model = KNeighborsClassifier(n_neighbors=20, n_jobs=-1)
    model.fit(x_tra, y_tra)
    
    # predict
    val_pred = np.clip(model.predict_proba(x_val)[:,1], 0.01, 0.99)
    val_pred_list.append(val_pred)
    test_pred = np.clip(model.predict_proba(x_test)[:,1], 0.01, 0.99)
    test_pred_list.append(test_pred)
    
    score += log_loss(y_val, val_pred)
    del x_tra, x_val, y_tra, model, val_pred, test_pred
    gc.collect()
    
# make the feature
val_pred_list = np.concatenate(val_pred_list)
test_pred_list = np.asarray(test_pred_list)
test_pred_list = np.mean(test_pred_list, axis=0)

val_pred_list = np.squeeze(val_pred_list)
test_pred_list = np.squeeze(test_pred_list)

#np.shape(val_pred_list), np.shape(test_pred_list)
print (score/8)
train.insert(1, 'knn', val_pred_list)
test.insert(1, 'knn', test_pred_list)

+++++++++++++++++++ KNN +++++++++++++++++++++++
0.18083557392347735


### Multi layer perceptron

In [8]:
kf = KFold(n_splits=8)
n_fold = 0
score = 0
val_pred_list = []
test_pred_list = []
print ("+++++++++++++++++++ MLP +++++++++++++++++++++++")
for train_index, valid_index in kf.split(x_train):
    n_fold += 1
    print ("========== Fold %d: =========="%n_fold)
    
    # split samples
    x_tra, x_val = x_train[train_index], x_train[valid_index]
    y_tra, y_val = y_train[train_index], y_train[valid_index]
    
    # train the model
    model = MLPClassifier(hidden_layer_sizes=(100, 100), batch_size=256, max_iter=40, early_stopping=True)
    model.fit(x_tra, y_tra)
    
    # predict
    val_pred = model.predict_proba(x_val)[:,1]
    val_pred_list.append(val_pred)
    test_pred = model.predict_proba(x_test)[:,1]
    test_pred_list.append(test_pred)
    
    score += log_loss(y_val, val_pred)
    del x_tra, x_val, y_tra, model, val_pred, test_pred
    gc.collect()
    
# make the feature
val_pred_list = np.concatenate(val_pred_list)
test_pred_list = np.asarray(test_pred_list)
test_pred_list = np.mean(test_pred_list, axis=0)

val_pred_list = np.squeeze(val_pred_list)
test_pred_list = np.squeeze(test_pred_list)

#np.shape(val_pred_list), np.shape(test_pred_list)
print (score/8)
train.insert(1, 'mlp', val_pred_list)
test.insert(1, 'mlp', test_pred_list)

+++++++++++++++++++ MLP +++++++++++++++++++++++
0.17311126660649262


## Write all features into CSV file

In [9]:
train.to_csv('train_2s.csv', index=False)
test.to_csv('test_2s.csv', index=False)