In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import data

from sklearn.decomposition import PCA,TruncatedSVD,NMF
from sklearn.feature_selection import (chi2, f_classif,mutual_info_classif,
                                        SelectKBest,SelectFromModel,VarianceThreshold,RFECV)
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion,TransformerMixin
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import (AdaBoostClassifier,BaggingClassifier,ExtraTreesClassifier,
                              GradientBoostingClassifier,RandomForestClassifier,VotingClassifier)
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.svm import LinearSVC,SVC,NuSVC
from sklearn.naive_bayes import BernoulliNB,GaussianNB,MultinomialNB
from sklearn.linear_model import LogisticRegression,RidgeClassifier,SGDClassifier,PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier,RadiusNeighborsClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Drop same or zero columns
X_train,y_train,X_test = data.load_data()
same_columns = []
for i in range(0,595):
    if i not in same_columns:
        counter = i + 1
        while counter < 595:
            if X_train.iloc[:,i].equals(X_train.iloc[:,counter]):
                same_columns.append(counter)
            counter += 1
same_columns.append(2)
same_columns = list(set(same_columns))
for i in same_columns:
    X_train.drop(columns='X'+str(i+1), axis=1, inplace=True)
    X_test.drop(columns='X'+str(i+1), axis=1, inplace=True)

## Calculate Feature Importance using ensemble methods
----------

In [3]:
def calculate_importance(model, X,y):
    selector = SelectFromModel(model, threshold=1e-4)
    selector.fit(X, y)
    support = selector.get_support()
    feature = X.loc[:,support].columns.tolist()
    print(str(len(feature)), 'selected features')
    print(calculate_score(model,X,y))
    return support

def select_k_best(score,X_norm,X,y,k=150):
    selector = SelectKBest(score, k=k)
    selector.fit(X_norm, y)
    support = selector.get_support()
    feature = X.loc[:,support].columns.tolist()
    print(str(len(feature)), 'selected features')
    return support

def calculate_mean_importance(model_list,X,y):
    mean_importances = np.zeros([496])
    for model in model_list:
        model.fit(X_train, y_train)
        if np.around(model.feature_importances_.sum(), decimals=1) == 1.0:
            mean_importances += model.feature_importances_
        else:
            print(model)
            print(model.feature_importances_.sum())
    return mean_importances/len(model_list)
        
def calculate_score(model,X,y):
    model.fit(X,y)
    y_pred = model.predict(X)
    return accuracy_score(y,y_pred)

In [4]:
models = []
support = []
model = RandomForestClassifier(n_estimators=100, max_depth=5)
models.append(model)
model = AdaBoostClassifier(learning_rate=0.1,n_estimators=100)
models.append(model)
#model = BaggingClassifier(n_estimators=100)
#selector = SelectFromModel(model, prefit=False)
model = ExtraTreesClassifier(n_estimators=100)
models.append(model)
model = GradientBoostingClassifier(learning_rate=0.1,n_estimators=100)
models.append(model)
model = DecisionTreeClassifier(max_depth=7)
models.append(model)
model = ExtraTreeClassifier(max_depth=15)
models.append(model)
model = XGBClassifier()
models.append(model)

average_importances = calculate_mean_importance(models,X_train,y_train)
threshold = 0.005
support_average = average_importances > threshold
support.append(support_average)
feature = X_train.loc[:,support_average].columns.tolist()
print(str(len(feature)), 'selected features')

model = LogisticRegression()
models.append(model)
model = LGBMClassifier()
models.append(model)


selector = VarianceThreshold(0.0008)
selector.fit(X_train)
support.append(selector.get_support())
feature = X_train.loc[:,selector.get_support()].columns.tolist()
print(str(len(feature)), 'selected features')

X_norm = MinMaxScaler().fit_transform(X_train)
support.append(select_k_best(chi2,X_norm,X_train, y_train, k=100))
support.append(select_k_best(f_classif,X_norm,X_train, y_train))
support.append(select_k_best(mutual_info_classif,X_norm, X_train, y_train))

#rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10, verbose=5)
selector = RFECV(LogisticRegression(C=10), step=1, cv=5,scoring='accuracy')
selector = selector.fit(X_norm, y_train)
support.append(selector.support_)
feature = X_train.loc[:,selector.support_].columns.tolist()
print(str(len(feature)), 'selected features')
selector = RFECV(SVC(kernel="linear"), step=1, cv=5,scoring='accuracy')
selector = selector.fit(X_norm, y_train)
support.append(selector.support_)
feature = X_train.loc[:,selector.support_].columns.tolist()
print(str(len(feature)), 'selected features')


for model in models:
    support.append(calculate_importance(model,X_train, y_train))

49 selected features
388 selected features
100 selected features
150 selected features
150 selected features
97 selected features
61 selected features
440 selected features
1.0
45 selected features
1.0
496 selected features
1.0
163 selected features
1.0
15 selected features
0.9916666666666667
33 selected features
1.0
180 selected features
1.0
496 selected features
0.7333333333333333
133 selected features
1.0


In [5]:
feature_name = X_train.columns.tolist()
voting_dict = {}
voting_dict['Feature'] = feature_name
for i,vector in enumerate(support):
    voting_dict[i] = vector

# put all selection together
feature_selection_df = pd.DataFrame(voting_dict)
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
feature_selection_df['Average'] = average_importances
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Average','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(130)

Unnamed: 0,Feature,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,Total,Average
1,X163,True,True,True,True,True,False,False,True,True,True,True,True,True,True,True,True,14,0.033345
2,X33,True,True,True,True,True,True,False,True,True,True,True,True,False,True,True,True,14,0.021884
3,X243,True,True,True,True,True,True,False,True,True,True,True,False,True,True,True,True,14,0.006209
4,X347,True,False,True,True,True,True,True,True,True,True,True,False,True,True,True,True,14,0.005046
5,X453,True,True,True,True,False,True,True,False,True,True,True,False,True,True,True,True,13,0.019258
6,X201,True,True,True,True,False,True,True,True,False,True,True,True,False,True,True,True,13,0.014472
7,X581,True,False,True,True,True,True,True,True,True,True,True,False,False,True,True,True,13,0.009347
8,X238,True,False,True,True,False,True,True,True,True,True,True,False,True,True,True,True,13,0.007914
9,X281,True,True,True,True,True,False,False,True,True,True,True,False,False,True,True,True,12,0.021522
10,X387,True,True,True,True,False,True,False,True,True,True,True,False,False,True,True,True,12,0.008702


In [6]:
def grid_search(model,grid_params,X_train,y_train, features):
    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    grid = GridSearchCV(model, grid_params, cv=k_fold, n_jobs=-1,refit = True, verbose=3)
    grid.fit(X_train.loc[:,features],y_train)
    print("k-fold:", grid.best_score_)
    y_pred = grid.best_estimator_.predict(X_train.loc[:,features])
    print("training data:", accuracy_score(y_pred,y_train))
    return grid

def output(grid, X_test, features):
    y_pred = grid.best_estimator_.predict(X_test.loc[:,features])
    data.write_output(y_pred.reshape(-1,1))

In [7]:
class ModelTransformer(TransformerMixin):

    def __init__(self, model):
        self.model = model

    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        return pd.DataFrame(self.model.predict(X))

classifier = FeatureUnion([
                ('gbr', ModelTransformer(GradientBoostingClassifier())),
                ('dtr', ModelTransformer(DecisionTreeClassifier())),
                ('etr', ModelTransformer(ExtraTreesClassifier())),
                ('rfr', ModelTransformer(RandomForestClassifier())),
                ('par', ModelTransformer(LGBMClassifier())),
                ('ada', ModelTransformer(AdaBoostClassifier())),
    ])

In [8]:
def select_k_features(k=102):
    return feature_selection_df.Feature[0:k].tolist()

In [97]:
# Logistic Regression
model = LogisticRegression(dual=True, max_iter=1000)
gridParams = {
    'C':[0.001,0.01,0.1,1,10,30,50,100,1000],
}
grid_lo = grid_search(model,gridParams, X_train,y_train,select_k_features(14))

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


k-fold: 0.675
training data: 0.7583333333333333


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    1.8s finished


In [98]:
grid_lo.best_params_

{'C': 1000}

In [9]:
print("Pipeline with PCA:")
pipe = Pipeline([
    ('reduce_dim',PCA()),
    ('classify', model)
])
N_FEATURES_OPTIONS = list(range(2,50,10))
gridParams = [
    {
        'reduce_dim': [PCA(), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify__C': [0.001,0.01,0.1,1,10,30,50,100,1000],
    }]
grid1_lo = grid_search(pipe,gridParams, X_train,y_train,select_k_features())

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  38 out of  45 | elapsed:    1.8s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    1.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


k-fold: 0.6833333333333333
training data: 0.7666666666666667
Pipeline with PCA:
Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 262 tasks      | elapsed:   19.0s


k-fold: 0.6833333333333333
training data: 0.8583333333333333


[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:   36.6s finished


In [None]:
# SVC
model = SVC()
C_OPTIONS = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
gamma = [1,0.1,0.001,0.0001]
gridParams = {
        'C': C_OPTIONS,
        'gamma':gamma,
        'kernel':['linear','rbf']
    }
grid_svc = grid_search(model,gridParams, X_train, y_train, features)

print("Pipeline with PCA:")
pipe = Pipeline([
    ('reduce_dim',PCA()),
    ('classify', model)
])
N_FEATURES_OPTIONS = list(range(2,50,10))
gridParams = [
    {
        'reduce_dim': [PCA(), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify': [model],
        'classify__C': C_OPTIONS,
        'classify__gamma': gamma,
        'classify__kernel': ['linear', 'rbf']
    }]
grid1_svc = grid_search(pipe,gridParams, X_train,y_train,features)

In [19]:
# Linear SVC
model = LinearSVC()
gridParams = {
    'dual': [True, False],
    'C': [0.001,0.01,0.1,0.25,1,10,100],
    'penalty': ["l2"]
    }
grid_li = grid_search(model,gridParams, X_train, y_train, select_k_features(20))


Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


k-fold: 0.7
training data: 0.825


[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:    0.3s finished


In [20]:
print("Pipeline with PCA:")
pipe = Pipeline([
    ('reduce_dim',PCA()),
    ('classify', model)
])
N_FEATURES_OPTIONS = list(range(2,50,10))
gridParams = [
    {
        'reduce_dim': [PCA(), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify': [model],
        'classify__C': [0.001,0.01,0.1,0.25,1,10,100],
        'classify__dual': [False],
        'classify__penalty': ['l1', 'l2']
    }]
grid1_li = grid_search(pipe,gridParams, X_train,y_train,select_k_features())

Pipeline with PCA:
Fitting 5 folds for each of 140 candidates, totalling 700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 543 tasks      | elapsed:   42.6s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:   58.0s finished


k-fold: 0.6916666666666667
training data: 0.75


In [32]:
# SGD Classifier
model = SGDClassifier()
gridParams = {
    'alpha': [10 ** x for x in range(-6, 1)],
    'l1_ratio': [0, 0.05, 0.1, 0.2, 0.5, 0.8, 0.9, 0.95, 1],
    'loss': ['log', 'hinge']
}
grid_sgd = grid_search(model,gridParams, X_train, y_train, select_k_features(20))
grid_sgd.best_params_

Fitting 5 folds for each of 126 candidates, totalling 630 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 392 tasks      | elapsed:    1.0s


k-fold: 0.6416666666666667
training data: 0.675


[Parallel(n_jobs=-1)]: Done 630 out of 630 | elapsed:    1.4s finished


{'alpha': 0.1, 'l1_ratio': 0.9, 'loss': 'log'}

In [34]:
print("Pipeline with PCA:")
pipe = Pipeline([
    ('reduce_dim',PCA()),
    ('classify', model)
])
N_FEATURES_OPTIONS = list(range(2,50,10))
gridParams = [
    {
        'reduce_dim': [PCA(), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify': [model],
        'classify__l1_ratio': [0, 0.05, 0.15, 0.5, 0.8, 0.95,1],
        'classify__alpha': [10 ** x for x in range(-6, 1)],
        'classify__loss': ['log', 'hinge','perceptron']
    }]
grid1_sgd = grid_search(pipe,gridParams, X_train,y_train,select_k_features())

Pipeline with PCA:
Fitting 5 folds for each of 1470 candidates, totalling 7350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 167 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 360 tasks      | elapsed:   31.0s
[Parallel(n_jobs=-1)]: Done 584 tasks      | elapsed:   53.6s
[Parallel(n_jobs=-1)]: Done 872 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1224 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1707 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 2224 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 2866 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 3592 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 4513 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 5351 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 6276 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 7140 tasks      | elapsed: 11.0min


k-fold: 0.675
training data: 0.7416666666666667


[Parallel(n_jobs=-1)]: Done 7350 out of 7350 | elapsed: 11.4min finished


In [39]:
# PassiveAggressiveClassifier
model = PassiveAggressiveClassifier(fit_intercept=True, early_stopping=True, average=10)
gridParams = {
    'C': [0.001,0.01,0.1,0.25,1,10,100],
    'loss': ['squared_hinge', 'hinge']
}
grid_pa = grid_search(model,gridParams, X_train, y_train, select_k_features(20))

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


k-fold: 0.6583333333333333
training data: 0.6916666666666667


[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:    0.3s finished


In [40]:
print("Pipeline with PCA:")
pipe = Pipeline([
    ('reduce_dim',PCA()),
    ('classify', model)
])
N_FEATURES_OPTIONS = list(range(2,50,10))
gridParams = [
    {
        'reduce_dim': [PCA(), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify': [model],
        'classify__C': [0.001,0.01,0.1,0.25,1,10,100],
        'classify__loss': ['squared_hinge', 'hinge']
    }]
grid1_pa = grid_search(pipe,gridParams, X_train,y_train,select_k_features())

Pipeline with PCA:
Fitting 5 folds for each of 140 candidates, totalling 700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 153 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 610 tasks      | elapsed:   51.5s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:  1.0min finished


k-fold: 0.7
training data: 0.7333333333333333


In [47]:
# Ridge Classifier
model = RidgeClassifier(fit_intercept = True, normalize=True)
gridParams = {
    'solver' : ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    'alpha': [10 ** x for x in range(-6, 2)]
}
grid_ri = grid_search(model,gridParams, X_train, y_train, select_k_features(14))

Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


k-fold: 0.6833333333333333
training data: 0.75


[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed:    0.9s finished


In [48]:
print("Pipeline with PCA:")
pipe = Pipeline([
    ('reduce_dim',PCA()),
    ('classify', model)
])
N_FEATURES_OPTIONS = list(range(2,50,10))
gridParams = [
    {
        'reduce_dim': [PCA(), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify': [model],
        'classify__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
        'classify__alpha': [10 ** x for x in range(-6, 2)],
    }]
grid1_ri = grid_search(pipe,gridParams, X_train,y_train,select_k_features())

Pipeline with PCA:
Fitting 5 folds for each of 480 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 240 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 420 tasks      | elapsed:   34.8s
[Parallel(n_jobs=-1)]: Done 658 tasks      | elapsed:   58.9s
[Parallel(n_jobs=-1)]: Done 946 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1298 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1714 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 2194 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed:  3.8min finished


k-fold: 0.675
training data: 0.8083333333333333


In [50]:
# Xgbclassifier Classifier
features = feature_selection_df.Feature[0:14].tolist()
model = XGBClassifier(learning_rate=0.02, n_estimators=100,
                    silent=True, nthread=1)
gridParams = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [2,3, 4, 5]
        }

grid_xgb = grid_search(model,gridParams, X_train,y_train,select_k_features(14))

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 680 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 1640 tasks      | elapsed:   12.7s


k-fold: 0.7166666666666667
training data: 0.9583333333333334


[Parallel(n_jobs=-1)]: Done 2700 out of 2700 | elapsed:   22.3s finished


In [51]:
grid_xgb.best_params_

{'colsample_bytree': 0.8,
 'gamma': 1,
 'max_depth': 3,
 'min_child_weight': 1,
 'subsample': 0.6}

In [None]:
features = feature_selection_df.Feature[0:101].tolist()
print("Pipeline with PCA:")
pipe = Pipeline([
    ('reduce_dim',PCA()),
    ('classify', model)
])
N_FEATURES_OPTIONS = list(range(2,50,10))
gridParams = [
    {
        'reduce_dim': [PCA(), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify': [model],
        'classify__min_child_weight': [1, 5, 10],
        'classify__gamma': [0.5, 1, 1.5, 2, 5],
        'classify__subsample': [0.6, 0.8, 1.0],
        'classify__colsample_bytree': [0.6, 0.8, 1.0],
        'classify__max_depth': [2,3, 4, 5]
    }]
grid1_xgb = grid_search(pipe,gridParams, X_train,y_train,select_k_features())

In [61]:
# KNeigborsClassifier
k_range = np.arange(1,31)
weights = ["uniform","distance"]
gridParams = dict(n_neighbors = k_range, weights = weights)
grid_knn = grid_search(KNeighborsClassifier(),gridParams, X_train, y_train, select_k_features(20))

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


k-fold: 0.6166666666666667
training data: 0.6666666666666666


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    1.3s finished


In [107]:
# AdaBoost Classifier
model = AdaBoostClassifier(base_estimator=LogisticRegression(C=1000), learning_rate=0.01)
gridParams = {
        'n_estimators': [10,50,100,1000],
        'algorithm' : ['SAMME', 'SAMME.R']
        }

grid_ada = grid_search(model,gridParams, X_train,y_train,select_k_features(102))

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   19.2s finished


k-fold: 0.6666666666666666
training data: 0.9333333333333333


In [108]:
grid_ada.best_params_

{'algorithm': 'SAMME.R', 'n_estimators': 1000}

In [122]:
# Random Forest
model = RandomForestClassifier()
gridParams = {
    'n_estimators': [10,100,500,1000],
    'criterion': ['gini','entropy'],
    'max_depth': [None, 2, 3],
    'max_features': ["log2", "sqrt", None],
    'warm_start': [False, True]
        }

grid_ran = grid_search(model,gridParams, X_train,y_train,select_k_features(102))

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 236 tasks      | elapsed:   55.4s
[Parallel(n_jobs=-1)]: Done 396 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 620 tasks      | elapsed:  3.1min


k-fold: 0.7166666666666667
training data: 1.0


[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  3.8min finished


In [123]:
# Gradient Boost
model = GradientBoostingClassifier()
gridParams = {
    "loss":["deviance", "exponential"],
    "learning_rate": [0.01,0.1],
    "min_samples_split": np.linspace(0.1, 0.5, 6),
    "min_samples_leaf": np.linspace(0.1, 0.5, 6),
    "max_depth":[2,3,5,8],
    "max_features":["log2","sqrt"],
    "n_estimators":[10,100]
    }

grid_gbc = grid_search(model,gridParams, X_train,y_train,select_k_features(102))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 2304 candidates, totalling 11520 fits


[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 904 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 2184 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done 3976 tasks      | elapsed:   46.3s
[Parallel(n_jobs=-1)]: Done 6280 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 9096 tasks      | elapsed:  1.8min


k-fold: 0.7083333333333334
training data: 1.0


[Parallel(n_jobs=-1)]: Done 11520 out of 11520 | elapsed:  2.2min finished


In [132]:
# ExtraTrees
model = ExtraTreesClassifier(bootstrap=True, verbose=1)
gridParams = {
    "n_estimators":[10,100],
    'criterion': ['gini','entropy'],
    "max_depth":[2,3,5],
    "max_features":["log2","sqrt"],
    }

grid_extre = grid_search(model,gridParams, X_train,y_train,select_k_features(102))

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


k-fold: 0.675
training data: 0.9083333333333333


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    3.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [138]:
voting = pd.DataFrame()
voting["Logistic"] = grid_lo.best_estimator_.predict(X_test[select_k_features(14)])
voting["Logistic-pca"] = grid1_lo.best_estimator_.predict(X_test[select_k_features()])
voting["Linear SVM"] = grid_lo.best_estimator_.predict(X_test[select_k_features(14)])
voting["Linear SVM-pca"] = grid1_lo.best_estimator_.predict(X_test[select_k_features()])
voting["SGD"] = grid_sgd.best_estimator_.predict(X_test[select_k_features(14)])
voting["SGd-pca"] = grid1_sgd.best_estimator_.predict(X_test[select_k_features()])
voting["PasAgg"] = grid_pa.best_estimator_.predict(X_test[select_k_features(20)])
voting["PasAgg-pca"] = grid1_pa.best_estimator_.predict(X_test[select_k_features()])
voting["Ridge"] = grid_ri.best_estimator_.predict(X_test[select_k_features(14)])
voting["Ridge-pca"] = grid1_ri.best_estimator_.predict(X_test[select_k_features()])
voting["Xgb"] = grid_xgb.best_estimator_.predict(X_test[select_k_features(102)])
#voting["Xgb-pca"] = grid1_xgb.best_estimator_.predict(X_test[features])
#voting["KNeighC"] = grid_knn.best_estimator_.predict(X_test[select_k_features(20)])
voting["Ada-Boost"] = grid_ada.best_estimator_.predict(X_test[select_k_features(102)])
voting["Ran-For"] = grid_ran.best_estimator_.predict(X_test[select_k_features(102)])
voting["Gra-Boost"] = grid_gbc.best_estimator_.predict(X_test[select_k_features(102)])
voting["Extra-trees"] = grid_extre.best_estimator_.predict(X_test[select_k_features(102)])
voting["Mean"] =  np.mean(voting, axis=1)
voting["Result"] = voting.Mean.apply(lambda x: 1 if x > 0.5 else 0)
voting.index = range(1, 81)
voting.head(50)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


Unnamed: 0,Logistic,Logistic-pca,Linear SVM,Linear SVM-pca,SGD,SGd-pca,PasAgg,PasAgg-pca,Ridge,Ridge-pca,Xgb,Ada-Boost,Ran-For,Gra-Boost,Extra-trees,Mean,Result
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1.0,1
2,0,1,0,1,0,0,1,0,0,1,1,1,1,1,1,0.6,1
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0
4,0,1,0,1,1,1,1,1,0,1,1,1,1,0,0,0.666667,1
5,0,1,0,1,1,0,1,1,1,0,0,0,0,0,0,0.4,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.066667,0
7,0,1,0,1,1,1,0,0,1,0,0,0,0,0,0,0.333333,0
8,0,1,0,1,1,1,1,1,0,0,1,1,1,0,0,0.6,1
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0
10,1,0,1,0,0,1,0,0,0,1,1,1,0,1,1,0.533333,1


In [143]:
data.write_output(voting["Ran-For"].values.reshape(-1,1))

In [81]:
# Lightgbm
params = {
    'boosting_type': 'gbdt',
    'max_depth' : 3,
    'objective': 'binary',
    'nthread': 3, # Updated from nthread
    'num_leaves': 31,
    'learning_rate': 0.05,
    'max_bin': 128,
    'subsample_for_bin': 200,
    'subsample': 1,
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'reg_alpha': 5,
    'reg_lambda': 10,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 5,
    'scale_pos_weight': 1,
    'num_class' : 1,
    'metric' : 'binary_error'
         }

# Create parameters to search
gridParams = {
    'learning_rate': [0.005],
    'n_estimators': [40,100],
    'num_leaves': [6,8,12,16],
    'boosting_type' : ['gbdt'],
    'colsample_bytree' : [0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    'lambda_l1': [1,1,5,2,3,5]
    }

# Create classifier to use. Note that parameters have to be input manually
# not as a dict!
mdl = LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = 3, # Updated from 'nthread'
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight'])

grid = grid_search(mdl,gridParams, X_train, y_train,select_k_features(20))

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:  5.7min


KeyboardInterrupt: 

In [None]:
# Classify train and test (not used now)
x_train,y,x_test = data.load_data()
all_data = pd.concat([x_train,x_test])
all_data.shape
all_y = np.concatenate((np.ones([120,1], dtype=int),np.zeros([80,1], dtype=int)))

clf = GradientBoostingClassifier()
clf.fit(all_data, all_y)
y_pred = clf.predict(all_data)
accuracy_score(all_y,y_pred)
feature_name = all_data.columns.tolist()
df_important = pd.DataFrame({"feature":feature_name,"importance":clf.feature_importances_})
effect = df_important.sort_values(by="importance", ascending=False).iloc[0:100,:]
features_effect = effect.feature.tolist()
feature_selection_without = feature_selection_df.loc[feature_selection_df["Feature"].isin(features_effect)]

In [None]:
X_train.shape