In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import data

from sklearn.decomposition import PCA,TruncatedSVD,NMF
from sklearn.feature_selection import (chi2, f_classif,mutual_info_classif,
                                        SelectKBest,SelectFromModel,VarianceThreshold,RFECV)
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion,TransformerMixin
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import (AdaBoostClassifier,BaggingClassifier,ExtraTreesClassifier,
                              GradientBoostingClassifier,RandomForestClassifier,VotingClassifier)
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.svm import LinearSVC,SVC,NuSVC
from sklearn.naive_bayes import BernoulliNB,GaussianNB,MultinomialNB
from sklearn.linear_model import LogisticRegression,RidgeClassifier,SGDClassifier
from sklearn.neighbors import KNeighborsClassifier,RadiusNeighborsClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Drop duplicated columns plus one hand-picked column.
X_train,y_train,X_test = data.load_data()

# 0-based position that is always removed in addition to the duplicates;
# presumably the all-zero column this cell was written for — TODO confirm.
ALWAYS_DROP_POSITION = 2

# The column count is read from the data instead of being hard-coded.
n_columns = X_train.shape[1]
duplicate_positions = set()
for i in range(n_columns):
    # A column already flagged as a duplicate had all of its own duplicates
    # flagged when its first occurrence was scanned, so it is not rescanned.
    if i in duplicate_positions:
        continue
    reference_column = X_train.iloc[:, i]
    for counter in range(i + 1, n_columns):
        if counter in duplicate_positions:
            continue
        if reference_column.equals(X_train.iloc[:, counter]):
            duplicate_positions.add(counter)

same_columns = sorted(duplicate_positions | {ALWAYS_DROP_POSITION})

# Translate positions to labels through the frame itself rather than rebuilding
# names as 'X<position+1>', and build new frames instead of mutating in place so
# the cell can be re-run safely.
drop_labels = X_train.columns[same_columns]
X_train = X_train.drop(columns=drop_labels)
X_test = X_test.drop(columns=drop_labels)

## Calculate Feature Importance using ensemble methods
----------

In [274]:
def calculate_importance(model, X,y):
    """Vote on features with ``SelectFromModel`` and report the training score.

    A ``SelectFromModel`` wrapper around ``model`` (threshold ``1e-4``) is fitted
    on ``X``/``y``. The number of retained columns is printed, followed by the
    value returned by ``calculate_score(model, X, y)``.

    Parameters
    ----------
    model : estimator handed to ``SelectFromModel`` and to ``calculate_score``.
    X : pandas DataFrame of features (indexed with ``.loc`` by the mask).
    y : target values.

    Returns
    -------
    The mask returned by the selector's ``get_support()``.
    """
    fitted_selector = SelectFromModel(model, threshold=1e-4).fit(X, y)
    support = fitted_selector.get_support()

    kept_names = X.loc[:, support].columns.tolist()
    print(str(len(kept_names)), 'selected features')

    training_score = calculate_score(model, X, y)
    print(training_score)
    return support

def select_k_best(score,X_norm,X,y,k=150):
    """Pick the ``k`` highest-scoring features with ``SelectKBest``.

    Parameters
    ----------
    score : scoring function passed to ``SelectKBest``.
    X_norm : feature matrix the selector is fitted on.
    X : pandas DataFrame with the same columns, used only to look up the
        names of the selected features for the printed count.
    y : target values.
    k : number of features to keep (default 150).

    Returns
    -------
    The mask returned by the selector's ``get_support()``.
    """
    k_best = SelectKBest(score, k=k)
    k_best.fit(X_norm, y)
    support = k_best.get_support()

    chosen_names = X.loc[:, support].columns.tolist()
    print(str(len(chosen_names)), 'selected features')
    return support

def calculate_mean_importance(model_list,X,y):
    """Average the normalised feature importances of several models.

    Every model in ``model_list`` is fitted on the ``X``/``y`` passed in (not on
    notebook globals). Models whose ``feature_importances_`` sum to 1 within
    floating-point tolerance contribute to the average. Any other model is
    printed together with its sum and left out.

    Parameters
    ----------
    model_list : iterable of estimators exposing ``fit`` and, once fitted,
        ``feature_importances_``.
    X : 2-D feature matrix; its second dimension sets the output length.
    y : target values.

    Returns
    -------
    numpy.ndarray of length ``X.shape[1]`` holding the mean importance over the
    models that contributed (all zeros when none did).
    """
    importance_sum = np.zeros(X.shape[1])
    n_contributing = 0
    for model in model_list:
        model.fit(X, y)
        importances = np.asarray(model.feature_importances_, dtype=float)
        total = importances.sum()
        # Tolerant comparison: normalised importances (notably float32 ones)
        # rarely sum to exactly 1.0.
        if np.isclose(total, 1.0):
            importance_sum += importances
            n_contributing += 1
        else:
            print(model)
            print(total)
    if n_contributing == 0:
        return importance_sum
    # Divide by the number of models actually summed so skipped models do not
    # drag the mean towards zero.
    return importance_sum / n_contributing
        
def calculate_score(model,X,y):
    """Fit ``model`` on ``X``/``y`` and return its accuracy on that same data.

    The score is computed on the data the model was just fitted on, so it is a
    training (resubstitution) accuracy, not a held-out estimate.
    """
    model.fit(X, y)
    predictions = model.predict(X)
    training_accuracy = accuracy_score(y, predictions)
    return training_accuracy

In [278]:
# Estimators whose fitted `feature_importances_` are averaged by
# calculate_mean_importance.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5),
    AdaBoostClassifier(learning_rate=0.1,n_estimators=100),
    ExtraTreesClassifier(n_estimators=100),
    GradientBoostingClassifier(learning_rate=0.1,n_estimators=100),
    DecisionTreeClassifier(max_depth=7),
    ExtraTreeClassifier(max_depth=15),
    XGBClassifier(),
]

average_importances = calculate_mean_importance(models,X_train,y_train)

# Added after the averaging step, so these two only take part in the
# SelectFromModel votes at the end of the cell.
models.extend([LogisticRegression(C=10), LGBMClassifier()])

# `support` collects one selection mask per selector, in the order they run.
support = []

# 1) Variance filter on the raw features.
selector = VarianceThreshold(0.0008).fit(X_train)
support.append(selector.get_support())
feature = X_train.loc[:, support[-1]].columns.tolist()
print(len(feature), 'selected features')

# 2) Univariate scores on min-max scaled features.
X_norm = MinMaxScaler().fit_transform(X_train)
for score_func, extra in ((chi2, {'k': 100}), (f_classif, {}), (mutual_info_classif, {})):
    support.append(select_k_best(score_func, X_norm, X_train, y_train, **extra))

# 3) Recursive feature elimination with 5-fold CV, once per linear estimator.
for estimator in (LogisticRegression(C=0.1), SVC(kernel="linear")):
    selector = RFECV(estimator, step=1, cv=5, scoring='accuracy').fit(X_norm, y_train)
    support.append(selector.support_)
    feature = X_train.loc[:, selector.support_].columns.tolist()
    print(len(feature), 'selected features')

# 4) Model-based selection, one vote per estimator in `models`.
for model in models:
    support.append(calculate_importance(model, X_train, y_train))

388 selected features
100 selected features
150 selected features
150 selected features
14 selected features
61 selected features
445 selected features
1.0
45 selected features
1.0
496 selected features
1.0
158 selected features
1.0
15 selected features
0.9916666666666667
39 selected features
1.0
180 selected features
1.0
496 selected features
0.9083333333333333
133 selected features
1.0


In [281]:
# Display the averaged importances returned by calculate_mean_importance.
# NOTE(review): the output saved with this cell is a single scalar, which looks
# stale relative to this expression — confirm by re-running on a fresh kernel.
average_importances

1.0000000008897456

In [303]:
feature_name = X_train.columns.tolist()

# One column per selector mask, keyed by its position in `support`.
voting_dict = {'Feature': feature_name}
for i, vector in enumerate(support):
    voting_dict[i] = vector

# put all selection together
feature_selection_df = pd.DataFrame(voting_dict)

# Count how many selectors kept each feature. Only the vote columns are summed:
# a row-wise sum over the whole frame would also touch the string 'Feature'
# column, which current pandas no longer drops silently.
vote_columns = list(range(len(support)))
feature_selection_df['Total'] = feature_selection_df[vote_columns].sum(axis=1)
feature_selection_df['Average'] = average_importances

# Rank by vote count, then mean importance, then name (all descending), and
# renumber the rows from 1 so the index reads as a rank.
feature_selection_df = feature_selection_df.sort_values(['Total','Average','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)

# display the top 100
feature_selection_df.head(100)

Unnamed: 0,Feature,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Total,Average
1,X163,True,True,True,True,False,False,True,True,True,True,True,False,True,True,True,12,0.033836
2,X33,True,True,True,True,False,False,True,True,True,True,True,False,True,True,True,12,0.021938
3,X453,True,True,True,False,True,True,True,True,True,True,False,False,True,True,True,12,0.020994
4,X201,True,True,True,False,True,True,True,False,True,True,True,False,True,True,True,12,0.014624
5,X581,False,True,True,True,True,True,True,True,True,True,False,False,True,True,True,12,0.009608
6,X243,True,True,True,True,True,False,True,True,True,True,False,False,True,True,True,12,0.006006
7,X281,True,True,True,True,False,False,True,True,True,True,False,False,True,True,True,11,0.025387
8,X221,True,True,True,False,False,False,True,True,True,True,False,True,True,True,True,11,0.010215
9,X238,False,True,True,False,True,True,True,True,True,True,False,False,True,True,True,11,0.007548
10,X66,True,True,True,True,True,False,True,True,True,True,False,False,True,True,False,11,0.004694


In [364]:
# Names of the 50 top-ranked features (positional slice of the sorted table);
# despite the variable name, these are column labels, not integer indices.
selected_index = feature_selection_df["Feature"].iloc[:50].tolist()

In [365]:
# The dual formulation is only implemented by the liblinear solver (L2 penalty),
# so the solver is named explicitly rather than left to the library default,
# which current scikit-learn releases set to a solver that rejects dual=True.
model = LogisticRegression(dual=True, solver='liblinear')
gridParams = {
    'C':[0.001,0.01,0.1,1,10,30,50,100,1000],
    'max_iter':[100,1000],
}
# 5-fold grid search over regularisation strength and iteration budget.
grid = GridSearchCV(model, gridParams,
                    verbose=0,
                    cv=5,
                    n_jobs=2)
grid.fit(X_train.loc[:,selected_index],y_train)
grid.best_score_

0.7166666666666667

In [320]:
# Accuracy of the refitted best estimator on the rows it was trained on.
y_pred = grid.best_estimator_.predict(X_train[selected_index])
accuracy_score(y_train, y_pred)

0.8083333333333333

In [321]:
# Predict the test rows with the best estimator and pass the predictions to
# the writer as a single-column 2-D array.
y_pred = grid.best_estimator_.predict(X_test[selected_index])
data.write_output(y_pred.reshape(-1, 1))

In [358]:
# Base LightGBM settings. Only the keys read further down in this cell are
# passed to the classifier; the rest are kept for reference.
params = {'boosting_type': 'gbdt',
          'max_depth' : 3,
          'objective': 'binary',
          'nthread': 3, # not read below; the classifier gets n_jobs instead
          'num_leaves': 31,
          'learning_rate': 0.05,
          'max_bin': 128,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'binary_error'
         }

# Create parameters to search
gridParams = {
    'learning_rate': [0.005],
    'n_estimators': [40,100],
    'num_leaves': [6,8,12,16],
    'boosting_type' : ['gbdt'],
    'colsample_bytree' : [0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    # Repeated values removed (the list was [1,1,5,2,3,5]): duplicates only add
    # identical candidates, and so redundant fits, to the search.
    # NOTE(review): the original may have meant 1, 1.5, 2, 3, 5 — confirm.
    # NOTE(review): LightGBM documents lambda_l1 and reg_alpha as aliases of the
    # same L1 term, so searching both is probably redundant — confirm and keep one.
    'lambda_l1': [1,5,2,3]
    }

# Create classifier to use. Note that parameters have to be input manually
# not as a dict!
# NOTE(review): `silent` is not accepted by every LightGBM release of the
# scikit-learn wrapper — confirm against the installed version.
mdl = LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = 3, # replaces the older 'nthread' spelling
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight'])

In [359]:
# 4-fold exhaustive search over `gridParams` for the LightGBM classifier,
# evaluated with two parallel workers.
grid = GridSearchCV(
    estimator=mdl,
    param_grid=gridParams,
    cv=4,
    n_jobs=2,
    verbose=0,
)

In [360]:
# Run the search on the selected feature columns and show the best mean CV score.
grid.fit(X_train[selected_index], y_train)
grid.best_score_

0.65

In [361]:
# Accuracy of the refitted best estimator on the rows it was trained on.
y_pred = grid.best_estimator_.predict(X_train[selected_index])
accuracy_score(y_train, y_pred)

0.95

In [338]:
# Predict the test rows with the best estimator and pass the predictions to
# the writer as a single-column 2-D array.
y_pred = grid.best_estimator_.predict(X_test[selected_index])
data.write_output(y_pred.reshape(-1, 1))

In [339]:
# Search space for the SVC: regularisation strength, kernel width and kernel.
C_OPTIONS = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
gamma = [1, 0.1, 0.001, 0.0001]
param_grid = dict(C=C_OPTIONS, gamma=gamma, kernel=['linear', 'rbf'])

In [344]:
# Shuffled, seeded 5-fold stratified split so the SVC search is repeatable.
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring="accuracy",
    cv=k_fold,
    n_jobs=1,
)
grid.fit(X_train[selected_index], y_train)
grid.best_score_

0.6666666666666666

In [345]:
# Write the test-set predictions first ...
y_pred = grid.best_estimator_.predict(X_test[selected_index])
data.write_output(y_pred.reshape(-1, 1))

# ... then reuse `y_pred` for the training rows and show the training accuracy.
y_pred = grid.best_estimator_.predict(X_train[selected_index])
accuracy_score(y_train, y_pred)

0.8083333333333333

In [367]:
# k-nearest-neighbours: search k = 1..30 under both weighting schemes, 5-fold CV.
model_neighbors = KNeighborsClassifier()
k_range = np.arange(1, 31)
weights = ["uniform", "distance"]
param_grid = {"n_neighbors": k_range, "weights": weights}
grid = GridSearchCV(
    estimator=model_neighbors,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)
grid.fit(X_train[selected_index], y_train)
grid.best_score_

0.6083333333333333

In [349]:
# Classify train and test: fit a model that tries to tell training rows from
# test rows, then look at which features it relies on.
x_train,y,x_test = data.load_data()
all_data = pd.concat([x_train,x_test])

# Origin label per row: 1 = training set, 0 = test set. The lengths come from the
# loaded frames (previously hard-coded as 120 and 80) and the labels are 1-D, the
# shape scikit-learn expects for a target vector.
all_y = np.concatenate((np.ones(len(x_train), dtype=int),
                        np.zeros(len(x_test), dtype=int)))

clf = GradientBoostingClassifier()
clf.fit(all_data, all_y)
y_pred = clf.predict(all_data)
# Accuracy on the rows the classifier was fitted on; kept in a variable instead
# of being computed and discarded.
origin_accuracy = accuracy_score(all_y, y_pred)

# The 100 features the origin classifier weights most heavily.
feature_name = all_data.columns.tolist()
df_important = pd.DataFrame({"feature":feature_name,"importance":clf.feature_importances_})
effect = df_important.sort_values(by="importance", ascending=False).head(100)
features_effect = effect.feature.tolist()

# NOTE(review): this keeps the rows of the voting table whose feature IS among
# the 100 above; the name `feature_selection_without` suggests the complement
# (`~...isin(...)`) may have been intended — confirm before relying on it.
feature_selection_without = feature_selection_df.loc[feature_selection_df["Feature"].isin(features_effect)]