In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn_pandas import CategoricalImputer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel, RFE
import lightgbm as lgb
from xgboost import XGBClassifier
from eli5 import show_weights
from eli5.sklearn import PermutationImportance

In [12]:
train = pd.read_csv("./census.csv")
test = pd.read_csv("./test_census.csv")

In [13]:
# numerical
num_cols = ['age', 'education-num', 'hours-per-week']

# categorical
cat_cols = ['workclass', 'education_level', 
            'marital-status', 'occupation', 
            'relationship', 'race', 
            'sex', 'native-country']

# need log transform
log_transform_cols = ['capital-loss', 'capital-gain']

In [14]:
minmax = MinMaxScaler()
onehot = OneHotEncoder(sparse=False)
cat = CategoricalImputer()
simp = SimpleImputer()
X_train = pd.get_dummies(train[cat_cols])
X_test = pd.get_dummies(test[cat_cols])
print(X_train.shape, X_test.shape)

(45222, 98) (45222, 98)


In [15]:
X_num = simp.fit_transform(train[num_cols].values)
#X_cat = cat.fit_transform()
#X_cat = onehot.fit_transform(X_train)
X_log = simp.fit_transform(train[log_transform_cols].values)
X_log = np.log1p(X_log)
X_num = minmax.fit_transform(X_num)
X_log = minmax.fit_transform(X_log)

test_num = simp.fit_transform(test[num_cols].values)
#test_cat = cat.fit_transform(test[cat_cols].values)
#test_cat = onehot.fit_transform(X_test)
test_log = simp.fit_transform(test[log_transform_cols].values)
test_log = np.log1p(test_log)
test_num = minmax.fit_transform(test_num)
test_log = minmax.fit_transform(test_log)

In [16]:
X = np.concatenate((X_num,X_log,X_train.values), axis=1)
test = np.concatenate((test_num, test_log, X_test.values), axis=1)
y = train['income'].map({'<=50K': 0, '>50K': 1})

In [160]:
param_grid = {
    "C": [0.1, 0.3, 0.5, 0.7, 0.9, 1, 2, 3, 4, 5, 10],
    "penalty": ['l1', 'l2'],
    "solver": ["liblinear", "saga"]
}
linear = LogisticRegression(class_weight="balanced", n_jobs=-1, max_iter=1000)
search = GridSearchCV(estimator=linear, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

In [161]:
search.fit(X,y)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='warn',
                                          n_jobs=-1, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1,
                               2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'penalty': ['l1', 'l2'],
                         'solver': ['liblinear', 'saga']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             sco

In [162]:
print("AUC: "+str(search.best_score_))
print(search.best_params_)

AUC: 0.9070551203393995
{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
linear = LogisticRegression(class_weight="balanced", C=1, penalty='l1', solver='liblinear', n_jobs=-1, max_iter=1000)

linear.fit(X_train,y_train)
perm = PermutationImportance(linear).fit(X_test,y_test)


sel = SelectFromModel(perm, threshold=0.0001, prefit=True)
X_trans = sel.transform(X_train)
X_trans_test = sel.transform(X_test)
linear.fit(X_trans, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='warn', n_jobs=-1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [8]:
show_weights(perm, show_feature_values=True, top=100)

Weight,Feature
0.0270  ± 0.0073,x30
0.0267  ± 0.0036,x1
0.0179  ± 0.0036,x4
0.0036  ± 0.0013,x38
0.0034  ± 0.0026,x50
0.0031  ± 0.0052,x2
0.0029  ± 0.0023,x44
0.0029  ± 0.0016,x42
0.0025  ± 0.0006,x3
0.0022  ± 0.0004,x49


In [18]:
predictions = linear.predict(X_trans_test)
print(roc_auc_score(y_test, predictions))
linear_reg = RandomForestClassifier(class_weight='balanced', n_estimators=300, criterion='entropy', n_jobs=-1, min_samples_split=4, min_weight_fraction_leaf=0.0, max_depth=21, max_leaf_nodes=512)
linear_reg.fit(X_train,y_train)
predict_reg = linear_reg.predict(X_test)
print(roc_auc_score(y_test, predict_reg))

rec = RFE(linear_reg, n_features_to_select=50, step=0.1,verbose=1)
rec.fit(X_train,y_train)
rec_pred = rec.predict(X_test)
print(roc_auc_score(y_test,rec_pred))

0.818349985496503
0.833050085409482
Fitting estimator with 103 features.
Fitting estimator with 93 features.
Fitting estimator with 83 features.
Fitting estimator with 73 features.
Fitting estimator with 63 features.
Fitting estimator with 53 features.
0.8329759564250491


In [19]:
params = {
    'base_estimator': [linear_reg],
    'learning_rate':[0.001, 0.005, 0.01, 0.1],
    'algorithm': ['SAMME', 'SAMME.R']
}
ada = AdaBoostClassifier(n_estimators=300, random_state=42)
grid = GridSearchCV(ada, params, verbose=1, cv=3, scoring='roc_auc',n_jobs=-1)
grid.fit(X,y)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker. The exit codes of the workers are {SIGKILL(-9)}

In [163]:
params = {
    'application': 'binary', # for binary classification
#     'num_class' : 1, # used for multi-classes
    'boosting': 'gbdt', # traditional gradient boosting decision tree
    'num_iterations': 100, 
    'learning_rate': 0.05,
    'num_leaves': 62,
    'device': 'cpu', # you can use GPU to achieve faster learning
    'max_depth': -1, # <0 means no limit
    'max_bin': 510, # Small number of bins may reduce training accuracy but can deal with over-fitting
    'lambda_l1': 5, # L1 regularization
    'lambda_l2': 10, # L2 regularization
    'metric' : 'binary_error',
    'subsample_for_bin': 200, # number of samples for constructing bins
    'subsample': 1, # subsample ratio of the training instance
    'colsample_bytree': 0.8, # subsample ratio of columns when constructing the tree
    'min_split_gain': 0.5, # minimum loss reduction required to make further partition on a leaf node of the tree
    'min_child_weight': 1, # minimum sum of instance weight (hessian) needed in a leaf
    'min_child_samples': 5# minimum number of data needed in a leaf
}

# Initiate classifier to use
mdl = lgb.LGBMClassifier(boosting_type= 'gbdt', 
          objective = 'binary', 
          n_jobs = 5, 
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'], 
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'], 
          min_split_gain = params['min_split_gain'], 
          min_child_weight = params['min_child_weight'], 
          min_child_samples = params['min_child_samples'])

# To view the default model parameters:
mdl.get_params().keys()

dict_keys(['boosting_type', 'class_weight', 'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth', 'min_child_samples', 'min_child_weight', 'min_split_gain', 'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin', 'subsample_freq', 'max_bin'])

In [None]:
param_grid = {
    'learning_rate': [0.005, 0.01, 0.1],
    'n_estimators': [8,16,24, 100, 200, 300, 500, 1000],
    'num_leaves': [6,8,12,16,32], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
    'objective' : ['binary'],
    'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [500],
    'colsample_bytree' : [0.64, 0.65, 0.66],
    'subsample' : [0.75, 1],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    'lambda_l1' : [1,2,3,5,10],
    'lambda_l2' : [1,2,3,5,10],
    'min_split_gain' : [0.2,0.5,0.8],
    'min_child_samples': [2,3,4,5,6,10]
    }

grid = GridSearchCV(mdl, param_grid, verbose=1, cv=5, scoring="roc_auc", n_jobs=-1)
# Run the grid
grid.fit(X, y)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

Fitting 4 folds for each of 3456 candidates, totalling 13824 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 438 tasks      | elapsed: 31.6min


In [29]:
params = {
        #'n_estimators': [300, 500, 1000],
        'learning_rate': [0.001, 0.005, 0.01, 0.1],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5],
        'subsample': [0.75, 1.0],
        'colsample_bytree': [0.75, 1.0],
        'max_depth': [6, 12]
        }
xgb = XGBClassifier(n_estimators=300,objective='binary:logistic', silent=True, nthread=-1)
grid = GridSearchCV(xgb, params, verbose=1, cv=3, scoring='roc_auc', n_jobs=-1)
grid.fit(X,y)
print(grid.best_params_)
print(grid.best_score_)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 38.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 90.7min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 183.6min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed: 203.8min finished


{'colsample_bytree': 0.75, 'gamma': 1.5, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 1, 'subsample': 1.0}
0.9284752844243881


In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
xgb = XGBClassifier(n_estimators=300, objective='binary:logistic', silent=True, nthread=-1, colsample_bytree=0.75, gamma=1.5, learning_rate=0.1, max_depth=6, min_child_weight=1, subsample=1.0)
xgb.fit(X_train,y_train)
perm = PermutationImportance(xgb).fit(X_test,y_test)


sel = SelectFromModel(perm, threshold=0.0001, prefit=True)
X_trans = sel.transform(X_train)
X_trans_test = sel.transform(X_test)
xgb.fit(X_trans, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.75, gamma=1.5, learning_rate=0.1,
       max_delta_step=0, max_depth=6, min_child_weight=1, missing=None,
       n_estimators=300, n_jobs=1, nthread=-1, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1.0)

In [61]:
show_weights(perm, show_feature_values=True, top=100)

Weight,Feature
0.0580  ± 0.0018,x4
0.0245  ± 0.0028,x1
0.0213  ± 0.0032,x30
0.0199  ± 0.0014,x0
0.0163  ± 0.0013,x3
0.0077  ± 0.0012,x2
0.0075  ± 0.0024,x49
0.0038  ± 0.0028,x38
0.0033  ± 0.0007,x42
0.0032  ± 0.0004,x54


In [63]:
predictions = xgb.predict(X_trans_test)
print(roc_auc_score(y_test, predictions))
xgb_reg = RandomForestClassifier(class_weight='balanced', n_estimators=300, criterion='entropy', n_jobs=-1, min_samples_split=4, min_weight_fraction_leaf=0.0, max_depth=21, max_leaf_nodes=512)
xgb_reg.fit(X_train,y_train)
predict_reg = xgb_reg.predict(X_test)
print(roc_auc_score(y_test, predict_reg))

rec = RFE(xgb_reg, n_features_to_select=50, step=0.1,verbose=1)
rec.fit(X_train,y_train)
rec_pred = rec.predict(X_test)
print(roc_auc_score(y_test,rec_pred))

0.8030370322622232
0.8348583491797468
Fitting estimator with 103 features.
Fitting estimator with 93 features.
Fitting estimator with 83 features.
Fitting estimator with 73 features.
Fitting estimator with 63 features.
Fitting estimator with 53 features.
0.8336381216359945


In [20]:
params = {
    "max_leaf_nodes": [124, 126, 128, 256, 512],
}
rf = RandomForestClassifier(class_weight='balanced', n_estimators=300, criterion='entropy', n_jobs=-1, min_samples_split=4, min_weight_fraction_leaf=0.0, max_depth=21)
grid = GridSearchCV(rf, params, verbose=1,cv=3,scoring='roc_auc', n_jobs=-1)
grid.fit(X,y)



Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.2min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=21, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=4, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_leaf_nodes': [124, 126, 128, 256, 512]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [21]:
print(grid.best_params_)


{'max_leaf_nodes': 512}


In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(class_weight='balanced', n_estimators=300, criterion='entropy', n_jobs=-1, min_samples_split=4, min_weight_fraction_leaf=0.0, max_depth=21, max_leaf_nodes=512)
rf.fit(X_train,y_train)
perm = PermutationImportance(rf).fit(X_test,y_test)


sel = SelectFromModel(perm, threshold=0.0001, prefit=True)
X_trans = sel.transform(X_train)
X_trans_test = sel.transform(X_test)
rf.fit(X_trans, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=21, max_features='auto',
            max_leaf_nodes=512, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=4, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [65]:
show_weights(perm, show_feature_values=True, top=100)

Weight,Feature
0.0289  ± 0.0017,x4
0.0137  ± 0.0036,x1
0.0066  ± 0.0028,x0
0.0040  ± 0.0014,x3
0.0036  ± 0.0025,x42
0.0024  ± 0.0009,x38
0.0023  ± 0.0013,x44
0.0022  ± 0.0012,x21
0.0017  ± 0.0009,x39
0.0016  ± 0.0006,x24


In [66]:
predictions = rf.predict(X_trans_test)
print(roc_auc_score(y_test, predictions))
rf_reg = RandomForestClassifier(class_weight='balanced', n_estimators=300, criterion='entropy', n_jobs=-1, min_samples_split=4, min_weight_fraction_leaf=0.0, max_depth=21, max_leaf_nodes=512)
rf_reg.fit(X_train,y_train)
predict_reg = rf_reg.predict(X_test)
print(roc_auc_score(y_test, predict_reg))

rec = RFE(rf_reg, n_features_to_select=50, step=0.1,verbose=1)
rec.fit(X_train,y_train)
rec_pred = rec.predict(X_test)
print(roc_auc_score(y_test,rec_pred))

0.7802657362941955
0.8327385825248976
Fitting estimator with 103 features.
Fitting estimator with 93 features.
Fitting estimator with 83 features.
Fitting estimator with 73 features.
Fitting estimator with 63 features.
Fitting estimator with 53 features.
0.8337072549714765


In [17]:
sub = pd.read_csv("example_submission.csv")
print(predictions.shape)
print(sub.shape)

sub['income'] = predictions
sub.to_csv("submit_results.csv", index=False)
print('done')

(45222,)
(45222, 2)
done
