In [154]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn_pandas import CategoricalImputer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

In [155]:
train = pd.read_csv("./census.csv")
test = pd.read_csv("./test_census.csv")

In [156]:
# numerical
num_cols = ['age', 'education-num', 'capital-gain',
            'capital-loss', 'hours-per-week']

# categorical
cat_cols = ['workclass', 'education_level', 
            'marital-status', 'occupation', 
            'relationship', 'race', 
            'sex', 'native-country']

# need log transform
log_transform_cols = ['capital-loss', 'capital-gain']

In [157]:
minmax = MinMaxScaler()
onehot = OneHotEncoder(sparse=False)
cat = CategoricalImputer()
simp = SimpleImputer()

In [158]:
X_num = simp.fit_transform(train[num_cols].values)
X_cat = cat.fit_transform(train[cat_cols].values)
X_cat = onehot.fit_transform(X_cat)
X_log = simp.fit_transform(train[log_transform_cols].values)
X_log = np.log1p(X_log)
X_num = minmax.fit_transform(X_num)
X_log = minmax.fit_transform(X_log)

test_num = simp.fit_transform(test[num_cols].values)
test_cat = cat.fit_transform(test[cat_cols].values)
test_cat = onehot.fit_transform(test_cat)
test_log = simp.fit_transform(test[log_transform_cols].values)
test_log = np.log1p(test_log)
test_num = minmax.fit_transform(test_num)
test_log = minmax.fit_transform(test_log)

In [159]:
X = np.concatenate((X_num,X_cat,X_log), axis=1)
test = np.concatenate((test_num, test_cat, test_log), axis=1)
y = train['income'].map({'<=50K': 0, '>50K': 1})

In [160]:
param_grid = {
    "C": [0.1, 0.3, 0.5, 0.7, 0.9, 1, 2, 3, 4, 5, 10],
    "penalty": ['l1', 'l2', 'none', 'elasticnet'],
    "solver": ["liblinear", "saga"]
}
linear = LogisticRegression(class_weight="balanced", n_jobs=-1, max_iter=1000)
search = GridSearchCV(estimator=linear, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

In [161]:
search.fit(X,y)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='warn',
                                          n_jobs=-1, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1,
                               2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'penalty': ['l1', 'l2'],
                         'solver': ['liblinear', 'saga']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             sco

In [162]:
print("AUC: "+str(search.best_score_))
print(search.best_params_)

AUC: 0.9070551203393995
{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}


In [163]:
params = {
    'application': 'binary', # for binary classification
#     'num_class' : 1, # used for multi-classes
    'boosting': 'gbdt', # traditional gradient boosting decision tree
    'num_iterations': 100, 
    'learning_rate': 0.05,
    'num_leaves': 62,
    'device': 'cpu', # you can use GPU to achieve faster learning
    'max_depth': -1, # <0 means no limit
    'max_bin': 510, # Small number of bins may reduce training accuracy but can deal with over-fitting
    'lambda_l1': 5, # L1 regularization
    'lambda_l2': 10, # L2 regularization
    'metric' : 'binary_error',
    'subsample_for_bin': 200, # number of samples for constructing bins
    'subsample': 1, # subsample ratio of the training instance
    'colsample_bytree': 0.8, # subsample ratio of columns when constructing the tree
    'min_split_gain': 0.5, # minimum loss reduction required to make further partition on a leaf node of the tree
    'min_child_weight': 1, # minimum sum of instance weight (hessian) needed in a leaf
    'min_child_samples': 5# minimum number of data needed in a leaf
}

# Initiate classifier to use
mdl = lgb.LGBMClassifier(boosting_type= 'gbdt', 
          objective = 'binary', 
          n_jobs = 5, 
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'], 
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'], 
          min_split_gain = params['min_split_gain'], 
          min_child_weight = params['min_child_weight'], 
          min_child_samples = params['min_child_samples'])

# To view the default model parameters:
mdl.get_params().keys()

dict_keys(['boosting_type', 'class_weight', 'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth', 'min_child_samples', 'min_child_weight', 'min_split_gain', 'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin', 'subsample_freq', 'max_bin'])

In [None]:
param_grid = {
    'learning_rate': [0.005, 0.01, 0.1],
    'n_estimators': [8,16,24, 100, 200, 300, 500, 1000],
    'num_leaves': [6,8,12,16,32], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
    'objective' : ['binary'],
    'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [500],
    'colsample_bytree' : [0.64, 0.65, 0.66],
    'subsample' : [0.75, 1],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    'lambda_l1' : [1,2,3,5,10],
    'lambda_l2' : [1,2,3,5,10],
    'min_split_gain' : [0.2,0.5,0.8],
    'min_child_samples': [2,3,4,5,6,10]
    }

grid = GridSearchCV(mdl, param_grid, verbose=1, cv=5, scoring="roc_auc", n_jobs=-1)
# Run the grid
grid.fit(X, y)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

Fitting 4 folds for each of 3456 candidates, totalling 13824 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 438 tasks      | elapsed: 31.6min
