# Adult Data

## Load dependencies

In [1]:
import os

import joblib
from optuna.distributions import UniformDistribution, LogUniformDistribution, IntUniformDistribution, IntLogUniformDistribution, CategoricalDistribution
from optuna.integration.sklearn import OptunaSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
import warnings
from optuna.exceptions import ExperimentalWarning
# Silence Optuna's experimental-API notices and FutureWarnings for the rest of
# the session; the filters are registered in the same order as before.
for _category in (ExperimentalWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=_category)

In [3]:
# Directory the pickled models are written to: the `models` folder that sits
# beside the current working directory.
MODEL_DIR = os.path.join(os.getcwd(), os.pardir, 'models')

## Load data

In [4]:
# Execute data.py inside the notebook namespace. Presumably this is what
# provides load_adult_data(), which is called below without an import — verify.
%run ../data/data.py

In [5]:
# load_adult_data() returns two frames; going by their names these are
# presumably the Adult training and test splits — TODO confirm in data.py.
adult, adult_test = load_adult_data()

In [6]:
# Separate the 'Income' target column from the remaining feature columns,
# once for each of the two frames.
X, y = adult.drop(columns='Income'), adult['Income']
X_test, y_test = adult_test.drop(columns='Income'), adult_test['Income']

## Hyperparameter Tuning

For hyperparameter tuning, we use the `OptunaSearchCV` implementation in the `optuna` package. Its interface closely mirrors `RandomizedSearchCV` in `scikit-learn` (the main difference is that the search budget is passed as `n_trials` rather than `n_iter`), with the advantage of a much smarter optimization algorithm.

## Model calibration

In [23]:
# Execute utils.py inside the notebook namespace. fit_model() and
# feature_pipeline are used below without being defined in this notebook, so
# they are presumably provided here — TODO confirm.
%run utils.py

### Logistic Regression

In [15]:
# max_iter is an iteration count, so pass an int. The previous value 1e5 is a
# float, which scikit-learn's parameter validation rejects in recent releases.
logreg = LogisticRegressionCV(max_iter=100000)

In [17]:
# Fit the logistic regression through the fit_model helper. No search space is
# passed for this model, unlike the fit_model call for the SVM.
model = fit_model(logreg, X, y)

In [13]:
# Persist the fitted model. joblib.dump returns the list of written file names;
# binding it to `_` keeps that list out of the cell output.
_ = joblib.dump(model, os.path.join(MODEL_DIR, 'adult_logreg.pkl'))

### Support Vector Machine

In [20]:
# Support vector classifier with scikit-learn's default settings; the
# hyperparameters to tune are listed in the search space defined next.
svm = SVC()

In [21]:
# Search space for the SVC, keyed by SVC constructor argument name.
param_distributions = {
    # Regularisation parameter, sampled on a log scale from 1e-4 to 1e4.
    'C': LogUniformDistribution(1e-4, 1e4),
    'kernel': CategoricalDistribution(['linear', 'poly', 'rbf', 'sigmoid']),
    # None keeps uniform class weights; 'balanced' reweights by class frequency.
    'class_weight': CategoricalDistribution([None, 'balanced']),
    # Per the scikit-learn docs, `degree` only affects the 'poly' kernel.
    'degree': IntUniformDistribution(3, 7)
}

In [22]:
# Fit the SVC, handing the search space to fit_model as the fourth argument.
# fit_model is defined outside this notebook; presumably it runs the
# hyperparameter search over `param_distributions` — verify in utils.py.
model = fit_model(svm, X, y, param_distributions)

TypeError: len() takes no keyword arguments

In [None]:
# Persist the fitted SVM. joblib.dump needs the object to serialise as its
# first argument and the target path as its second; the object was missing.
joblib.dump(model, os.path.join(MODEL_DIR, 'adult_svm.pkl'))

In [None]:
# classification_report returns a pre-formatted multi-line string, so print it
# instead of displaying its repr (which shows literal "\n" escapes).
# NOTE(review): this scores the training data; X_test / y_test are presumably
# the held-out split and would give an unbiased estimate.
print(classification_report(y, model.predict(X)))

In [15]:
# Chain feature preprocessing and the SVC into a single estimator. The step
# name 'model' is what the 'model__' hyperparameter prefix refers to.
pipeline_steps = [('features', feature_pipeline), ('model', svm)]
model = Pipeline(steps=pipeline_steps)

In [109]:
# Route every hyperparameter to the pipeline's 'model' step by prefixing its
# name with 'model__'. Names that already carry the prefix are left alone, so
# re-running this cell does not produce 'model__model__...' keys.
param_distributions = {
    (name if name.startswith('model__') else 'model__' + name): distribution
    for name, distribution in param_distributions.items()
}

In [110]:
# OptunaSearchCV takes its trial budget as `n_trials`; it has no `n_iter`
# argument (that name belongs to scikit-learn's RandomizedSearchCV).
# Budget: 20 trials per tuned hyperparameter, using all available CPU cores.
model_search = OptunaSearchCV(
    model,
    param_distributions,
    n_trials=20 * len(param_distributions),
    n_jobs=-1,
)

In [107]:
# Run the hyperparameter search on the training data. The parameter names must
# carry the 'model__' step prefix: the output recorded below shows the search
# failing with "Invalid parameter C for estimator Pipeline" when they do not.
model_search.fit(X, y)

[32m[I 2021-05-24 22:11:25,856][0m A new study created in memory with name: no-name-aa36f4c5-603e-4cfa-a7be-66049cc71ed5[0m
[33m[W 2021-05-24 22:11:25,882][0m Trial 0 failed because of the following error: ValueError("Invalid parameter C for estimator Pipeline(steps=[('features',\n                 ColumnTransformer(remainder='passthrough',\n                                   transformers=[('numeric',\n                                                  Pipeline(steps=[('standardscaler',\n                                                                   StandardScaler())]),\n                                                  Index(['Age', 'Final Weight', 'Years of Education', 'Capital Gain',\n       'Capital Loss', 'Hours per Week'],\n      dtype='object')),\n                                                 ('low_cardinality',\n                                                  Pipeline(steps=[('onehotencoder',\n                                                                   OneHot

ValueError: Invalid parameter C for estimator Pipeline(steps=[('features',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler())]),
                                                  Index(['Age', 'Final Weight', 'Years of Education', 'Capital Gain',
       'Capital Loss', 'Hours per Week'],
      dtype='object')),
                                                 ('low_cardinality',
                                                  Pipeline(steps=[('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['Workclass', 'Marital Status', 'Relationship', 'Race', 'Sex'], dtype='object')),
                                                 ('high_cardinality',
                                                  Pipeline(steps=[('hashingencoder',
                                                                   HashingEncoder(max_process=6,
                                                                                  return_df=False))]),
                                                  Index(['Education', 'Occupation', 'Native Country'], dtype='object'))])),
                ('model', SVC())]). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
# Keep the best estimator found by the search (the full preprocessing + SVC
# pipeline, since that is what was passed to OptunaSearchCV).
adult_svm = model_search.best_estimator_

In [None]:
# Attach the data the estimator was trained on as an ad-hoc attribute, so it
# stays available wherever the estimator object is used.
adult_svm.training_data = (X, y)

### Random Forest

In [None]:
rf = RandomForestClassifier()
# Search space for the random forest, keyed by constructor argument name.
param_distributions = {
    'criterion': CategoricalDistribution(['gini', 'entropy']),
    # None leaves the tree depth unrestricted.
    'max_depth': CategoricalDistribution([None, 3, 4, 5, 7, 10]),
    'min_samples_split': IntUniformDistribution(2, 10),
    'min_samples_leaf': IntUniformDistribution(1, 10),
    # NOTE(review): in scikit-learn releases that accept 'auto', it is
    # documented as equivalent to 'sqrt' for classifiers, so 'sqrt' is
    # effectively sampled twice — confirm for the installed version.
    'max_features': CategoricalDistribution(['auto', 'sqrt', 'log2']),
    # NOTE(review): values near the upper bound will block most splits —
    # confirm that the [0, 1] range is intended.
    'min_impurity_decrease': UniformDistribution(0, 1),
    'class_weight': CategoricalDistribution([None, 'balanced', 'balanced_subsample']),
}

In [None]:
# Pass the arguments in the same order as the logistic regression and SVM
# fit_model calls in this notebook: (estimator, X, y, param_distributions).
# NOTE(review): fit_model lives in utils.py and its signature is not visible
# here — confirm the order against its definition.
adult_rf = fit_model(rf, X, y, param_distributions)

In [26]:
# Persist the fitted random forest; joblib.dump returns the list of written
# file names, which this cell displays as its output.
joblib.dump(adult_rf, os.path.join(MODEL_DIR, 'adult_rf.pkl'))

['/home/philipp/projects/xai/notebooks/../models/adult_rf.pkl']

### Extra Trees Classifier

In [30]:
et = ExtraTreesClassifier()
# Search space for the extra-trees classifier; it has the same keys and ranges
# as the random forest search space in the previous section.
param_distributions = {
    'criterion': CategoricalDistribution(['gini', 'entropy']),
    # None leaves the tree depth unrestricted.
    'max_depth': CategoricalDistribution([None, 3, 4, 5, 7, 10]),
    'min_samples_split': IntUniformDistribution(2, 10),
    'min_samples_leaf': IntUniformDistribution(1, 10),
    # NOTE(review): 'auto' is documented as equivalent to 'sqrt' for
    # classifiers in releases that accept it — confirm for the installed version.
    'max_features': CategoricalDistribution(['auto', 'sqrt', 'log2']),
    # NOTE(review): values near the upper bound will block most splits —
    # confirm that the [0, 1] range is intended.
    'min_impurity_decrease': UniformDistribution(0, 1),
    'class_weight': CategoricalDistribution([None, 'balanced', 'balanced_subsample']),
}

In [28]:
# Pass the arguments in the same order as the logistic regression and SVM
# fit_model calls in this notebook: (estimator, X, y, param_distributions).
# NOTE(review): fit_model lives in utils.py and its signature is not visible
# here — confirm the order against its definition.
adult_et = fit_model(et, X, y, param_distributions)

NameError: name 'fit_model' is not defined

In [None]:
# Persist the fitted extra-trees model under its own file name. The previous
# target, 'adult_rf.pkl', overwrote the saved random forest.
joblib.dump(adult_et, os.path.join(MODEL_DIR, 'adult_et.pkl'))

### Gradient Boosted Decision Trees

In [None]:
gbm = GradientBoostingClassifier()
# Search space for gradient boosting, keyed by constructor argument name.
param_distributions = {
    # NOTE(review): later scikit-learn releases rename 'deviance' to
    # 'log_loss' — adjust if the environment is upgraded.
    'loss': CategoricalDistribution(['deviance', 'exponential']),
    # Sampled uniformly (not on a log scale) between 1e-4 and 1.0.
    'learning_rate': UniformDistribution(1e-4,1.0),
    'n_estimators': IntUniformDistribution(10, 1000),
    # Fraction of samples used per boosting stage.
    'subsample': UniformDistribution(0.1, 1.0),
    'min_samples_split': IntUniformDistribution(2, 10),
    'min_samples_leaf': IntUniformDistribution(1, 10),
    # NOTE(review): values near the upper bound will block most splits —
    # confirm that the [0, 1] range is intended.
    'min_impurity_decrease': UniformDistribution(0, 1),
    'max_features': CategoricalDistribution(['auto', 'sqrt', 'log2'])
}

In [None]:
# Pass the arguments in the same order as the logistic regression and SVM
# fit_model calls in this notebook: (estimator, X, y, param_distributions).
# NOTE(review): fit_model lives in utils.py and its signature is not visible
# here — confirm the order against its definition.
adult_gbm = fit_model(gbm, X, y, param_distributions)

In [None]:
# Persist the fitted gradient boosting model; joblib.dump returns the list of
# written file names, which becomes the cell output.
joblib.dump(adult_gbm, os.path.join(MODEL_DIR, 'adult_gbm.pkl'))

### XGBoost

### LightGBM

### CatBoost

### MLPClassifier