# Adult Data

## Load dependencies

In [81]:
import joblib
import os

from optuna.integration.sklearn import OptunaSearchCV
from optuna.distributions import UniformDistribution, LogUniformDistribution, IntUniformDistribution, IntLogUniformDistribution, CategoricalDistribution

from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [104]:
import warnings
from optuna.exceptions import ExperimentalWarning
warnings.filterwarnings('ignore', category=ExperimentalWarning)
warnings.filterwarnings('ignore', category=FutureWarning, module='sklearn')

In [95]:
MODEL_DIR = os.path.join(os.path.abspath('.'), '..', 'models')

## Load data

In [84]:
%run ../data/data.py

In [85]:
adult, adult_test = load_adult_data()

In [86]:
y = adult['Income']
X = adult.drop(columns='Income')
y_test = adult_test['Income']
X_test = adult_test.drop(columns='Income')

## Hyperparameter Tuning

For hyperparameter tuning, we use the `OptunaSearchCV` implementation in the `optuna` package. The interface is exactly the same as `RandomSearchCV` in `scikit-learn` with the advantage of a much smarter optimization algorithm.

## Model calibration

In [87]:
%run utils.py

In [88]:
feature_pipeline = default_pipeline(X)

### Logistic Regression

In [41]:
logreg = LogisticRegressionCV(max_iter=1e5)

In [42]:
model = Pipeline([
    ('features', feature_pipeline),
    ('model', logreg)
])

In [43]:
_ = model.fit(X, y)

In [52]:
setattr(model, 'training_data', (X, y))

In [58]:
_ = joblib.dump(model, os.path.join(MODEL_DIR, 'adult_logreg.pkl'))

['/home/philipp/projects/xai/notebooks/../models/adult_logreg.pkl']

### Support Vector Machine

In [89]:
svm = SVC()

In [102]:
model = Pipeline([
    ('features', feature_pipeline),
    ('model', svm)
])

In [103]:
model.fit(X, y)

Pipeline(steps=[('features',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler())]),
                                                  Index(['Age', 'Final Weight', 'Years of Education', 'Capital Gain',
       'Capital Loss', 'Hours per Week'],
      dtype='object')),
                                                 ('low_cardinality',
                                                  Pipeline(steps=[('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['Workclass', 'Marital Status', 'Relationship', 'Race', 'Sex'], dtype='object')),
                                                 ('high_cardinality',
                      

In [108]:
param_distributions = {
    'C': LogUniformDistribution(1e-4, 1e4),
    'kernel': CategoricalDistribution(['linear', 'poly', 'rbf', 'sigmoid']),
    'class_weight': CategoricalDistribution([None, 'balanced']),
    'degree': IntUniformDistribution(3, 7)
}

In [109]:
param_distributions = {'model__'+k: v for k,v in param_distributions.items()}

In [110]:
model_search = OptunaSearchCV(model, param_distributions, n_iter=20*len(param_distributions), n_jobs=-1)

In [107]:
model_search.fit(X, y)

[32m[I 2021-05-24 22:11:25,856][0m A new study created in memory with name: no-name-aa36f4c5-603e-4cfa-a7be-66049cc71ed5[0m
[33m[W 2021-05-24 22:11:25,882][0m Trial 0 failed because of the following error: ValueError("Invalid parameter C for estimator Pipeline(steps=[('features',\n                 ColumnTransformer(remainder='passthrough',\n                                   transformers=[('numeric',\n                                                  Pipeline(steps=[('standardscaler',\n                                                                   StandardScaler())]),\n                                                  Index(['Age', 'Final Weight', 'Years of Education', 'Capital Gain',\n       'Capital Loss', 'Hours per Week'],\n      dtype='object')),\n                                                 ('low_cardinality',\n                                                  Pipeline(steps=[('onehotencoder',\n                                                                   OneHot

ValueError: Invalid parameter C for estimator Pipeline(steps=[('features',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler())]),
                                                  Index(['Age', 'Final Weight', 'Years of Education', 'Capital Gain',
       'Capital Loss', 'Hours per Week'],
      dtype='object')),
                                                 ('low_cardinality',
                                                  Pipeline(steps=[('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['Workclass', 'Marital Status', 'Relationship', 'Race', 'Sex'], dtype='object')),
                                                 ('high_cardinality',
                                                  Pipeline(steps=[('hashingencoder',
                                                                   HashingEncoder(max_process=6,
                                                                                  return_df=False))]),
                                                  Index(['Education', 'Occupation', 'Native Country'], dtype='object'))])),
                ('model', SVC())]). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
adult_svm = model_search.best_estimator_

In [None]:
setattr(adult_svm, 'training_data', (X, y))

In [None]:
joblib.dump(os.path.join(MODEL_DIR, 'adult_svm.pkl'))

### Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
rf = RandomForestClassifier()

In [21]:
param_distributions = {
    'criterion': CategoricalDistribution(['gini', 'entropy']),
    'max_depth': CategoricalDistribution([None, 3, 4, 5, 7, 10]),
    'min_samples_split': IntUniformDistribution(2, 10),
    'min_samples_leaf': IntUniformDistribution(1, 10),
    'max_features': CategoricalDistribution(['auto', 'sqrt', 'log2']),
    'min_impurity_decrease': UniformDistribution(0, 1),
    'class_weight': CategoricalDistribution([None, 'balanced', 'balanced_subsample']),
}

In [22]:
adult

Unnamed: 0,Age,Workclass,Final Weight,Education,Years of Education,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per Week,Native Country,Income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32556,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32557,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32558,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [17]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### Extra Trees Classifier

### Gradient Boosted Decision Trees

### XGBoost

### LightGBM

### CatBoost

### MLPClassifier