## Load data

In [32]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

# Integer class label -> species name.
y2n = dict(enumerate(['setosa', 'versicolor', 'virginica']))

X, y = load_iris(return_X_y=True, as_frame=True)
# Strip the '(cm)' unit marker from every feature name and replace spaces with underscores.
X = X.rename(columns=lambda name: name.replace('(cm)', '').strip().replace(' ', '_'))

X.shape, y.shape

((150, 4), (150,))

## Logistic regression

In [33]:
from sklearn.linear_model import LogisticRegression

# Class-balanced logistic regression fitted on the full data set, with a fixed seed.
L = LogisticRegression(
    solver='saga',
    max_iter=10_000,
    class_weight='balanced',
    random_state=37,
    n_jobs=-1,
)
L.fit(X, y)

LogisticRegression(class_weight='balanced', max_iter=10000, n_jobs=-1,
                   random_state=37, solver='saga')

In [42]:
# One row per class: the intercept followed by the coefficient of each feature.
coef_rows = []
for class_id, (intercept, weights) in enumerate(zip(L.intercept_, L.coef_)):
    row = {'clazz': y2n[class_id], 'intercept': intercept}
    row.update(zip(X.columns, weights))
    coef_rows.append(row)

pd.DataFrame(coef_rows).set_index(['clazz'])

Unnamed: 0_level_0,intercept,sepal_length,sepal_width,petal_length,petal_width
clazz,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
setosa,6.566801,-0.00305,1.211489,-2.44988,-1.08738
versicolor,3.064096,0.36919,-0.395907,-0.162812,-0.930556
virginica,-9.630898,-0.36614,-0.815582,2.612692,2.017936


## Random forest

In [53]:
from sklearn.ensemble import RandomForestClassifier

# Three-tree, class-balanced random forest fitted on the full data set, with a fixed seed.
R = RandomForestClassifier(
    n_estimators=3,
    class_weight='balanced',
    random_state=37,
    n_jobs=-1,
)
R.fit(X, y)

RandomForestClassifier(class_weight='balanced', n_estimators=3, n_jobs=-1,
                       random_state=37)

In [54]:
# Forest feature importances, labelled with the feature names.
pd.Series(data=R.feature_importances_, index=X.columns)

sepal_length    0.089231
sepal_width     0.009675
petal_length    0.667463
petal_width     0.233630
dtype: float64

## Best candidates, logistic regression

In [59]:
# Training row the logistic model scores highest for setosa
# (ties broken by the lowest versicolor, then virginica, probability).
(
    X.join(pd.DataFrame(L.predict_proba(X), columns=['setosa', 'versicolor', 'virginica']))
    .sort_values(by=['setosa', 'versicolor', 'virginica'], ascending=[False, True, True])
    .iloc[:1]
)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,setosa,versicolor,virginica
22,4.6,3.6,1.0,0.2,0.994824,0.005176,3.442988e-09


In [60]:
# Training row the logistic model scores highest for versicolor
# (ties broken by the lowest setosa, then virginica, probability).
(
    X.join(pd.DataFrame(L.predict_proba(X), columns=['setosa', 'versicolor', 'virginica']))
    .sort_values(by=['versicolor', 'setosa', 'virginica'], ascending=[False, True, True])
    .iloc[:1]
)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,setosa,versicolor,virginica
62,6.0,2.2,4.0,1.0,0.010792,0.971058,0.018149


In [61]:
# Training row the logistic model scores highest for virginica
# (ties broken by the lowest setosa, then versicolor, probability).
(
    X.join(pd.DataFrame(L.predict_proba(X), columns=['setosa', 'versicolor', 'virginica']))
    .sort_values(by=['virginica', 'setosa', 'versicolor'], ascending=[False, True, True])
    .iloc[:1]
)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,setosa,versicolor,virginica
118,7.7,2.6,6.9,2.3,1.838709e-08,0.001525,0.998475


## Best candidates, random forest

In [62]:
# Training row the random forest scores highest for setosa
# (ties broken by the lowest versicolor, then virginica, probability).
(
    X.join(pd.DataFrame(R.predict_proba(X), columns=['setosa', 'versicolor', 'virginica']))
    .sort_values(by=['setosa', 'versicolor', 'virginica'], ascending=[False, True, True])
    .iloc[:1]
)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,setosa,versicolor,virginica
0,5.1,3.5,1.4,0.2,1.0,0.0,0.0


In [63]:
# Training row the random forest scores highest for versicolor
# (ties broken by the lowest setosa, then virginica, probability).
(
    X.join(pd.DataFrame(R.predict_proba(X), columns=['setosa', 'versicolor', 'virginica']))
    .sort_values(by=['versicolor', 'setosa', 'virginica'], ascending=[False, True, True])
    .iloc[:1]
)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,setosa,versicolor,virginica
50,7.0,3.2,4.7,1.4,0.0,1.0,0.0


In [64]:
# Training row the random forest scores highest for virginica
# (ties broken by the lowest setosa, then versicolor, probability).
(
    X.join(pd.DataFrame(R.predict_proba(X), columns=['setosa', 'versicolor', 'virginica']))
    .sort_values(by=['virginica', 'setosa', 'versicolor'], ascending=[False, True, True])
    .iloc[:1]
)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,setosa,versicolor,virginica
100,6.3,3.3,6.0,2.5,0.0,0.0,1.0


In [78]:
# Observed minimum and maximum of every feature.
X.describe().loc[['min', 'max'], :]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
min,4.3,2.0,1.0,0.1
max,7.9,4.4,6.9,2.5


## Best-candidate search, logistic regression

In [164]:
import optuna

# Only show Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective(trial, m):
    """Score one sampled flower with ``m`` and return its three class probabilities.

    Parameters
    ----------
    trial
        Object exposing ``suggest_float(name, low, high)``; it is asked for one
        value per feature, in the order sepal_length, sepal_width,
        petal_length, petal_width.
    m
        Fitted classifier exposing ``predict_proba`` that accepts a one-row
        DataFrame with those four columns.

    Returns
    -------
    tuple
        The first three entries of the predicted probability row, in order.
    """
    # (feature name, lower bound, upper bound) for every sampled dimension.
    search_space = (
        ('sepal_length', 4.3, 7.9),
        ('sepal_width', 2.0, 4.4),
        ('petal_length', 1.0, 6.9),
        ('petal_width', 0.1, 2.5),
    )
    candidate = {name: trial.suggest_float(name, low, high) for name, low, high in search_space}

    # A one-row frame keeps the feature names attached to the values passed to the model.
    probabilities = m.predict_proba(pd.DataFrame([candidate]))[0]

    return probabilities[0], probabilities[1], probabilities[2]

def get_study(algo, clazz):
    """Create, or reload, the three-objective Optuna study for one model and target class.

    The study is stored in the SQLite file ``_temp/opt-study.db`` under the name
    ``study-{algo}-{clazz:02}``. Because ``load_if_exists`` is set, calling this
    again with the same arguments returns the stored study, so any further
    optimisation adds trials to the ones already recorded.

    Parameters
    ----------
    algo
        Label of the model being searched; only used in the study name.
    clazz
        Index of the class whose probability is maximised; the other two
        probabilities are minimised. Any value other than 0 or 1 targets the
        third class.

    Returns
    -------
    The study object returned by ``optuna.create_study``.
    """
    import os

    # Anything that is not class 0 or class 1 falls through to the third class.
    target = clazz if clazz in (0, 1) else 2
    directions = ['maximize' if position == target else 'minimize' for position in range(3)]

    # The storage file lives in a sub-directory; create it first so a fresh
    # checkout does not fail when the SQLite database is opened.
    os.makedirs('_temp', exist_ok=True)

    study = optuna.create_study(
        study_name=f'study-{algo}-{clazz:02}',
        storage='sqlite:///_temp/opt-study.db',
        load_if_exists=True,
        directions=directions,
        sampler=optuna.samplers.TPESampler(seed=37),
        # NOTE(review): `objective` never reports intermediate values, so this
        # pruner appears to have no effect; kept to leave the configuration unchanged.
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
    )

    return study

def get_best_trial(algo, clazz):
    """Return the trial in the study's ``best_trials`` with the largest value at index ``clazz``.

    The study is obtained through ``get_study(algo, clazz)``.
    """
    def target_value(trial):
        return trial.values[clazz]

    return max(get_study(algo, clazz).best_trials, key=target_value)

def get_best_params(algo, clazz):
    """Return the four feature values of the best trial as a dict in fixed feature order.

    The trial comes from ``get_best_trial(algo, clazz)``; only the keys
    sepal_length, sepal_width, petal_length and petal_width are kept, in that order.
    """
    feature_order = ('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
    found = get_best_trial(algo, clazz).params
    return {feature: found[feature] for feature in feature_order}

In [165]:
# Search for the input the logistic model scores highest for class 0.
study = get_study('logistic', 0)
study.optimize(
    func=lambda trial: objective(trial, L),
    n_trials=200,
    n_jobs=6,
    show_progress_bar=True,
)
# Among the study's best trials, keep the one with the largest class-0 probability.
best = max(study.best_trials, key=lambda trial: trial.values[0])
best.params, best.values

  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

({'sepal_length': 4.9310309063381785,
  'sepal_width': 4.396022809496658,
  'petal_length': 1.0052584086592715,
  'petal_width': 0.11658350842988516},
 [0.9983674109957078, 0.0016325885206006309, 4.836916060796055e-10])

In [166]:
# Search for the input the logistic model scores highest for class 1.
study = get_study('logistic', 1)
study.optimize(
    func=lambda trial: objective(trial, L),
    n_trials=200,
    n_jobs=6,
    show_progress_bar=True,
)
# Among the study's best trials, keep the one with the largest class-1 probability.
best = max(study.best_trials, key=lambda trial: trial.values[1])
best.params, best.values

  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

({'sepal_length': 7.709515249076063,
  'sepal_width': 2.2565776194939313,
  'petal_length': 4.471929903412942,
  'petal_width': 0.16444496616896156},
 [0.0024852339111455864, 0.9958836052066797, 0.0016311608821747631])

In [167]:
# Search for the input the logistic model scores highest for class 2.
study = get_study('logistic', 2)
study.optimize(
    func=lambda trial: objective(trial, L),
    n_trials=200,
    n_jobs=6,
    show_progress_bar=True,
)
# Among the study's best trials, keep the one with the largest class-2 probability.
best = max(study.best_trials, key=lambda trial: trial.values[2])
best.params, best.values

  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

({'sepal_length': 4.535880308530154,
  'sepal_width': 2.1491224675040774,
  'petal_length': 6.897154921009948,
  'petal_width': 2.4993082784301346},
 [1.2786075607675126e-09, 6.909423630690177e-05, 0.9999309044850855])

In [168]:
# Score the three candidates found with the logistic model using the logistic model itself.
L.predict_proba(
    pd.DataFrame([get_best_params('logistic', class_id) for class_id in range(3)])
)

array([[9.98367411e-01, 1.63258852e-03, 4.83691606e-10],
       [2.48523391e-03, 9.95883605e-01, 1.63116088e-03],
       [1.27860756e-09, 6.90942363e-05, 9.99930904e-01]])

In [169]:
# Score the three candidates found with the logistic model using the random forest.
R.predict_proba(
    pd.DataFrame([get_best_params('logistic', class_id) for class_id in range(3)])
)

array([[1.        , 0.        , 0.        ],
       [0.33333333, 0.66666667, 0.        ],
       [0.        , 0.33333333, 0.66666667]])

## Best-candidate search, random forest

In [170]:
# Search for the input the random forest scores highest for class 0.
study = get_study('randomforest', 0)
study.optimize(
    func=lambda trial: objective(trial, R),
    n_trials=200,
    n_jobs=6,
    show_progress_bar=True,
)
# Among the study's best trials, keep the one with the largest class-0 probability.
best = max(study.best_trials, key=lambda trial: trial.values[0])
best.params, best.values

  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

({'sepal_length': 4.82626887479423,
  'sepal_width': 4.191078237607879,
  'petal_length': 1.0558148586698979,
  'petal_width': 0.1491680343675136},
 [1.0, 0.0, 0.0])

In [171]:
# Search for the input the random forest scores highest for class 1.
study = get_study('randomforest', 1)
study.optimize(
    func=lambda trial: objective(trial, R),
    n_trials=200,
    n_jobs=6,
    show_progress_bar=True,
)
# Among the study's best trials, keep the one with the largest class-1 probability.
best = max(study.best_trials, key=lambda trial: trial.values[1])
best.params, best.values

  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

({'sepal_length': 6.700297745575643,
  'sepal_width': 2.6927993134892745,
  'petal_length': 3.709505841815157,
  'petal_width': 1.9913641681930534},
 [0.0, 1.0, 0.0])

In [172]:
# Search for the input the random forest scores highest for class 2.
study = get_study('randomforest', 2)
study.optimize(
    func=lambda trial: objective(trial, R),
    n_trials=200,
    n_jobs=6,
    show_progress_bar=True,
)
# Among the study's best trials, keep the one with the largest class-2 probability.
best = max(study.best_trials, key=lambda trial: trial.values[2])
best.params, best.values

  self._init_valid()


  0%|          | 0/200 [00:00<?, ?it/s]

({'sepal_length': 7.152365923578177,
  'sepal_width': 3.1660797160919487,
  'petal_length': 6.502452228699957,
  'petal_width': 2.2209283171404506},
 [0.0, 0.0, 1.0])

In [173]:
# Score the three candidates found with the random forest using the logistic model.
L.predict_proba(
    pd.DataFrame([get_best_params('randomforest', class_id) for class_id in range(3)])
)

array([[9.97539157e-01, 2.46084202e-03, 1.08705105e-09],
       [2.84178539e-02, 9.03449061e-01, 6.81330856e-02],
       [4.52622748e-07, 4.90345392e-03, 9.95096093e-01]])

In [174]:
# Score the three candidates found with the random forest using the random forest itself.
R.predict_proba(
    pd.DataFrame([get_best_params('randomforest', class_id) for class_id in range(3)])
)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [192]:
# Stack the per-class best candidates of both searches, indexed by (search model tag, target species).
_X = pd.concat([
    pd.DataFrame(
        [get_best_params(algo, class_id) for class_id in range(3)],
        index=pd.MultiIndex.from_tuples(
            [(tag, species) for species in ('setosa', 'versicolor', 'virginica')],
            names=['algo', 'species'],
        ),
    )
    for algo, tag in (('logistic', 'L'), ('randomforest', 'R'))
])

# Score every candidate with both models; the column prefix names the scoring model.
_Y = pd.concat(
    [
        pd.DataFrame(L.predict_proba(_X), columns=['L_setosa', 'L_versicolor', 'L_virginica']),
        pd.DataFrame(R.predict_proba(_X), columns=['R_setosa', 'R_versicolor', 'R_virginica']),
    ],
    axis=1,
)

# Align candidates and scores by row position, then restore the (algo, species) index.
(
    _X.reset_index()
    .join(_Y)
    .set_index(['algo', 'species'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sepal_length,sepal_width,petal_length,petal_width,L_setosa,L_versicolor,L_virginica,R_setosa,R_versicolor,R_virginica
algo,species,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
L,setosa,4.931031,4.396023,1.005258,0.116584,0.9983674,0.001633,4.836916e-10,1.0,0.0,0.0
L,versicolor,7.709515,2.256578,4.47193,0.164445,0.002485234,0.995884,0.001631161,0.333333,0.666667,0.0
L,virginica,4.53588,2.149122,6.897155,2.499308,1.278608e-09,6.9e-05,0.9999309,0.0,0.333333,0.666667
R,setosa,4.826269,4.191078,1.055815,0.149168,0.9975392,0.002461,1.087051e-09,1.0,0.0,0.0
R,versicolor,6.700298,2.692799,3.709506,1.991364,0.02841785,0.903449,0.06813309,0.0,1.0,0.0
R,virginica,7.152366,3.16608,6.502452,2.220928,4.526227e-07,0.004903,0.9950961,0.0,0.0,1.0
