In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from scipy.optimize import Bounds, minimize

In [2]:
# data from https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data

# Column labels handed to read_csv through `names=`.
csv_col_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',  # stray trailing space removed so the label can be used as typed
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Build the frame in one chain so re-running the cell always starts from the file.
df = (
    pd.read_csv('uci-breast-cancer.csv',
                names=csv_col_names,
                na_values='?')                 # '?' cells are parsed as NaN
    .dropna()                                  # discard rows with any missing value
    .replace({'Class': {2: 0, 4: 1}})          # recode labels: 2 -> 0, 4 -> 1
)
df.head()
# Original note: df.loc[df['Sample code number'] == 1096800] is a row with a
# missing value (it is no longer present once dropna() has run).

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,0
1,1002945,5,4,4,5,7,10.0,3,2,1,0
2,1015425,3,1,1,1,2,2.0,3,1,1,0
3,1016277,6,8,8,1,3,4.0,3,7,1,0
4,1017023,4,1,1,3,2,1.0,3,1,1,0


In [3]:
# Drop the identifier column. Index.difference returns the remaining labels
# sorted, so df's columns end up in alphabetical order.
df = df.loc[:, df.columns.difference(['Sample code number'])]

# Labels, then the feature matrix (every remaining column except 'Class',
# in the same sorted order df already has).
y = df['Class'].values
X = df.drop(columns=['Class']).values

In [4]:
# test_size=0.3 reserves 30% of the rows for scoring; random_state=0 pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0,
)

In [55]:
# Search space for the random-forest hyperparameters, one dict per parameter:
#   name  - keyword passed to RandomForestClassifier
#   bound - [lower, upper] box constraint handed to the optimizer
#   dtype - how test_rf converts the optimizer's value (float / int / bool)
#   init  - starting value for the optimizer
# The order of the entries defines the order of the optimizer's vector.
hparam_configs = [
    {'name': hp_name, 'bound': [lower, upper], 'dtype': hp_dtype, 'init': start}
    for hp_name, lower, upper, hp_dtype, start in (
        ('n_estimators',             10,  1000,  int,   10),
        ('max_features',             0.1, 1,     float, 1),
        ('max_depth',                1,   100,   int,   100),
        ('min_samples_split',        2,   20,    int,   2),
        ('min_samples_leaf',         1,   20,    int,   1),
        ('min_weight_fraction_leaf', 0,   0.5,   float, 0),
        ('max_leaf_nodes',           10,  10000, int,   10000),
        ('min_impurity_decrease',    0,   0.01,  float, 0),
        ('bootstrap',                0,   1,     bool,  1),
    )
]

In [56]:
def test_rf(x):
    """Objective function: fit a random forest from a hyperparameter vector.

    Parameters
    ----------
    x : sequence of numbers
        Candidate hyperparameter values, matched positionally to
        ``hparam_configs``. The pairing uses ``zip``, so when ``x`` is shorter
        than ``hparam_configs`` only the leading hyperparameters are set and
        the rest are left at the classifier's defaults.

    Returns
    -------
    float
        ``1 - rf.score(X_test, y_test)`` for a forest fitted on
        ``(X_train, y_train)``.

    Raises
    ------
    ValueError
        If a config entry declares a ``dtype`` other than float, bool or int.
    """
    # NOTE(review): the value being minimised is computed on X_test, so the
    # tuned score is not an independent estimate — consider a validation split.
    rf_kwargs = {}
    for param_val, config in zip(x, hparam_configs):
        name, dtype = config['name'], config['dtype']
        if dtype == float:
            # Cast explicitly: an integer such as 1 would otherwise reach
            # scikit-learn as an int, and for max_features an int is a feature
            # count while a float is a fraction of the features.
            rf_kwargs[name] = float(param_val)
        elif dtype == bool:
            # Round to the nearest of 0/1 before converting.
            rf_kwargs[name] = bool(np.round(param_val))
        elif dtype == int:
            # Round to the nearest integer; the objective is therefore flat
            # between rounding boundaries for these parameters.
            rf_kwargs[name] = int(np.round(param_val))
        else:
            # Previously such an entry was skipped without any signal.
            raise ValueError(
                'unsupported dtype {!r} for hyperparameter {!r}'.format(dtype, name))

    rf = RandomForestClassifier(random_state=0, n_jobs=-1, **rf_kwargs)
    rf = rf.fit(X_train, y_train)
    return 1 - rf.score(X_test, y_test)

In [57]:
# Box constraints and starting point, in hparam_configs order.
hparam_bounds = [hp['bound'] for hp in hparam_configs]
initial_hparams = [hp['init'] for hp in hparam_configs]

# No `method` is passed, so SciPy selects the solver itself.
# NOTE(review): the recorded output reports nit: 0 with a zero jac. test_rf
# rounds its int/bool inputs, so tiny perturbations presumably leave the
# objective unchanged and the estimated gradient is zero — confirm.
# NOTE(review): the recorded `x` has a single element, so that output appears
# to come from a run with a one-entry hparam_configs — re-run to refresh.
res = minimize(test_rf, initial_hparams, bounds=hparam_bounds)
print(res)

      fun: 0.06341463414634141
 hess_inv: <1x1 LbfgsInvHessProduct with dtype=float64>
      jac: array([0.])
  message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
     nfev: 2
      nit: 0
   status: 0
  success: True
        x: array([10.])


In [40]:
# Box constraints and starting point, in hparam_configs order.
hparam_bounds = [hp['bound'] for hp in hparam_configs]
initial_hparams = [hp['init'] for hp in hparam_configs]

# Same search as the default-method run, but forcing the TNC solver with a
# tight tolerance and solver messages enabled.
# NOTE(review): the recorded output stops at nit: 0 with an all-zero jac and
# `x` equal to the starting values; test_rf rounds its int/bool inputs, which
# presumably makes the estimated gradient zero — confirm.
res = minimize(
    test_rf,
    initial_hparams,
    method='TNC',
    bounds=hparam_bounds,
    tol=1e-10,
    options=dict(disp=True),
)
print(res)

     fun: 0.06829268292682922
     jac: array([0., 0., 0., 0., 0., 0., 0., 0.])
 message: 'Local minimum reached (|pg| ~= 0)'
    nfev: 1
     nit: 0
  status: 0
 success: True
       x: array([ 10.,   1., 100.,   2.,   1.,   0.,   0.,   1.])


In [36]:
# Box constraints and starting point, in hparam_configs order.
hparam_bounds = [hp['bound'] for hp in hparam_configs]
initial_hparams = [hp['init'] for hp in hparam_configs]

# L-BFGS-B with every solver option spelled out.
# NOTE(review): the recorded output still shows nit: 0, an all-zero jac and
# `x` equal to the starting values. With 'eps' at 1e-10 and test_rf rounding
# its int/bool inputs, the estimated gradient is presumably zero — confirm.
res = minimize(
    test_rf,
    initial_hparams,
    method='L-BFGS-B',
    bounds=hparam_bounds,
    options={
        'disp': True,
        'maxcor': 1000,
        'ftol': 2.220446049250313e-09,
        'gtol': 1e-10,
        'eps': 1e-10,
        'maxfun': 15000,
        'maxiter': 15000,
        'iprint': -1,
        'maxls': 20,
    },
)
print(res)

      fun: 0.9317073170731708
 hess_inv: <9x9 LbfgsInvHessProduct with dtype=float64>
      jac: array([0., 0., 0., 0., 0., 0., 0., 0., 0.])
  message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
     nfev: 10
      nit: 0
   status: 0
  success: True
        x: array([1.e+01, 1.e+00, 1.e+02, 2.e+00, 1.e+00, 0.e+00, 1.e+04, 0.e+00,
       1.e+00])


In [54]:
# Spot-check the objective at a two-element point. test_rf pairs values with
# hparam_configs positionally via zip, so only the first two configured
# hyperparameters receive a value here.
test_rf(x=[10, 0.5])

0.06341463414634141