# RandomForest without Optuna

In [1]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', cache=False)

In [2]:
X = mnist.data.astype('float32')
y = mnist.target.astype('int64')

X /= 255.0
print(X.min(), X.max())

pixel1      0.0
pixel2      0.0
pixel3      0.0
pixel4      0.0
pixel5      0.0
           ... 
pixel780    0.0
pixel781    0.0
pixel782    0.0
pixel783    0.0
pixel784    0.0
Length: 784, dtype: float64 pixel1      0.000000
pixel2      0.000000
pixel3      0.000000
pixel4      0.000000
pixel5      0.000000
              ...   
pixel780    0.243137
pixel781    0.000000
pixel782    0.000000
pixel783    0.000000
pixel784    0.000000
Length: 784, dtype: float64


In [3]:
X = X[:10000]
y = y[:10000]

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(accuracy_score(y_train, classifier.predict(X_train)))
print(accuracy_score(y_test, y_pred))

1.0
0.9496


# RandomForest using Optuna

In [6]:
!pip install optuna

Collecting optuna
  Downloading optuna-2.10.0-py3-none-any.whl (308 kB)
[?25l[K     |█                               | 10 kB 19.9 MB/s eta 0:00:01[K     |██▏                             | 20 kB 22.7 MB/s eta 0:00:01[K     |███▏                            | 30 kB 10.9 MB/s eta 0:00:01[K     |████▎                           | 40 kB 4.6 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 4.6 MB/s eta 0:00:01[K     |██████▍                         | 61 kB 5.4 MB/s eta 0:00:01[K     |███████▍                        | 71 kB 5.5 MB/s eta 0:00:01[K     |████████▌                       | 81 kB 4.3 MB/s eta 0:00:01[K     |█████████▋                      | 92 kB 4.8 MB/s eta 0:00:01[K     |██████████▋                     | 102 kB 5.3 MB/s eta 0:00:01[K     |███████████▊                    | 112 kB 5.3 MB/s eta 0:00:01[K     |████████████▊                   | 122 kB 5.3 MB/s eta 0:00:01[K     |█████████████▉                  | 133 kB 5.3 MB/s eta 0:00:01[K 

In [10]:
import optuna 
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

def objectiveRF(trial: Trial, X, y):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 700, 3000),
        'max_depth': trial.suggest_int('max_depth', 4, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60),
    }
    
    model = RandomForestClassifier(**param)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)    
    model.fit(X_train,y_train,
        #verbose=False,
    )

    y_pred = model.predict(X_test)

    score = accuracy_score(y_test, y_pred)

    return score

In [11]:
# direction : 목표 값을 최대화할 것인지, 최소화할 것인지에 대한 설정
study = optuna.create_study(direction='maximize',sampler=TPESampler())

# n_trials : 시도 횟수
study.optimize(lambda trial : objectiveRF(trial, X,  y), n_trials=50)
print(f'Best trial: score {study.best_trial.value},\nparams {study.best_trial.params}')

[32m[I 2022-04-24 06:37:42,693][0m A new study created in memory with name: no-name-2e6f0528-f0a6-4670-ab86-e12ac271ed4d[0m
[32m[I 2022-04-24 06:38:08,591][0m Trial 0 finished with value: 0.9116 and parameters: {'n_estimators': 1213, 'max_depth': 48, 'min_samples_split': 114, 'min_samples_leaf': 16}. Best is trial 0 with value: 0.9116.[0m
[32m[I 2022-04-24 06:38:57,286][0m Trial 1 finished with value: 0.8988 and parameters: {'n_estimators': 2560, 'max_depth': 24, 'min_samples_split': 134, 'min_samples_leaf': 31}. Best is trial 0 with value: 0.9116.[0m
[32m[I 2022-04-24 06:39:49,365][0m Trial 2 finished with value: 0.8972 and parameters: {'n_estimators': 2796, 'max_depth': 23, 'min_samples_split': 46, 'min_samples_leaf': 51}. Best is trial 0 with value: 0.9116.[0m
[32m[I 2022-04-24 06:40:29,374][0m Trial 3 finished with value: 0.922 and parameters: {'n_estimators': 1726, 'max_depth': 22, 'min_samples_split': 80, 'min_samples_leaf': 6}. Best is trial 3 with value: 0.922.[0

Best trial: score 0.956,
params {'n_estimators': 1286, 'max_depth': 43, 'min_samples_split': 3, 'min_samples_leaf': 1}


In [12]:
classifier = RandomForestClassifier(**study.best_params)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(accuracy_score(y_train, classifier.predict(X_train)))
print(accuracy_score(y_test, y_pred))

1.0
0.9552


In [13]:
# 하이퍼파라미터별 중요도를 확인할 수 있는 그래프
optuna.visualization.plot_param_importances(study)


In [14]:
# 하이퍼파라미터 최적화 과정을 확인
optuna.visualization.plot_optimization_history(study)