## ensemble(RandomForest)

In [15]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Load the breast-cancer dataset shipped with scikit-learn and pull out the
# feature matrix and the label vector.
cancer = load_breast_cancer()
cancer_data, cancer_target = cancer['data'], cancer['target']

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    cancer_data,
    cancer_target,
    test_size=0.3,
    random_state=11,
)

# Three classifiers to compare: a linear baseline, a single depth-limited
# tree, and a random forest of 100 shallow trees.
model = LogisticRegression(solver='liblinear')
model1 = DecisionTreeClassifier(max_depth=5)
model2 = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0)

# Fit in the same order as the estimators were declared.
for _estimator in (model, model1, model2):
    _estimator.fit(X_train, y_train)

# Test-set predictions, one array per estimator.
y_pred, y_pred1, y_pred2 = [
    _estimator.predict(X_test) for _estimator in (model, model1, model2)
]

# Print train/test scores side by side for each estimator
# (score() is mean accuracy for scikit-learn classifiers).
for _banner, _estimator in (
    ('--------로지스틱 회귀 분석--------', model),
    ('--------의사결정 트리 분류--------', model1),
    ('--------랜덤 포레스트 분류--------', model2),
):
    print(_banner)
    print('훈련셋 : ', _estimator.score(X_train, y_train))
    print('테스트셋 : ', _estimator.score(X_test, y_test))

--------로지스틱 회귀 분석--------
훈련셋 :  0.9673366834170855
테스트셋 :  0.935672514619883
--------의사결정 트리 분류--------
훈련셋 :  0.9899497487437185
테스트셋 :  0.9473684210526315
--------랜덤 포레스트 분류--------
훈련셋 :  0.9899497487437185
테스트셋 :  0.9532163742690059


## RandomForest HyperParameter Tuning
- GridSearchCV를 활용해서 parameters를 주고 최적의 파라미터와 최고 정확도를 찾는다

In [19]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid: 3 values for each of 4 parameters, i.e. 81
# candidate combinations.
parameters = dict(
    max_depth=[8, 16, 24],
    min_samples_leaf=[1, 6, 12],
    min_samples_split=[2, 8, 16],
    n_estimators=[10, 50, 100],
)

# n_jobs=-1 asks scikit-learn to use all available processors.
# NOTE(review): both the forest and the grid search request n_jobs=-1;
# confirm that nesting the two parallel settings is intended.
model = RandomForestClassifier(random_state=0, n_jobs=-1)

# Exhaustive search over the grid, scored with 2-fold cross-validation on
# the training split.
grid_cv = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    cv=2,
    n_jobs=-1,
)
grid_cv.fit(X_train, y_train)

# Report the winning estimator and its mean cross-validated score.
# NOTE(review): the estimator repr lists only non-default parameters;
# grid_cv.best_params_ would show every tuned value explicitly.
print('-------------------------------------------')
for _label, _value in (
    ('최적의 파라미터 : ', grid_cv.best_estimator_),
    ('최고 예측 정확도 : ', grid_cv.best_score_),
):
    print(_label, _value)

-------------------------------------------
최적의 파라미터 :  RandomForestClassifier(max_depth=8, n_estimators=50, n_jobs=-1, random_state=0)
최고 예측 정확도 :  0.9547738693467336
