## 의사결정나무

In [39]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
data = pd.read_csv("breast-cancer-wisconsin.csv", encoding = 'utf-8')

In [40]:
data.head()

Unnamed: 0,code,Clump_Thickness,Cell_Size,Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,0
1,1002945,5,4,4,5,7,10,3,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,4,1,1,3,2,1,3,1,1,0


In [41]:
x = data[data.columns[1:10]]
y = data[['Class']]

In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x, y, stratify = y, random_state = 42)

In [43]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_scaled_train = scaler.transform(X_train)
X_scaled_test = scaler.transform(X_test)

**분류**

In [44]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_scaled_train, Y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, Y_train)

1.0

In [45]:
from sklearn.metrics import confusion_matrix
confusion_train = confusion_matrix(Y_train, pred_train)
print("훈련데이터 오차행렬 : \n", confusion_train)

훈련데이터 오차행렬 : 
 [[333   0]
 [  0 179]]


In [46]:
from sklearn.metrics import classification_report
cfreport_train = classification_report(Y_train, pred_train)
print("분류예측 레포트 : \n", cfreport_train)

분류예측 레포트 : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       333
           1       1.00      1.00      1.00       179

    accuracy                           1.00       512
   macro avg       1.00      1.00      1.00       512
weighted avg       1.00      1.00      1.00       512



In [47]:
pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test, Y_test)

0.9532163742690059

In [48]:
confusion_test = confusion_matrix(Y_test, pred_test)
print("테스트데이터 오차행렬 : \n", confusion_test)

테스트데이터 오차행렬 : 
 [[105   6]
 [  2  58]]


In [49]:
cfreport_test = classification_report(Y_test, pred_test)
print("분류예측 레포트 : \n", cfreport_test)

분류예측 레포트 : 
               precision    recall  f1-score   support

           0       0.98      0.95      0.96       111
           1       0.91      0.97      0.94        60

    accuracy                           0.95       171
   macro avg       0.94      0.96      0.95       171
weighted avg       0.95      0.95      0.95       171



그리드서치

In [50]:
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth' : range(2,20,2), 'min_samples_leaf' : range(1,50,2)}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv = 5)
grid_search.fit(X_scaled_train, Y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(2, 20, 2),
                         'min_samples_leaf': range(1, 50, 2)})

In [51]:
print("best parameter : {}".format(grid_search.best_params_))
print("best score : {:.4f}".format(grid_search.best_score_))
print("test set score : {:.4f}".format(grid_search.score(X_scaled_test, Y_test)))

best parameter : {'max_depth': 12, 'min_samples_leaf': 1}
best score : 0.9647
test set score : 0.9532


랜덤 서치

In [52]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {'max_depth' : randint(low = 1, high = 20), 'min_samples_leaf' : randint(low = 1, high = 50)}
random_search = RandomizedSearchCV(DecisionTreeClassifier(), param_distribs, n_iter = 20, cv = 5)
random_search.fit(X_scaled_train, Y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_iter=20,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000144180928B0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000014418083730>})

In [53]:
print("best parameter : {}".format(random_search.best_params_))
print("best score : {:.4f}".format(random_search.best_score_))
print("test set score : {:.4f}".format(random_search.score(X_scaled_test, Y_test)))

best parameter : {'max_depth': 18, 'min_samples_leaf': 6}
best score : 0.9492
test set score : 0.9591


**회귀**

In [54]:
data2 = pd.read_csv('house_price.csv', encoding = 'utf-8')
data2.head()

Unnamed: 0,housing_age,income,bedrooms,households,rooms,house_value
0,23,6.777,0.141112,2.442244,8.10396,500000
1,49,6.0199,0.160984,2.726688,5.752412,500000
2,35,5.1155,0.249061,1.902676,3.888078,500000
3,32,4.7109,0.231383,1.913669,4.508393,500000
4,21,4.5625,0.255583,3.092664,4.667954,500000


In [55]:
x = data2[data2.columns[1:5]]
y = data2[['house_value']]

In [56]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)

In [57]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train)
x_scaled_train = scaler.transform(x_train)
x_scaled_test = scaler.transform(x_test)

In [58]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
model.fit(x_scaled_train, y_train)
pred_train = model.predict(x_scaled_train)
model.score(x_scaled_train, y_train)

1.0

In [59]:
pred_test= model.predict(x_scaled_test)
model.score(x_scaled_test, y_test)

0.2259193113881075

In [60]:
import numpy as np
from sklearn.metrics import mean_squared_error
MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)
print("훈련데이터 RMSE : ", np.sqrt(MSE_train))
print("평가데이터 RMSE : ", np.sqrt(MSE_test))

훈련데이터 RMSE :  0.0
평가데이터 RMSE :  84111.54113451815


그리드 서치

In [61]:
param_grid = {'max_depth' : range(2, 20, 2), 'min_samples_leaf' : range(1, 50, 2)}
grid_search = GridSearchCV(DecisionTreeRegressor(), param_grid, cv = 5)
grid_search.fit(x_scaled_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': range(2, 20, 2),
                         'min_samples_leaf': range(1, 50, 2)})

In [62]:
print("best parameter : {}".format(grid_search.best_params_))
print("best score : {:.4f}".format(grid_search.best_score_))
print("test set score : {:.4f}".format(grid_search.score(x_scaled_train, y_train)))

best parameter : {'max_depth': 8, 'min_samples_leaf': 49}
best score : 0.5592
test set score : 0.6078


랜덤 서치

In [63]:
param_distribs = {'max_depth' : randint(low = 1, high = 20), 'min_samples_leaf' : randint(low = 1, high = 50)}
random_search = RandomizedSearchCV(DecisionTreeRegressor(), param_distribs, n_iter = 20, cv = 5)
random_search.fit(x_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_iter=20,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000144180DA4C0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000144180DA850>})

In [64]:
print("best parameter : {}".format(random_search.best_params_))
print("best score : {:.4f}".format(random_search.best_score_))
print("test set score : {:.4f}".format(random_search.score(x_scaled_test, y_test)))

best parameter : {'max_depth': 16, 'min_samples_leaf': 49}
best score : 0.5586
test set score : 0.5767


## 랜덤포레스트

**분류**

In [66]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_scaled_train, Y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, Y_train)

1.0

In [67]:
from sklearn.metrics import confusion_matrix
confusion_train = confusion_matrix(Y_train, pred_train)
print("훈련데이터 오차행렬 : \n", confusion_train)

훈련데이터 오차행렬 : 
 [[333   0]
 [  0 179]]


In [68]:
from sklearn.metrics import classification_report
cfreport_train = classification_report(Y_train, pred_train)
print("분류예측 레포트 : \n", cfreport_train)

분류예측 레포트 : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       333
           1       1.00      1.00      1.00       179

    accuracy                           1.00       512
   macro avg       1.00      1.00      1.00       512
weighted avg       1.00      1.00      1.00       512



In [70]:
pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test, Y_test)

0.9649122807017544

In [71]:
confusion_test = confusion_matrix(Y_test, pred_test)
print("평가데이터 오차행렬 : \n", confusion_test)

평가데이터 오차행렬 : 
 [[106   5]
 [  1  59]]


In [72]:
cfreport_test = classification_report(Y_test, pred_test)
print("분류예측 레포트 : \n", cfreport_test)

분류예측 레포트 : 
               precision    recall  f1-score   support

           0       0.99      0.95      0.97       111
           1       0.92      0.98      0.95        60

    accuracy                           0.96       171
   macro avg       0.96      0.97      0.96       171
weighted avg       0.97      0.96      0.97       171



그리드 서치

In [73]:
param_grid = {'n_estimators' : range(100, 1000, 100), 'max_features' : ['auto', 'log2']}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv = 5)
grid_search.fit(X_scaled_train, Y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': ['auto', 'log2'],
                         'n_estimators': range(100, 1000, 100)})

In [74]:
print("best parameter : {}".format(grid_search.best_params_))
print("best score : {:.4f}".format(grid_search.best_score_))
print("test set score : {:.4f}".format(grid_search.score(X_scaled_test, Y_test)))

best parameter : {'max_features': 'auto', 'n_estimators': 400}
best score : 0.9746
test set score : 0.9649


랜덤 서치

In [76]:
param_distribs = {'n_estimators' : randint(low = 100, high = 1000), 'max_features' : ['auto', 'log2']}
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distribs, n_iter = 20, cv = 5)
random_search.fit(X_scaled_train, Y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=20,
                   param_distributions={'max_features': ['auto', 'log2'],
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000014419307160>})

In [81]:
print("best parameter : {}".format(random_search.best_params_))
print("best score : {:.4f}".format(random_search.best_score_))
print("test set score : {:.4f}".format(random_search.score(X_scaled_test, Y_test)))

best parameter : {'max_features': 'auto', 'n_estimators': 677}
best score : 0.9746
test set score : 0.9649


**회귀**

In [78]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(x_scaled_train, y_train)
pred_train = model.predict(x_scaled_train)
model.score(x_scaled_train, y_train)

0.9382264585936675

In [79]:
pred_test = model.predict(x_scaled_test)
model.score(x_scaled_test, y_test)

0.5849130790499839

In [80]:
import numpy as np
from sklearn.metrics import mean_squared_error
MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)
print("훈련데이터 RMSE : ", np.sqrt(MSE_train))
print("평가데이터 RMSE : ", np.sqrt(MSE_test))

훈련데이터 RMSE :  23722.05419568687
평가데이터 RMSE :  61593.09088759614


그리드 서치

In [82]:
param_grid = {'n_estimators' : range(100, 500, 100), 'max_features' : ['auto', 'sqrt', 'log2']}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv = 5)
grid_search.fit(x_scaled_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': range(100, 500, 100)})

In [83]:
print("best parameter : {}".format(grid_search.best_params_))
print("best score : {:.4f}".format(grid_search.best_score_))
print("test set score : {:.4f}".format(grid_search.score(x_scaled_test, y_test)))

best parameter : {'max_features': 'log2', 'n_estimators': 400}
best score : 0.5690
test set score : 0.5946


랜덤 서치

In [84]:
param_distribs = {'n_estimators' : randint(low = 100, high = 500), 'max_features' : ['auto', 'sqrt', 'log2']}
random_search = RandomizedSearchCV(RandomForestRegressor(), param_distribs, n_iter = 20, cv = 5)
random_search.fit(x_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=20,
                   param_distributions={'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001441937F190>})

In [85]:
print("best parameter : {}".format(random_search.best_params_))
print("best score : {:.4f}".format(random_search.best_score_))
print("test set score : {:.4f}".format(random_search.score(x_scaled_test, y_test)))

best parameter : {'max_features': 'sqrt', 'n_estimators': 452}
best score : 0.5688
test set score : 0.5947
