In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
data = pd.read_csv("breast-cancer-wisconsin.csv", encoding = 'utf-8')

In [2]:
data.head()

Unnamed: 0,code,Clump_Thickness,Cell_Size,Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,0
1,1002945,5,4,4,5,7,10,3,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,4,1,1,3,2,1,3,1,1,0


In [3]:
x = data[data.columns[1:10]]
y = data[['Class']]

In [4]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, random_state = 42)

In [5]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train)
x_scaled_train = scaler.transform(x_train)
x_scaled_test = scaler.transform(x_test)

## 로지스틱 회귀모델

In [6]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_scaled_train, y_train)
pred_train = model.predict(x_scaled_train)
model.score(x_scaled_train, y_train)

0.97265625

In [7]:
from sklearn.metrics import confusion_matrix
confusion_train = confusion_matrix(y_train, pred_train)
print("훈련데이터 오차행렬 : \n", confusion_train)

훈련데이터 오차행렬 : 
 [[328   5]
 [  9 170]]


In [8]:
from sklearn.metrics import classification_report
cfreport_train = classification_report(y_train, pred_train)
print("분류예측 레포트 : \n", cfreport_train)

분류예측 레포트 : 
               precision    recall  f1-score   support

           0       0.97      0.98      0.98       333
           1       0.97      0.95      0.96       179

    accuracy                           0.97       512
   macro avg       0.97      0.97      0.97       512
weighted avg       0.97      0.97      0.97       512



In [9]:
pred_test = model.predict(x_scaled_test)
model.score(x_scaled_test, y_test)

0.9590643274853801

In [10]:
confusion_test = confusion_matrix(y_test, pred_test)
print("테스트데이터 오차행렬 : \n", confusion_test)

테스트데이터 오차행렬 : 
 [[106   5]
 [  2  58]]


In [11]:
from sklearn.metrics import classification_report
cfreport_test = classification_report(y_test, pred_test)
print("분류예측 레포트 : \n", cfreport_test)

분류예측 레포트 : 
               precision    recall  f1-score   support

           0       0.98      0.95      0.97       111
           1       0.92      0.97      0.94        60

    accuracy                           0.96       171
   macro avg       0.95      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



그리드서치

In [12]:
param_grid = {'C' : [0.001, 0.01, 0.1, 1, 10, 100]}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv = 5)
grid_search.fit(x_scaled_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]})

In [13]:
print("beat parameter : {}".format(grid_search.best_params_))
print("best score : {:.4f}".format(grid_search.best_score_))
print("Test set score : {:.4f}".format(grid_search.score(x_scaled_test, y_test)))

beat parameter : {'C': 10}
best score : 0.9726
Test set score : 0.9591


랜덤서치

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distribs = {'C' : randint(low = 0.001, high = 100)}
random_search = RandomizedSearchCV(LogisticRegression(), param_distributions = param_distribs, n_iter = 100, cv = 5)
random_search.fit(x_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=LogisticRegression(), n_iter=100,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000022A6974DB50>})

In [15]:
print("best parameter : {}".format(random_search.best_params_))
print("best score : {:.4f}".format(random_search.best_score_))
print("test set score : {:.4f}".format(random_search.score(x_scaled_test, y_test)))

best parameter : {'C': 12}
best score : 0.9745
test set score : 0.9591


## K-최근접이웃법

**분류**

In [16]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(x_scaled_train, y_train)
pred_train = model.predict(x_scaled_train)
model.score(x_scaled_train, y_train)

0.984375

In [17]:
from sklearn.metrics import confusion_matrix
confusion_train = confusion_matrix(y_train, pred_train)
print("훈련데이터 오차행렬 : \n", confusion_train)

훈련데이터 오차행렬 : 
 [[331   2]
 [  6 173]]


In [18]:
from sklearn.metrics import classification_report
cfreport_train = classification_report(y_train, pred_train)
print("분류예측 레포트 : \n", cfreport_train)

분류예측 레포트 : 
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       333
           1       0.99      0.97      0.98       179

    accuracy                           0.98       512
   macro avg       0.99      0.98      0.98       512
weighted avg       0.98      0.98      0.98       512



In [19]:
pred_test = model.predict(x_scaled_test)
model.score(x_scaled_test, y_test)

0.9532163742690059

In [20]:
confusion_test = confusion_matrix(y_test, pred_test)
print("테스트데이터 오차행렬 : \n", confusion_test)

테스트데이터 오차행렬 : 
 [[106   5]
 [  3  57]]


In [21]:
cfreport_test = classification_report(y_test, pred_test)
print("분류예측 레포트 : \n", cfreport_test)

분류예측 레포트 : 
               precision    recall  f1-score   support

           0       0.97      0.95      0.96       111
           1       0.92      0.95      0.93        60

    accuracy                           0.95       171
   macro avg       0.95      0.95      0.95       171
weighted avg       0.95      0.95      0.95       171



그리드 서치

In [24]:
param_grid = {'n_neighbors' : [1, 3, 5, 7, 9, 11]}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv = 5)
grid_search.fit(x_scaled_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11]})

In [25]:
print("best parameter : {}".format(grid_search.best_params_))
print("best score : {:.4f}".format(grid_search.best_score_))
print("test set score : {:.4f}".format(grid_search.score(x_scaled_test, y_test)))

best parameter : {'n_neighbors': 3}
best score : 0.9824
test set score : 0.9532


랜덤 서치

In [28]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {'n_neighbors' : randint(low = 1, high = 20)}
random_search = RandomizedSearchCV(KNeighborsClassifier(), param_distribs, n_iter = 20, cv = 5)
random_search.fit(x_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(), n_iter=20,
                   param_distributions={'n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000022A69B53460>})

In [29]:
print("best parameter : {}".format(random_search.best_params_))
print("best score : {:.4f}".format(random_search.best_score_))
print("test set score : {:.4f}".format(random_search.score(x_scaled_test, y_test)))

best parameter : {'n_neighbors': 3}
best score : 0.9824
test set score : 0.9532


**회귀**

In [31]:
data2 = pd.read_csv('house_price.csv', encoding = 'utf-8')
data2.head()

Unnamed: 0,housing_age,income,bedrooms,households,rooms,house_value
0,23,6.777,0.141112,2.442244,8.10396,500000
1,49,6.0199,0.160984,2.726688,5.752412,500000
2,35,5.1155,0.249061,1.902676,3.888078,500000
3,32,4.7109,0.231383,1.913669,4.508393,500000
4,21,4.5625,0.255583,3.092664,4.667954,500000


In [32]:
x = data2[data2.columns[1:5]]
y = data2[['house_value']]

In [48]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)

In [49]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train)
x_scaled_train = scaler.transform(x_train)
x_scaled_test = scaler.transform(x_test)

In [52]:
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor()
model.fit(x_scaled_train, y_train)
pred_train = model.predict(x_scaled_train)
model.score(x_scaled_train, y_train)

0.6804607237174459

In [53]:
pred_test = model.predict(x_scaled_test)
model.score(x_scaled_test, y_test)

0.5541889571372401

In [54]:
import numpy as np
from sklearn.metrics import mean_squared_error
MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)
print("훈련데이터 RMSE : ", np.sqrt(MSE_train))
print("평가데이터 RMSE : ", np.sqrt(MSE_test))

훈련데이터 RMSE :  53952.69804097723
평가데이터 RMSE :  63831.91662964773


그리드 서치

In [55]:
param_grid = {'n_neighbors' : [1, 3, 5, 7, 9, 11]}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv = 5)
grid_search.fit(x_scaled_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11]})

In [56]:
print("best parameter : {}".format(grid_search.best_params_))
print("best score : {:.4f}".format(grid_search.best_score_))
print("test set score : {:.4f}".format(grid_search.score(x_scaled_test, y_test)))

best parameter : {'n_neighbors': 11}
best score : 0.5638
test set score : 0.5880


랜덤 서치

In [57]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

param_distribs = {'n_neighbors' : randint(low = 1, high = 20)}
random_search = RandomizedSearchCV(KNeighborsRegressor(), param_distribs, n_iter = 20, cv = 5)
random_search.fit(x_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsRegressor(), n_iter=20,
                   param_distributions={'n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000022A69895FA0>})

In [58]:
print("best parameter : {}".format(random_search.best_params_))
print("best score : {:.4f}".format(random_search.best_score_))
print("test set score : {:.4f}".format(random_search.score(x_scaled_test, y_test)))

best parameter : {'n_neighbors': 18}
best score : 0.5769
test set score : 0.5992
