# 지도학습(Supervised Learning)
- 목표 변수(target, label, Y) 존재
- 설명 변수(feature, x)

## 데이터

In [1]:
import pandas as pd

# Load the Breast Cancer Wisconsin data set and preview its first two rows.
csv_path = '/content/BreastCancerWisconsinDataSet.csv'
df_BCD = pd.read_csv(csv_path)
df_BCD.head(2)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,


### features,target 선택
- target : radius_mean
- features : texture_mean, perimeter_mean, area_mean

### 데이터 전처리(Pre-Processing)

In [2]:
# Column extraction: keep only the target column and the three feature columns.
selected_columns = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean']
df_BCD_extract = df_BCD.loc[:, selected_columns]
df_BCD_extract.head(2)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean
0,17.99,10.38,122.8,1001.0
1,20.57,17.77,132.9,1326.0


In [3]:
# Count the missing (NaN) values in each selected column.
# This only detects missing values; outliers and blank / whitespace-only
# entries are not examined here and still need a separate check.
df_BCD_extract.isna().sum()

radius_mean       0
texture_mean      0
perimeter_mean    0
area_mean         0
dtype: int64

### 데이터 분리 : train과 test set 분리
- target(label)과 features 분리
- train 과 test set 분리

In [4]:
# Separate the extracted frame into the target series and the feature frame,
# then show both shapes to confirm the row counts match.
feature_columns = ['texture_mean', 'perimeter_mean', 'area_mean']
target = df_BCD_extract['radius_mean']
features = df_BCD_extract[feature_columns]
(target.shape, features.shape)

((569,), (569, 3))

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; random_state=2 fixes the shuffle
# so the split is reproducible.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
split_parts = train_test_split(features, target, test_size=0.3, random_state=2)
features_train, features_test, target_train, target_test = split_parts
# Shapes in the order: features_train, features_test, target_train, target_test.
tuple(part.shape for part in split_parts)

((398, 3), (171, 3), (398,), (171,))

## 모델

### 목표변수에 따른 모델 선택 - 연속형
- 목표변수
- 설명변수

In [6]:
from sklearn.linear_model import LinearRegression
# Create an unfitted linear regression model with default settings.
model = LinearRegression()

In [7]:
# Run training: fit the model on the training features and training target.
model.fit(features_train,target_train)

## 평가

In [8]:
from sklearn.metrics import r2_score
# Predict on the training features so the predictions can be scored against
# the true training targets.
target_train_predict = model.predict(features_train)
target_train.shape,target_train_predict.shape # true answers vs. predicted answers after training
# Reference: https://scikit-learn.org/0.17/modules/generated/sklearn.metrics.r2_score.html
# r2_score is based on squared errors (the "areas of squares" between the
# true and the predicted values).

((398,), (398,))

In [9]:
# Evaluate on the training set: R^2 between the true and predicted targets.
r2_score(target_train, target_train_predict)

0.9958836912117586

In [10]:
# Predict on the held-out test features, then show the shapes of the true
# test targets and the predictions.
target_test_predict = model.predict(features_test)
target_test.shape,target_test_predict.shape

((171,), (171,))

In [11]:
# Evaluate on the held-out test set: R^2 between the true and predicted targets.
r2_score(target_test,target_test_predict)

0.9963425578945869

#### 주요 사용 평가 도구
- MSE(Mean Squared Error, 평균 제곱 오차) : 작을수록 성능 좋음

In [15]:
from sklearn.metrics import mean_squared_error
# Mean squared error between the true and predicted test targets.
mean_squared_error(target_test,target_test_predict)
# The scale runs opposite to r2_score: a smaller MSE means a better fit,
# whereas for r2_score a larger value does.

0.05342974768990013

## 서비스
- 사용자 입력 시 주의사항 : 학습 때 사용한 포맷을 그대로 유지


In [12]:
# User input must keep the layout used for training: one row per sample with
# the columns texture_mean, perimeter_mean, area_mean in that order.
# The model was fitted on a DataFrame, so the input is wrapped in a DataFrame
# carrying the same column names. Passing a bare nested list drops the names,
# and recent scikit-learn versions warn that X has no valid feature names.
user_input = pd.DataFrame(
    [[29.5, 109.2, 101.5]],
    columns=['texture_mean', 'perimeter_mean', 'area_mean'],
)
model.predict(user_input)




array([15.82878449])

In [13]:
# Save the model to a file so it can be reused later.
import pickle # stores an in-memory object (instance) as binary data in a file

In [14]:
# Serialize the fitted model into a binary pickle file for later reuse.
model_path = 'BreastCancerWisconsin_LinearRegression.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(model, model_file)