# 암 데이터 분석

## 라이브러리 로딩 

In [132]:
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, StratifiedKFold
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrix
import statsmodels.api as sm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline
plt.rc('font', family='D2coding')
%config Completer.use_jedi = False

## 데이터셋 로딩

In [117]:
cancer = load_breast_cancer()
print(cancer.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [119]:
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
df.sample(frac=1).reset_index(drop=True)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,16.740,21.59,110.10,869.5,0.09610,0.13360,0.13480,0.06018,0.1896,0.05656,...,29.02,133.50,1229.0,0.15630,0.3835,0.54090,0.18130,0.4863,0.08633,0
1,14.480,21.46,94.25,648.2,0.09444,0.09947,0.12040,0.04938,0.2075,0.05636,...,29.25,108.40,808.9,0.13060,0.1976,0.33490,0.12250,0.3020,0.06846,0
2,8.878,15.49,56.74,241.0,0.08293,0.07698,0.04721,0.02381,0.1930,0.06621,...,17.70,65.27,302.0,0.10150,0.1248,0.09441,0.04762,0.2434,0.07431,1
3,18.770,21.43,122.90,1092.0,0.09116,0.14020,0.10600,0.06090,0.1953,0.06083,...,34.37,161.10,1873.0,0.14980,0.4827,0.46340,0.20480,0.3679,0.09870,0
4,19.210,18.57,125.50,1152.0,0.10530,0.12670,0.13230,0.08994,0.1917,0.05961,...,28.14,170.10,2145.0,0.16240,0.3511,0.38790,0.20910,0.3537,0.08294,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,10.480,14.98,67.49,333.6,0.09816,0.10130,0.06335,0.02218,0.1925,0.06915,...,21.57,81.41,440.4,0.13270,0.2996,0.29390,0.09310,0.3020,0.09646,1
565,20.480,21.46,132.50,1306.0,0.08355,0.08348,0.09042,0.06022,0.1467,0.05177,...,26.17,161.70,1750.0,0.12280,0.2311,0.31580,0.14450,0.2238,0.07127,0
566,19.070,24.81,128.30,1104.0,0.09081,0.21900,0.21070,0.09961,0.2310,0.06343,...,33.17,177.40,1651.0,0.12470,0.7444,0.72420,0.24930,0.4670,0.10380,0
567,12.960,18.29,84.18,525.2,0.07351,0.07899,0.04057,0.01883,0.1874,0.05899,...,24.61,96.31,621.9,0.09329,0.2318,0.16040,0.06608,0.3207,0.07247,1


In [120]:
scaler = MinMaxScaler()

df_scaled = df.iloc[:, :-1]  # 마지막열임을 나타내는 -1은 포함하지 않음
scaler.fit(df_scaled)
df_scaled = scaler.transform(df_scaled)

# 스케일링 변환된 값을 데이터프레임에 반영
df.iloc[:, :-1] = df_scaled[:, :]
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,...,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864,0
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878,0
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433,0
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711,0
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,...,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595,0


In [121]:
df.isnull().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64

In [122]:
order = df.corr().loc[:'worst fractal dimension', 'target'].abs().sort_values(ascending=False)
order

worst concave points       0.793566
worst perimeter            0.782914
mean concave points        0.776614
worst radius               0.776454
mean perimeter             0.742636
worst area                 0.733825
mean radius                0.730029
mean area                  0.708984
mean concavity             0.696360
worst concavity            0.659610
mean compactness           0.596534
worst compactness          0.590998
radius error               0.567134
perimeter error            0.556141
area error                 0.548236
worst texture              0.456903
worst smoothness           0.421465
worst symmetry             0.416294
mean texture               0.415185
concave points error       0.408042
mean smoothness            0.358560
mean symmetry              0.330499
worst fractal dimension    0.323872
compactness error          0.292999
concavity error            0.253730
fractal dimension error    0.077972
smoothness error           0.067016
mean fractal dimension     0

In [128]:
# 학습데이터와 테스트데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    df[['worst concave points', 'worst perimeter', 'mean concave points',
        'worst radius', 'mean perimeter', 'worst area', 'mean radius',
        'mean area', 'mean concavity', 'worst concavity', 'mean compactness',
        'worst compactness', 'radius error', 'perimeter error', 'area error',
        'worst texture', 'worst smoothness', 'worst symmetry',
        'worst fractal dimension']],
    df.iloc[:, -1], test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(455, 19) (455,)
(114, 19) (114,)


## 다중회귀

In [129]:
poly = PolynomialFeatures(degree=1, include_bias=False)
poly.fit(X_train)
train_poly = poly.transform(X_train)
# 컬럼명 추출
print(poly.get_feature_names())
test_poly = poly.transform(X_test)

['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18']


## 선형회귀

In [130]:
# 선형 회귀 모형
lr = LinearRegression()
lr.fit(train_poly, y_train)

print("회귀계수(기울기):", np.round(lr.coef_, 1))  # np: numpy, coef_: 피처에 대한 회귀 계수 값
print("상수항(절편):", np.round(lr.intercept_, 1))  # intercept_: 상수항(절편) 값

회귀계수(기울기): [ 0.1  2.3 -1.5 -5.2 -5.6  2.7  5.5 -0.   0.3 -0.4  1.6 -0.6 -1.2 -0.6
  1.3 -0.4 -0.3 -0.3 -0.3]
상수항(절편): 1.7


## 성능평가

In [131]:
y_train_pred = lr.predict(train_poly)
y_test_pred = lr.predict(test_poly)

train_mse = mean_squared_error(y_train, y_train_pred) #훈련 데이터의 평가 점수

print("Train MSE : %.4f" % train_mse)
print('Train RMSE : %.4f' % np.sqrt(train_mse))
print('Train score : %.4f' % lr.score(train_poly, y_train))

test_mse = mean_squared_error(y_test, y_test_pred)
print("Test MSE : %.4f" % test_mse)
print('Test RMSE : %.4f' % np.sqrt(test_mse))
print('Test score : %.4f' % lr.score(test_poly, y_test))

Train MSE : 0.0549
Train RMSE : 0.2344
Train score : 0.7646
Test MSE : 0.0598
Test RMSE : 0.2446
Test score : 0.7452
