# 암 데이터 분석

## 라이브러리 로딩 

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, StratifiedKFold
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrix
import statsmodels.api as sm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline
plt.rc('font', family='D2coding')
%config Completer.use_jedi = False

## 데이터셋 로딩

In [2]:
cancer = load_breast_cancer()
print(cancer.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [3]:
print(cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [4]:
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
df.sample(frac=1).reset_index(drop=True)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,15.30,25.27,102.40,732.4,0.10820,0.16970,0.168300,0.087510,0.1926,0.06540,...,36.71,149.30,1269.0,0.1641,0.61100,0.63350,0.20240,0.4027,0.09876,0
1,16.50,18.29,106.60,838.1,0.09686,0.08468,0.058620,0.048350,0.1495,0.05593,...,25.45,117.20,1009.0,0.1338,0.16790,0.16630,0.09123,0.2394,0.06469,1
2,11.41,10.82,73.34,403.3,0.09373,0.06685,0.035120,0.026230,0.1667,0.06113,...,15.97,83.74,510.5,0.1548,0.23900,0.21020,0.08958,0.3016,0.08523,1
3,21.37,15.10,141.30,1386.0,0.10010,0.15150,0.193200,0.125500,0.1973,0.06183,...,21.84,152.10,1535.0,0.1192,0.28400,0.40240,0.19660,0.2730,0.08666,0
4,13.05,13.84,82.71,530.6,0.08352,0.03735,0.004559,0.008829,0.1453,0.05518,...,17.40,93.96,672.4,0.1016,0.05847,0.01824,0.03532,0.2107,0.06580,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,14.53,13.98,93.86,644.2,0.10990,0.09242,0.068950,0.064950,0.1650,0.06121,...,16.93,103.10,749.9,0.1347,0.14780,0.13730,0.10690,0.2606,0.07810,1
565,14.64,16.85,94.21,666.0,0.08641,0.06698,0.051920,0.027910,0.1409,0.05355,...,25.44,106.00,831.0,0.1142,0.20700,0.24370,0.07828,0.2455,0.06596,1
566,11.26,19.96,73.72,394.1,0.08020,0.11810,0.092740,0.055880,0.2595,0.06233,...,22.33,78.27,437.6,0.1028,0.18430,0.15460,0.09314,0.2955,0.07009,1
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.351400,0.152000,0.2397,0.07016,...,39.42,184.60,1821.0,0.1650,0.86810,0.93870,0.26500,0.4087,0.12400,0


In [5]:
scaler = MinMaxScaler()

df_scaled = df.iloc[:, :-1]  # 마지막열임을 나타내는 -1은 포함하지 않음
scaler.fit(df_scaled)
df_scaled = scaler.transform(df_scaled)

# 스케일링 변환된 값을 데이터프레임에 반영
df.iloc[:, :-1] = df_scaled[:, :]
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,...,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864,0
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878,0
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433,0
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711,0
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,...,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595,0


In [6]:
df.isnull().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64

In [7]:
order = df.corr().loc[:'worst fractal dimension', 'target'].abs().sort_values(ascending=False)
order

worst concave points       0.793566
worst perimeter            0.782914
mean concave points        0.776614
worst radius               0.776454
mean perimeter             0.742636
worst area                 0.733825
mean radius                0.730029
mean area                  0.708984
mean concavity             0.696360
worst concavity            0.659610
mean compactness           0.596534
worst compactness          0.590998
radius error               0.567134
perimeter error            0.556141
area error                 0.548236
worst texture              0.456903
worst smoothness           0.421465
worst symmetry             0.416294
mean texture               0.415185
concave points error       0.408042
mean smoothness            0.358560
mean symmetry              0.330499
worst fractal dimension    0.323872
compactness error          0.292999
concavity error            0.253730
fractal dimension error    0.077972
smoothness error           0.067016
mean fractal dimension     0

In [112]:
plot_cols = ['worst concave points', 'worst perimeter', 'mean concave points',
             'worst radius', 'mean perimeter', 'worst area', 'mean radius',
             'mean area', 'mean concavity', 'worst concavity',
             'mean compactness', 'worst compactness', 'radius error',
             'perimeter error', 'area error', 'worst texture',
             'worst smoothness', 'worst symmetry',
             'worst fractal dimension', 'target']
plot_df = df.loc[:, plot_cols]

In [113]:
# 학습데이터와 테스트데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    df[['worst concave points', 'worst perimeter', 'mean concave points',
        'worst radius', 'mean perimeter', 'worst area', 'mean radius',
        'mean area', 'mean concavity', 'worst concavity', 'mean compactness',
        'worst compactness', 'radius error', 'perimeter error', 'area error',
        'worst texture', 'worst smoothness', 'worst symmetry',
        'worst fractal dimension']],
    df.iloc[:, -1], random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(426, 19) (426,)
(143, 19) (143,)


## 다중회귀

In [124]:
poly = PolynomialFeatures(degree=1, include_bias=False)
poly.fit(X_train)
train_poly = poly.transform(X_train)
# 컬럼명 추출
print(poly.get_feature_names())
test_poly = poly.transform(X_test)

['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18']


## 선형회귀

In [125]:
# 선형 회귀 모형
lr = LinearRegression()
lr.fit(train_poly, y_train)

print("회귀계수(기울기):", np.round(lr.coef_, 1))  # np: numpy, coef_: 피처에 대한 회귀 계수 값
print("상수항(절편):", np.round(lr.intercept_, 1))  # intercept_: 상수항(절편) 값

회귀계수(기울기): [ 0.1  2.7 -1.6 -4.9 -6.6  2.4  6.1  0.2  0.7 -0.8  1.5 -0.5 -1.6 -0.3
  1.5 -0.5 -0.3 -0.3 -0.3]
상수항(절편): 1.7


## 성능평가

In [126]:
y_train_pred = lr.predict(train_poly)
y_test_pred = lr.predict(test_poly)

train_mse = mean_squared_error(y_train, y_train_pred) #훈련 데이터의 평가 점수

print("Train MSE : %.4f" % train_mse)
print('Train RMSE : %.4f' % np.sqrt(train_mse))
print('Train score : %.4f' % lr.score(train_poly, y_train))

test_mse = mean_squared_error(y_test, y_test_pred)
print("Test MSE : %.4f" % test_mse)
print('Test RMSE : %.4f' % np.sqrt(test_mse))
print('Test score : %.4f' % lr.score(test_poly, y_test))

Train MSE : 0.0556
Train RMSE : 0.2357
Train score : 0.7619
Test MSE : 0.0602
Test RMSE : 0.2453
Test score : 0.7441


In [129]:
VIF = df
X_train = df[['worst concave points', 'worst perimeter', 'mean concave points',
              'worst radius', 'mean perimeter', 'worst area', 'mean radius',
              'mean area', 'mean concavity', 'worst concavity', 'mean compactness',
              'worst compactness', 'radius error', 'perimeter error', 'area error',
              'worst texture', 'worst smoothness', 'worst symmetry',
              'worst fractal dimension']]

def feature_engineering_XbyVIF(X_train):
    vif = pd.DataFrame()
    vif['VIF_Factor'] = [variance_inflation_factor(X_train.values, i)
                         for i in range(X_train.shape[1])]
    vif['Feature'] = X_train.columns
    return vif
vif = feature_engineering_XbyVIF(X_train)
print(vif)


      VIF_Factor                  Feature
0      76.161434     worst concave points
1    1451.735230          worst perimeter
2      94.104247      mean concave points
3    2156.949284             worst radius
4   15541.549276           mean perimeter
5     521.432631               worst area
6   12877.552116              mean radius
7     606.550411                mean area
8      97.462201           mean concavity
9      71.349925          worst concavity
10    105.385593         mean compactness
11     59.334934        worst compactness
12    131.920946             radius error
13    113.389284          perimeter error
14     42.760523               area error
15      7.676011            worst texture
16     17.556424         worst smoothness
17      9.788939           worst symmetry
18     18.948110  worst fractal dimension
