<a href="https://colab.research.google.com/github/mjiii25/posco-academy/blob/main/Big-Data/PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Import Colab's Google Drive helper and mount Drive at /content/drive.
# The CSV loaded later in this notebook is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [7]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error (RMSE) of predictions.

    Computes ``mean_squared_error(y_true, y_pred)`` and takes its square root.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.

    Returns:
        The square root of the mean squared error.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Both inputs are converted to float NumPy arrays first, so plain Python
    lists and tuples are accepted (the bare ``-`` operator fails on lists) and
    values are compared position by position.

    Args:
        y_true: Observed target values (array-like of numbers).
        y_pred: Predicted target values, same shape as ``y_true``.

    Returns:
        The MAPE as a float percentage. Entries of ``y_true`` equal to zero
        divide by zero, so the result is ``inf`` or ``nan`` in that case.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100

In [9]:
# Read the breast-cancer CSV from the mounted Drive, decoding it as EUC-KR.
csv_path = '/content/drive/MyDrive/유방암.csv'
df_raw = pd.read_csv(csv_path, encoding='euc-kr')

# Print (rows, columns), then show the first rows as the cell output.
print(df_raw.shape)
df_raw.head()

(320, 31)


Unnamed: 0,diagnosis,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
0,음성,15.12,16.68,98.78,716.6,0.08876,0.09588,0.0755,0.04079,0.1594,...,17.77,20.24,117.7,989.5,0.1491,0.3331,0.3327,0.1252,0.3415,0.0974
1,양성,10.8,9.71,68.77,357.6,0.09594,0.05736,0.02531,0.01698,0.1381,...,11.6,12.02,73.66,414.0,0.1436,0.1257,0.1047,0.04603,0.209,0.07699
2,음성,18.31,18.58,118.6,1041.0,0.08588,0.08468,0.08169,0.05814,0.1621,...,21.31,26.36,139.2,1410.0,0.1234,0.2445,0.3538,0.1571,0.3206,0.06938
3,양성,14.92,14.93,96.45,686.9,0.08098,0.08549,0.05539,0.03221,0.1687,...,17.18,18.22,112.0,906.6,0.1065,0.2791,0.3151,0.1147,0.2688,0.08273
4,양성,12.89,13.12,81.89,515.9,0.06955,0.03729,0.0226,0.01171,0.1337,...,13.62,15.54,87.4,577.0,0.09616,0.1147,0.1186,0.05366,0.2309,0.06915


In [10]:
# Split the raw frame into explanatory variables (X) and the target (y).
# "Unnamed: 0" holds 0, 1, 2, ... in the preview, i.e. it looks like a row index
# that was saved into the CSV rather than a measurement, so it is dropped together
# with the target to keep it out of the correlation / scaling / PCA steps.
# errors="ignore" keeps this cell working if that column is absent
# (for example when the CSV is read with index_col=0).
df_raw_x = df_raw.drop(columns=["diagnosis", "Unnamed: 0"], errors="ignore")
df_raw_y = df_raw["diagnosis"]


df_raw_y.head()

0    음성
1    양성
2    음성
3    양성
4    양성
Name: diagnosis, dtype: object

In [None]:
# Scatter-plot matrix: one panel for every pair of explanatory variables
sns.pairplot(data=df_raw_x)

In [None]:
# Correlation matrix of the explanatory variables, rounded to 3 decimals.
# `method` accepts "pearson", "kendall" or "spearman"; "pearson" is the pandas default.
df_raw_x.corr(method="pearson").round(3)

In [None]:
# Summary statistics per variable; used to check whether the variables need rescaling
df_raw_x.describe().round(decimals=3)

In [None]:
# Remember the variable names so they can be re-attached after scaling
x_cols = df_raw_x.columns

# Standardize the explanatory variables: learn the scaling parameters, then apply
# them to the same data
scaler = StandardScaler()
scaler.fit(df_raw_x)
nd_scaled = scaler.transform(df_raw_x)

# Show the first five scaled rows (rounded) and the overall shape
scaled_preview = nd_scaled[:5].round(3)
print("Scaled Data :\n {}".format(scaled_preview))
print("Shape :\n {}".format(nd_scaled.shape))

In [None]:
# Wrap the scaled values in a DataFrame that carries the original variable names
df_scaled = pd.DataFrame(data=nd_scaled, columns=x_cols)
df_scaled.head(5).round(3)

In [None]:
# Principal component analysis: fit six components on the scaled data and
# compute the component scores in one step
pca = PCA(n_components=6)
nd_pca = pca.fit_transform(X=df_scaled)

# Preview the scores of the first five rows
score_preview = nd_pca[:5]
print("PCA Data :\n {}".format(score_preview))

# Row and column counts of the score matrix
print("Shape :\n {}".format(nd_pca.shape))

In [None]:
# Store the principal component scores as a DataFrame with columns Prin1..PrinK.
# K is taken from the score matrix itself instead of being hard-coded, so this
# cell keeps working if n_components is changed in the PCA cell.
prin_names = ["Prin{}".format(i) for i in range(1, nd_pca.shape[1] + 1)]
df_pca = pd.DataFrame(nd_pca, columns=prin_names)

# Put the scaled input variables and their component scores side by side
# (joined on the row index)
df_scaled_pca = df_scaled.join(df_pca)
df_scaled_pca.head()

In [None]:
# PCA statistics per component: component number, eigenvalue (explained variance),
# eigenvalue ratio (explained variance ratio) and its cumulative sum.
# The number of components is read from the fitted model rather than hard-coded,
# so the column lengths always agree; the frame is built in a single constructor call.
n_prin = len(pca.explained_variance_)
df_pca_stat = pd.DataFrame({
    "PrinNo": range(1, n_prin + 1),
    "EigenValues": pca.explained_variance_,
    "EigenValueRatio": pca.explained_variance_ratio_,
    "CumEigenValueRatio": np.cumsum(pca.explained_variance_ratio_),
})
df_pca_stat.round(3)

In [None]:
# Scree plot: eigenvalue against principal component number
prin_no = df_pca_stat["PrinNo"]
eigenvalues = df_pca_stat["EigenValues"]
plt.plot(prin_no, eigenvalues, label="고유값")

# Axis labels and legend
plt.xlabel("주성분 번호")
plt.ylabel("고유값(Eigenvalue)")
plt.legend(loc="best")

In [None]:
# Draw the per-component eigenvalue ratio first, then the cumulative ratio,
# both against the principal component number
ratio_lines = [
    ("EigenValueRatio", "고유값"),
    ("CumEigenValueRatio", "누적 고유값"),
]
for stat_col, legend_label in ratio_lines:
    plt.plot(df_pca_stat["PrinNo"], df_pca_stat[stat_col], label=legend_label)

# Axis labels and legend
plt.xlabel("주성분 번호")
plt.ylabel("고유값 비율")
plt.legend(loc="best")

In [None]:
# Names of the variables that went into the PCA, as a one-column DataFrame
x_col = pd.DataFrame({"Feature": df_raw_x.columns})

# Eigenvectors of the fitted components, one column per component (prin1..prinK).
# pca.components_ has one row per component, so it is transposed to get one row
# per variable. K is read from the fitted model instead of being hard-coded.
# The sign of every coefficient is flipped (* -1); per the original note this is to
# account for a difference in how the SVD is computed.
# NOTE(review): the scores stored in nd_pca / df_pca are not sign-flipped, so these
# eigenvectors have the opposite sign relative to those scores - confirm this is intended.
prin_names = ["prin{}".format(i) for i in range(1, pca.components_.shape[0] + 1)]
df_loadings = pd.DataFrame(pca.components_.T * -1, columns=prin_names)

# Concatenate the DataFrame of names (x_col), not the Index x_cols:
# pd.concat only accepts Series/DataFrame objects and raises TypeError on an Index.
df_eigenvector = pd.concat([x_col, df_loadings], axis=1)
df_eigenvector.round(3)

In [None]:
# Scatter-plot matrix of the principal component scores
sns.pairplot(data=df_pca)

In [None]:
# Correlation matrix of the principal component scores, rounded to 3 decimals
# ("pearson" is the pandas default method)
df_pca.corr(method="pearson").round(3)

In [None]:
# Multiple linear regression on the original (unscaled) variables.
# df_raw_y holds string labels, which sm.OLS cannot fit (object dtype raises
# ValueError), so the target is encoded as a 0/1 indicator: 1 where the label is
# "양성", 0 otherwise. With a binary target this is a linear probability model.
# NOTE(review): confirm which label should be coded as 1.
y_numeric = (df_raw_y == "양성").astype(int)

# Add the intercept column (const) to the original variables
df_raw_x_const = sm.add_constant(df_raw_x)

# Build and fit the regression model; reg_model ends up bound to the fitted results
reg_model = sm.OLS(y_numeric, df_raw_x_const).fit()

# Model summary
print(reg_model.summary())

In [None]:
# Reference: multiple linear regression on the scaled variables.
# df_raw_y holds string labels, which sm.OLS cannot fit (object dtype raises
# ValueError), so the target is encoded as a 0/1 indicator: 1 where the label is
# "양성", 0 otherwise. With a binary target this is a linear probability model.
# NOTE(review): confirm which label should be coded as 1.
y_numeric = (df_raw_y == "양성").astype(int)

# Add the intercept column (const) to the scaled variables
df_scaled_const = sm.add_constant(df_scaled)

# Build and fit the model; reg_model ends up bound to the fitted results
reg_model = sm.OLS(y_numeric, df_scaled_const).fit()
print(reg_model.summary())

In [None]:
# Principal component regression.
# Components chosen from the eigenvalues / scree plot.
x_var = ["Prin1", "Prin2", "Prin3"]

# df_raw_y holds string labels, which sm.OLS cannot fit (object dtype raises
# ValueError), so the target is encoded as a 0/1 indicator: 1 where the label is
# "양성", 0 otherwise. With a binary target this is a linear probability model.
# NOTE(review): confirm which label should be coded as 1.
y_numeric = (df_raw_y == "양성").astype(int)

# Add the intercept column (const) to the selected component scores
df_pca_const = sm.add_constant(df_pca[x_var])

# Build and fit the regression model; reg_model ends up bound to the fitted results
reg_model = sm.OLS(y_numeric, df_pca_const).fit()

# Model summary
print(reg_model.summary())