In [6]:
import numpy as np
from sklearn.base import clone
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [None]:
# Load the breast-cancer dataset bundled with scikit-learn.
cancer_data = load_breast_cancer()

# Draw 50 columns of normally distributed noise (one row per sample) from a
# generator seeded with 42, so the noise is the same on every run.
n_samples = cancer_data.data.shape[0]
noise = np.random.RandomState(42).normal(size=(n_samples, 50))

# Append the noise columns to the right of the original feature matrix.
X_with_noise = np.concatenate((cancer_data.data, noise), axis=1)

print("원본 데이터 크기:", cancer_data.data.shape)
print("노이즈 추가된 데이터 크기:", X_with_noise.shape)


원본 데이터 크기: (569, 30)
노이즈 추가된 데이터 크기: (569, 80)


In [10]:
# Hold out 20% of the noisy dataset for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_with_noise,
    cancer_data.target,
    test_size=0.2,
    random_state=42,
)

# Build and fit the logistic regression model.
# max_iter is raised so the solver has enough iterations to converge.
log_reg_model = LogisticRegression(max_iter=10000, random_state=42)
log_reg_model.fit(X_train, y_train)

# Accuracy on the training split and on the held-out test split.
train_score = accuracy_score(y_train, log_reg_model.predict(X_train))
test_score = accuracy_score(y_test, log_reg_model.predict(X_test))

In [11]:
# Show the training accuracy followed by the test accuracy on a single line.
scores = (train_score, test_score)
print(*scores)

0.9956043956043956 0.956140350877193


In [13]:
from sklearn.feature_selection import SelectPercentile, f_classif

# Univariate selector: score every feature with f_classif and keep the top 40 percent.
select_feature = SelectPercentile(score_func=f_classif, percentile=40)

# Fit the selector on the noisy feature matrix and its labels, then keep only
# the columns it selected.
select_feature.fit(X_with_noise, cancer_data.target)
x_selected = select_feature.transform(X_with_noise)

# Display the shape of the reduced feature matrix.
x_selected.shape

(569, 32)

In [None]:
# Split the raw (unselected) features first so the test rows stay unseen while
# features are being chosen. Slicing the already-selected matrix would leak
# test labels into the selection step, because that selector was fit on every
# row. The split arguments are unchanged, so the same samples land in each split.
# NOTE(review): this split is stratified while the baseline split is not, so the
# two models are evaluated on different test rows — confirm this is intended.
X_train_raw3, X_test_raw3, y_train3, y_test3 = train_test_split(
    X_with_noise,
    cancer_data.target,
    test_size=0.2,
    random_state=42,
    stratify=cancer_data.target,
)

# Fit an unfitted copy of the selector (same score function and percentile) on
# the training rows only, then apply that one column mask to both splits.
selector3 = clone(select_feature).fit(X_train_raw3, y_train3)
x_train3 = selector3.transform(X_train_raw3)
x_test3 = selector3.transform(X_test_raw3)

logreg3 = LogisticRegression