# 로지스틱 회귀 모델
### 분류 문제를 해결하기 위한 가장 기본적인 머신러닝 해결법
- 이산적인 값을 갖는 목적 변수에 대해서 feature와 목적 변수 사이의 관계를 찾아냄

### 선형 회귀: (x, y)에 대해서 가설 함수를 선형 함수 h(x)로 모델링 -> 실제 y와 h(x) 사이의 오차 제곱합이 최소가 되도록 하는 방법
### 로지스틱 회귀: 목적 변수가 A라면 1, B라면 0으로 고려 -> A, B가 될 확률(p, 1-p)을 각각 구하고, p의 로그 오즈(logit)와 x 사이의 관계를 선형 함수를 통해서 얻어냄. 임의의 x에 대해서 p가 0.5보다 크면 A로 분류, 작으면 B로 분류

암기할 필요는 없음. 다만 출력이 실수값이 아니라는 점 등은 알아야 함

In [1]:
import numpy as np
import matplotlib.pyplot as plt

## LogisticRegression 클래스 사용하기

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
# Load the breast-cancer dataset as a (features, target) pair of pandas objects.
X, y = load_breast_cancer(
    as_frame=True,
    return_X_y=True,
)

In [7]:
# Hold out 33% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1234,
    test_size=0.33,
)

# Display the training features as the cell output.
X_train

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
470,9.667,18.49,61.49,289.1,0.08946,0.06258,0.02948,0.01514,0.2238,0.06413,...,11.14,25.62,70.88,385.2,0.1234,0.1542,0.12770,0.06560,0.3174,0.08524
484,15.730,11.28,102.80,747.2,0.10430,0.12990,0.11910,0.06211,0.1784,0.06259,...,17.01,14.20,112.50,854.3,0.1541,0.2979,0.40040,0.14520,0.2557,0.08181
509,15.460,23.95,103.80,731.3,0.11830,0.18700,0.20300,0.08520,0.1807,0.07083,...,17.11,36.33,117.70,909.4,0.1732,0.4967,0.59110,0.21630,0.3013,0.10670
271,11.290,13.04,72.23,388.0,0.09834,0.07608,0.03265,0.02755,0.1769,0.06270,...,12.32,16.18,78.27,457.5,0.1358,0.1507,0.12750,0.08750,0.2733,0.08022
384,13.280,13.72,85.79,541.8,0.08363,0.08575,0.05077,0.02864,0.1617,0.05594,...,14.24,17.37,96.59,623.7,0.1166,0.2685,0.28660,0.09173,0.2736,0.07320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,13.850,15.18,88.99,587.4,0.09516,0.07688,0.04479,0.03711,0.2110,0.05853,...,14.98,21.74,98.37,670.0,0.1185,0.1724,0.14560,0.09993,0.2955,0.06912
372,21.370,15.10,141.30,1386.0,0.10010,0.15150,0.19320,0.12550,0.1973,0.06183,...,22.69,21.84,152.10,1535.0,0.1192,0.2840,0.40240,0.19660,0.2730,0.08666
204,12.470,18.60,81.09,481.9,0.09965,0.10580,0.08005,0.03821,0.1925,0.06373,...,14.97,24.64,96.05,677.9,0.1426,0.2378,0.26710,0.10150,0.3014,0.08750
53,18.220,18.70,120.30,1033.0,0.11480,0.14850,0.17720,0.10600,0.2092,0.06310,...,20.60,24.13,135.10,1321.0,0.1280,0.2297,0.26230,0.13250,0.3021,0.07987


In [8]:
# Keep only the first three feature columns in both splits.
X_train, X_test = (part.iloc[:, :3] for part in (X_train, X_test))

In [9]:
# Logistic regression fitted with the L-BFGS solver.
clf = LogisticRegression(
    solver='lbfgs',
    C=100,
    max_iter=100,
    random_state=1234,
)
clf.fit(X_train, y_train)  # fit() returns the estimator itself, so no rebinding is needed

# Predicted labels for the training and test splits.
y_train_pred = clf.predict(X_train)
y_pred = clf.predict(X_test)

# Accuracy = number of matching predictions / number of samples, as a percentage.
train_hits = (y_train == y_train_pred).sum()
test_hits = (y_test == y_pred).sum()
print(f'학습 데이터셋 정확도:{train_hits / len(y_train) * 100: .2f}%')
print(f'테스트 데이터셋 정확도:{test_hits / len(y_test) * 100: .2f}%')

학습 데이터셋 정확도: 93.18%
테스트 데이터셋 정확도: 87.23%


In [15]:
# Inspect the fitted model: coefficients, intercept, number of input
# features, and the number of iterations the solver ran.
(
    clf.coef_,
    clf.intercept_,
    clf.n_features_in_,
    clf.n_iter_,
)

(array([[ 8.63354352, -0.27382884, -1.47585863]]),
 array([19.85290223]),
 3,
 array([45], dtype=int32))

In [17]:
# sag: Stochastic Average Gradient solver (a stochastic gradient-descent variant).
# NOTE(review): the features are not standardised in this cell; scikit-learn
# documents that sag only converges quickly when features share a similar
# scale, which may explain the lower accuracy than the lbfgs run -- confirm
# by scaling the features first or raising max_iter.
clf = LogisticRegression(
    solver='sag',
    C=100,
    max_iter=100,
    random_state=1234,
)
clf.fit(X_train, y_train)  # fit() returns the estimator itself, so no rebinding is needed

# Predicted labels for the training and test splits.
y_train_pred = clf.predict(X_train)
y_pred = clf.predict(X_test)

# Accuracy = number of matching predictions / number of samples, as a percentage.
train_hits = (y_train == y_train_pred).sum()
test_hits = (y_test == y_pred).sum()
print(f'학습 데이터셋 정확도:{train_hits / len(y_train) * 100: .2f}%')
print(f'테스트 데이터셋 정확도:{test_hits / len(y_test) * 100: .2f}%')

학습 데이터셋 정확도: 70.34%
테스트 데이터셋 정확도: 72.87%




#### tips 예시

In [13]:
import pandas as pd

In [14]:
# Read the tips dataset from the working directory and preview its first five rows.
tips = pd.read_csv('tips.csv')
tips.head(n=5)

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


#### total bill과 tip을 통해 smoker를 분류하는 로지스틱 회귀 모델 학습 및 성능 확인

In [None]:
# Features: the first two columns (total bill, tip); target: the third column (smoker).
X_train, X_test, y_train, y_test = train_test_split(
    tips.iloc[:, :2],
    tips.iloc[:, 2],
    random_state=1234,
    test_size=0.33,
)

In [19]:
from sklearn.preprocessing import StandardScaler

# Standardise the features with the aim of improving classification accuracy.
scaler = StandardScaler()

# fit_transform computes the training mean / standard deviation and applies
# the scaling to X_train in a single step.
X_train_scaled = scaler.fit_transform(X_train)
# The test split is scaled with the statistics learned from the training split.
X_test_scaled = scaler.transform(X_test)

In [20]:
# Logistic regression using the sag solver with a generous iteration budget.
clf = LogisticRegression(
    solver='sag',
    C=1000,
    max_iter=10000,
    random_state=1234,
)

In [21]:
# Fit on the standardised training features; fit() returns the estimator
# itself, so rebinding clf is unnecessary.
clf.fit(X_train_scaled, y_train)

# Predicted labels for the training and test splits.
y_train_pred = clf.predict(X_train_scaled)
y_pred = clf.predict(X_test_scaled)

# Accuracy = number of matching predictions / number of samples, as a percentage.
train_hits = (y_train == y_train_pred).sum()
test_hits = (y_test == y_pred).sum()
print(f'학습 데이터셋 정확도:{train_hits / len(y_train) * 100: .2f}%')
print(f'테스트 데이터셋 정확도:{test_hits / len(y_test) * 100: .2f}%')

학습 데이터셋 정확도: 61.96%
테스트 데이터셋 정확도: 61.73%


In [22]:
# Inspect the fitted model: coefficients, intercept, number of input
# features, and the number of iterations the solver ran.
(
    clf.coef_,
    clf.intercept_,
    clf.n_features_in_,
    clf.n_iter_,
)

(array([[0.05740051, 0.01046315]]),
 array([-0.4884633]),
 2,
 array([34], dtype=int32))

#### total bill과 tip을 통해 day를 분류하는 로지스틱 회귀 모델 학습 및 성능 확인(Thur vs Sun)

In [25]:
# Keep only the rows whose day is Sunday or Thursday, then display them.
tips_selected = tips.loc[tips.day.isin(["Sun", "Thur"])]
tips_selected

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.50,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4
...,...,...,...,...,...,...
202,13.00,2.00,Yes,Thur,Lunch,2
203,16.40,2.50,Yes,Thur,Lunch,2
204,20.53,4.00,Yes,Thur,Lunch,4
205,16.47,3.23,Yes,Thur,Lunch,3


In [24]:
# Features: the first two columns; target: the fourth column (day).
X_train, X_test, y_train, y_test = train_test_split(
    tips_selected.iloc[:, :2],
    tips_selected.iloc[:, 3],
    random_state=1234,
    test_size=0.33,
)

In [26]:
from sklearn.preprocessing import StandardScaler

# Standardise the features; statistics are learned from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit + transform in one call
X_test_scaled = scaler.transform(X_test)  # reuse the training statistics

In [27]:
# Logistic regression using the sag solver with a generous iteration budget.
clf = LogisticRegression(
    solver='sag',
    C=1000,
    max_iter=10000,
    random_state=1234,
)

In [28]:
# Fit on the standardised training features; fit() returns the estimator
# itself, so rebinding clf is unnecessary.
clf.fit(X_train_scaled, y_train)

# Predicted labels for the training and test splits.
y_train_pred = clf.predict(X_train_scaled)
y_pred = clf.predict(X_test_scaled)

# Accuracy = number of matching predictions / number of samples, as a percentage.
train_hits = (y_train == y_train_pred).sum()
test_hits = (y_test == y_pred).sum()
print(f'학습 데이터셋 정확도:{train_hits / len(y_train) * 100: .2f}%')
print(f'테스트 데이터셋 정확도:{test_hits / len(y_test) * 100: .2f}%')

학습 데이터셋 정확도: 67.39%
테스트 데이터셋 정확도: 60.87%


#### iris dataset에서 target 중 0과 1인 데이터에 대해서만 로지스틱 회귀 분류를 통해 정확도 확인

In [60]:
from sklearn.datasets import load_iris

# Load iris as pandas objects, then keep only the samples labelled 0 or 1
# so the task becomes a binary classification problem.
X, y = load_iris(as_frame=True, return_X_y=True)
binary_rows = y.isin([0, 1])
X, y = X.loc[binary_rows, :], y.loc[binary_rows]

In [61]:
# Hold out 33% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1234,
    test_size=0.33,
)

In [68]:
# Standardise the features; statistics are learned from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit + scale X_train in one call
X_test_scaled = scaler.transform(X_test)  # reuse the training statistics

In [69]:
# Logistic regression fitted with L-BFGS on the standardised features.
clf = LogisticRegression(
    solver='lbfgs',
    C=1000,
    max_iter=10000,
    random_state=1234,
)
clf.fit(X_train_scaled, y_train)

# Predicted labels for the training and test splits.
y_train_pred = clf.predict(X_train_scaled)
y_pred = clf.predict(X_test_scaled)

# Accuracy = number of matching predictions / number of samples, as a percentage.
train_hits = (y_train == y_train_pred).sum()
test_hits = (y_test == y_pred).sum()
print(f'학습 데이터셋 정확도:{train_hits / len(y_train) * 100: .2f}%')
print(f'테스트 데이터셋 정확도:{test_hits / len(y_test) * 100: .2f}%')

학습 데이터셋 정확도: 100.00%
테스트 데이터셋 정확도: 100.00%


#### digits dataset에서 0과 7인 데이터에 대해서만 로지스틱 회귀 분류를 통해 정확도 확인

In [73]:
from sklearn.datasets import load_digits

# Load digits as pandas objects, then keep only the samples labelled 0 or 7
# so the task becomes a binary classification problem.
X, y = load_digits(as_frame=True, return_X_y=True)
zero_or_seven = y.isin([0, 7])
X, y = X.loc[zero_or_seven], y.loc[zero_or_seven]

In [76]:
# Hold out 33% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1234,
    test_size=0.33,
)

In [77]:
# Standardise the features; statistics are learned from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit + scale X_train in one call
X_test_scaled = scaler.transform(X_test)  # reuse the training statistics

In [78]:
# Logistic regression fitted with L-BFGS on the standardised features.
clf = LogisticRegression(
    solver='lbfgs',
    C=1000,
    max_iter=10000,
    random_state=1234,
)
clf.fit(X_train_scaled, y_train)

# Predicted labels for the training and test splits.
y_train_pred = clf.predict(X_train_scaled)
y_pred = clf.predict(X_test_scaled)

# Accuracy = number of matching predictions / number of samples, as a percentage.
train_hits = (y_train == y_train_pred).sum()
test_hits = (y_test == y_pred).sum()
print(f'학습 데이터셋 정확도:{train_hits / len(y_train) * 100: .2f}%')
print(f'테스트 데이터셋 정확도:{test_hits / len(y_test) * 100: .2f}%')

학습 데이터셋 정확도: 100.00%
테스트 데이터셋 정확도: 100.00%
