# 로지스틱 회귀(Logistic Regression)

In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import mglearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# from sklearn.datasets import load_breast_cancer, load_boston, load_diabetes
import warnings
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
# from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.special import expit

# Suppress FutureWarning messages so they do not clutter the notebook output.
warnings.simplefilter('ignore', FutureWarning)

%matplotlib inline

# Use the D2coding font so Korean text in plots is not rendered as broken glyphs.
mpl.rc('font', family='D2coding')

## 데이터 로딩

In [2]:
# Load the fish dataset from a local CSV file.
# NOTE(review): absolute, machine-specific Windows path; adjust it when running elsewhere.
fish = pd.read_csv('C:/k_digital/source/data/fish.csv')
# Preview the first rows to check which columns were read.
fish.head()

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [5]:
# List the distinct values of the Species column (the classification labels).
fish['Species'].unique()

array(['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt'],
      dtype=object)

In [6]:
# Feature matrix: the five numeric measurement columns as a NumPy array.
fish_input = fish.loc[
    :, ['Weight', 'Length', 'Diagonal', 'Height', 'Width']
].to_numpy()

In [8]:
# Target vector: the species label of every row as a NumPy array.
fish_target = fish.loc[:, 'Species'].to_numpy()

## 데이터 전처리

### 데이터 분리

In [9]:
# Split features and labels into train and test sets. No test_size is given,
# so scikit-learn's default split ratio applies; random_state=42 makes the
# shuffle reproducible.
train_input, test_input, train_target, test_target = train_test_split(
    fish_input, fish_target, random_state=42)

### 피처 스케일링

In [12]:
# Standardize the features. The scaler is fitted on the training data only and
# the same fitted scaler is then applied to the test data.
ss = StandardScaler()
train_scaled = ss.fit_transform(train_input)
test_scaled = ss.transform(test_input)

## k-최근접 이웃 분류기를 이용한 확률 예측

In [22]:
# Fit a 3-nearest-neighbours classifier on the standardized training data.
kn = KNeighborsClassifier(n_neighbors=3).fit(train_scaled, train_target)

# Accuracy on the training set, then on the held-out test set.
print(accuracy_score(train_target, kn.predict(train_scaled)))
print(accuracy_score(test_target, kn.predict(test_scaled)))

0.8907563025210085
0.85


In [23]:
# Class labels known to the fitted classifier; the columns returned by
# predict_proba() follow this order.
kn.classes_

array(['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish'],
      dtype=object)

In [24]:
# Predicted species for the first five test samples.
kn.predict(test_scaled[:5])

array(['Perch', 'Smelt', 'Pike', 'Perch', 'Perch'], dtype=object)

In [25]:
# predict_proba() returns one probability per class for each sample.
proba = kn.predict_proba(test_scaled[:5])
# Round to four decimal places for display.
proba.round(4)

array([[0.    , 0.    , 1.    , 0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.    , 0.    , 1.    , 0.    ],
       [0.    , 0.    , 0.    , 1.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.6667, 0.    , 0.3333, 0.    , 0.    ],
       [0.    , 0.    , 0.6667, 0.    , 0.3333, 0.    , 0.    ]])

In [26]:
# Find the nearest training neighbours of the 4th test sample; the 3:4 slice
# keeps the input two-dimensional.
distances, indexs = kn.kneighbors(test_scaled[3:4])
# Labels of those neighbours: their vote shares are the probabilities that
# predict_proba() reports for this sample.
np.take(train_target, indexs)

array([['Roach', 'Perch', 'Perch']], dtype=object)

## 로지스틱 회귀
- 대표적인 분류 알고리즘
- 인공신경망의 기본이 되는 알고리즘
- 선형회귀와 비슷하게 선형 방정식을 학습하는 알고리즘

### 로지스틱 회귀(이진 분류)

In [34]:
# Binary classification: keep only the Bream and Smelt training samples.
# `indexes` is a boolean mask over the training rows.
indexes = np.isin(train_target, ['Bream', 'Smelt'])
train, target = train_scaled[indexes], train_target[indexes]

In [40]:
# Fit a logistic regression model with default settings on the two-class
# (Bream / Smelt) subset.
lr = LogisticRegression()
lr.fit(train, target)

LogisticRegression()

In [41]:
# Predicted class for the first five samples of the two-class subset.
lr.predict(train[:5])

array(['Bream', 'Smelt', 'Bream', 'Bream', 'Bream'], dtype=object)

In [42]:
# Per-class probabilities for the same five samples; each row has one column
# per class (two here) and the columns follow lr.classes_.
lr.predict_proba(train[:5])

array([[0.99759855, 0.00240145],
       [0.02735183, 0.97264817],
       [0.99486072, 0.00513928],
       [0.98584202, 0.01415798],
       [0.99767269, 0.00232731]])

In [43]:
# Learned coefficients (one per input feature) and the intercept of the
# fitted linear equation.
print(lr.coef_, lr.intercept_)

[[-0.4037798  -0.57620209 -0.66280298 -1.01290277 -0.73168947]] [-2.16155132]


In [45]:
# Raw decision values (the output of the linear equation) for the first five
# samples, before they are converted to probabilities.
decisions = lr.decision_function(train[:5])
decisions

array([-6.02927744,  3.57123907, -5.26568906, -4.24321775, -6.0607117 ])

In [48]:
# Pass the decision values through the sigmoid function (expit) to obtain
# probabilities; the results match the second column of predict_proba().
expit(decisions)

array([0.00240145, 0.97264817, 0.00513928, 0.01415798, 0.00232731])

### 로지스틱 회귀(다중분류)

In [49]:
# Multiclass logistic regression on all seven species. C=20 and max_iter=1000
# replace scikit-learn's defaults for regularization and the iteration cap.
lr = LogisticRegression(C=20, max_iter=1000).fit(train_scaled, train_target)

# Accuracy on the training set, then on the held-out test set.
print(lr.score(train_scaled, train_target))
print(lr.score(test_scaled, test_target))

0.9327731092436975
0.925


In [50]:
# Predicted species for the first five test samples (multiclass model).
lr.predict(test_scaled[:5])

array(['Perch', 'Smelt', 'Pike', 'Roach', 'Perch'], dtype=object)

In [51]:
# Per-class probabilities for the first five test samples, rounded to two
# decimal places for display.
proba = lr.predict_proba(test_scaled[:5])
proba.round(2)

array([[0.  , 0.01, 0.84, 0.  , 0.14, 0.01, 0.  ],
       [0.  , 0.  , 0.04, 0.  , 0.01, 0.95, 0.  ],
       [0.  , 0.  , 0.03, 0.93, 0.02, 0.02, 0.  ],
       [0.01, 0.03, 0.31, 0.01, 0.57, 0.  , 0.08],
       [0.  , 0.  , 0.9 , 0.  , 0.09, 0.  , 0.  ]])

In [52]:
# Class labels of the multiclass model; the predict_proba() columns follow
# this order.
lr.classes_

array(['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish'],
      dtype=object)