In [1]:
# 이항 로지스틱
# 종속변수가 0 또는 1이며 베르누이 분포를 따를 경우 사용
# 모델의 산출값은 각 데이터가 1이 될 확률이며, 이진분류를 위해서는 경계값(threshold)이 필요
# 모델 평가를 위해 각종 분류관련지표 및 AUC 활용

In [3]:
# 승산비 (Odds Ratio, OR)
# 특정 독립변수를 제외한 나머지 값을 고정하고
# 해당 독립변수가 1 증가할 때 변화하는 승산(odds)의 비

In [None]:
# statsmodels - Logit()
# 로지스틱 회귀분석을 실시하는 statsmodels의 클래스

In [4]:
import pandas as pd
import numpy as np
from statsmodels.api import Logit

In [5]:
# Load the iris data from the relative path 'iris.csv' and preview the first two rows.
df = pd.read_csv('iris.csv')
df.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [6]:
# List the distinct values found in the Species column.
df["Species"].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [7]:
# Binary target column: 1 where Species equals "setosa", 0 otherwise.
df["is_setosa"] = df["Species"].eq("setosa").astype(int)
df.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species,is_setosa
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1


In [9]:
# Fit a binary logistic regression with statsmodels.
# NOTE(review): exog holds only the first two columns of df and no constant column,
# so the fitted parameters contain no intercept term — confirm that this is intended.
model = Logit(endog=df["is_setosa"], exog=df.iloc[:,:2]).fit() # endog: dependent variable, exog: independent variables
# Dependent variable: is_setosa; independent variables: sepal length and sepal width
model

Optimization terminated successfully.
         Current function value: 0.036374
         Iterations 11


<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x2424d7c9880>

In [11]:
# Estimated coefficient for each exog column.
model.params

Sepal.Length    -7.529945
Sepal.Width     13.130734
dtype: float64

In [12]:
# p-value for each coefficient estimate.
model.pvalues

Sepal.Length    0.000828
Sepal.Width     0.000989
dtype: float64

In [13]:
# Test statistic for each coefficient estimate.
model.tvalues

Sepal.Length   -3.343109
Sepal.Width     3.293594
dtype: float64

In [15]:
# Predicted probability of is_setosa == 1 for the first three rows.
model.predict(df.iloc[:3, :2])

0    0.999477
1    0.923824
2    0.998678
dtype: float64

In [17]:
# Same prediction as the previous cell, stored in `pred` so it can be thresholded next.
pred = model.predict(df.iloc[:3, :2])
pred

0    0.999477
1    0.923824
2    0.998678
dtype: float64

In [18]:
# Turn probabilities into class labels: 1 when the probability exceeds the 0.5 cut-off.
(pred > 0.5).astype(int)

0    1
1    1
2    1
dtype: int32

In [None]:
# sklearn - LogisticRegression()
# statsmodels의 Logit보다 제공하는 기능이 더 많음

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
# Rebind `model` to a scikit-learn estimator and fit it on the same two feature
# columns and the same is_setosa target used for the statsmodels fit.
model = LogisticRegression(random_state=123)
model.fit(X = df.iloc[:, :2], y = df["is_setosa"])

LogisticRegression(random_state=123)

In [21]:
# Fitted coefficients, one per feature column.
model.coef_

array([[-3.38829757,  3.1645277 ]])

In [24]:
# Fitted intercept term. The earlier statsmodels fit has no counterpart to this value
# because no constant column was supplied to it.
model.intercept_

array([8.32330389])

In [25]:
# Predicted class probabilities for the first three rows, one column per class.
model.predict_proba(df.iloc[:3,:2])

array([[0.10727976, 0.89272024],
       [0.22895365, 0.77104635],
       [0.07413821, 0.92586179]])

In [26]:
# Keep only the second probability column, i.e. the probability of is_setosa == 1.
pred = model.predict_proba(df.iloc[:3, :2])[:, 1]
pred

array([0.89272024, 0.77104635, 0.92586179])

In [27]:
# Class labels from the probabilities, using a 0.5 cut-off.
(pred > 0.5).astype(int)

array([1, 1, 1])

In [None]:
# sklearn - roc_auc_score()
# AUC를 산출하는 함수

In [29]:
# Probability of is_setosa == 1 for every row; show the first ten values.
pred = model.predict_proba(df.iloc[:, :2])[:, 1]
pred[:10]

array([0.89272024, 0.77104635, 0.92586179, 0.92738323, 0.94126096,
       0.91436651, 0.97058885, 0.89484454, 0.93034007, 0.82210603])

In [30]:
from sklearn.metrics import roc_auc_score

In [32]:
# AUC of the predicted probabilities against the true labels.
# The score is computed on the same rows the model was fitted on.
roc_auc_score(y_true = df["is_setosa"], y_score = pred)

0.9999999999999999

In [33]:
# 1에 가까울수록 좋음.

In [36]:
from sklearn.metrics import accuracy_score

In [40]:
# Accuracy when rows with probability above 0.5 are labelled 1.
accuracy_score(
    y_true=df["is_setosa"],
    y_pred=(pred > 0.5).astype(int),
)

1.0

In [41]:
# Accuracy with a stricter 0.8 cut-off, for comparison with the 0.5 cut-off.
accuracy_score(
    y_true=df["is_setosa"],
    y_pred=(pred > 0.8).astype(int),
)

0.9466666666666667

In [42]:
# 문제 1

In [43]:
# Rebind `df` to the diabetes data loaded from the relative path 'diabetes.csv'
# (the iris frame is no longer reachable through `df`) and preview two rows.
df = pd.read_csv('diabetes.csv')
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [44]:
from sklearn.model_selection import train_test_split

In [46]:
# Split the rows into a training part (80%) and a test part; random_state is fixed
# so the split is the same on every run.
df_train , df_test = train_test_split(df, train_size=0.8, random_state=123)
df_train.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
318,3,115,66,39,140,38.1,0.15,28,0
313,3,113,50,10,85,29.5,0.626,25,0


In [48]:
# Problem 1: logit model of Outcome on four predictors, fitted on the training split.
# NOTE(review): exog contains only the four predictor columns and no constant column,
# so no intercept is estimated — confirm that this is intended.
model = Logit(
    endog=df_train["Outcome"],
    exog=df_train[["BloodPressure", "Glucose", "BMI", "Insulin"]],
).fit()

Optimization terminated successfully.
         Current function value: 0.626579
         Iterations 5


In [50]:
# Predicted probabilities for the test split, using the same four predictor columns.
pred = model.predict(exog=df_test[["BloodPressure", "Glucose", "BMI", "Insulin"]])
pred.head(4)

236    0.462956
395    0.507051
36     0.359735
210    0.314389
dtype: float64

In [51]:
# Class labels for the test rows: 1 when the predicted probability exceeds 0.5.
pred_class = (pred > 0.5).astype(int)
pred_class.head(4)

236    0
395    1
36     0
210    0
dtype: int32

In [53]:
# Accuracy of the thresholded predictions on the held-out test split.
accuracy_score(y_pred = pred_class, y_true = df_test["Outcome"])

0.7012987012987013

In [54]:
# 문제 2

In [56]:
# Problem 2: logit model of Outcome on Glucose, BMI and Age, fitted on all rows of df.
# NOTE(review): exog contains only the three predictor columns and no constant column,
# so no intercept is estimated — confirm that this is intended.
model = Logit(
    endog=df["Outcome"],
    exog=df[["Glucose", "BMI", "Age"]],
).fit()
model.params

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


Glucose    0.009368
BMI       -0.035639
Age       -0.012898
dtype: float64

In [57]:
# Exponentiate the fitted coefficients to express them as odds ratios.
np.exp(model.params)

Glucose    1.009412
BMI        0.964989
Age        0.987184
dtype: float64

In [58]:
# 회귀계수에 지수함수를 취한 값(np.exp(model.params))이 승산비(odds ratio)

In [59]:
# 문제 3

In [60]:
# Problem 3: predicted probabilities on the full data set; show the first five.
model.predict(df[["Glucose", "BMI", "Age"]]).head(5)

0    0.387961
1    0.365506
2    0.615678
3    0.392087
4    0.336654
dtype: float64

In [61]:
# AUC of the predicted probabilities, computed on the same rows used for fitting.
# NOTE(review): the recorded value (about 0.54) is close to chance level; the model was
# fitted without a constant column — worth checking whether an intercept was intended.
roc_auc_score(y_true = df['Outcome'], y_score = model.predict(df.loc[:,["Glucose", "BMI", "Age"]]))

0.5414253731343283