In [None]:
import pandas as pd
import numpy as np
from statsmodels.api import Logit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [None]:
# Load the iris data set from a CSV in the working directory and preview
# the first two rows to check the column layout.
df = pd.read_csv("iris.csv")
df.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [None]:
# List the distinct class labels found in the Species column.
df["Species"].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [None]:
# Binary target for the logistic regressions below:
# 1 when the row's Species is "setosa", 0 for every other species.
df["is_setosa"] = df["Species"].eq("setosa").astype(int)
df.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species,is_setosa
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1


In [None]:
# Preview the first two columns (Sepal.Length, Sepal.Width); the models below
# select their features with the same positional slice.
df.iloc[:2, :2]

Unnamed: 0,Sepal.Length,Sepal.Width
0,5.1,3.5
1,4.9,3.0


In [None]:
# Fit a statsmodels logistic regression of the setosa indicator on the first
# two columns of df (Sepal.Length and Sepal.Width), then display the fitted
# results object. The fit is done once; .fit() prints the optimiser summary.
# NOTE(review): no constant column is passed in exog, so the model is fit
# without an intercept (the fitted params contain only the two sepal
# coefficients) — confirm this is intended.
model = Logit(endog = df["is_setosa"],
              exog = df.iloc[:, :2]).fit()
model

Optimization terminated successfully.
         Current function value: 0.036374
         Iterations 11


<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x2a1d572cf10>

In [None]:
# Fitted coefficients (log-odds scale), one per exog column.
model.params

Sepal.Length    -7.529945
Sepal.Width     13.130734
dtype: float64

In [None]:
# p-values of the fitted coefficients, one per exog column.
model.pvalues

Sepal.Length    0.000828
Sepal.Width     0.000989
dtype: float64

In [None]:
# Predicted probability of is_setosa == 1 for the first three rows, using the
# same two feature columns the model was fit on.
pred = model.predict(df.iloc[:3, :2])
pred

0    0.999477
1    0.923824
2    0.998678
dtype: float64

In [None]:
# Turn the predicted probabilities into 0/1 class labels with a 0.5 cut-off.
(pred > 0.5).astype(int)

0    1
1    1
2    1
dtype: int32

In [None]:
# Same classification task with scikit-learn: fit LogisticRegression on the
# first two columns of df against the is_setosa indicator.
# random_state is fixed at 123. Note this rebinds `model`, replacing the
# statsmodels result from the earlier cells.
model = LogisticRegression(random_state = 123)
model.fit(X = df.iloc[:, :2],
          y = df["is_setosa"])
model

LogisticRegression(random_state=123)

In [None]:
# Fitted feature coefficients of the scikit-learn model (one per input column).
model.coef_

array([[-3.38829757,  3.1645277 ]])

In [None]:
# Fitted intercept term of the scikit-learn model.
model.intercept_

array([8.32330389])

In [None]:
# predict_proba returns one column per class; keep only the second column,
# i.e. the probability of the higher label (is_setosa == 1), for the first
# three rows.
pred = model.predict_proba(df.iloc[:3, :2])[:, 1]
pred

array([0.89272024, 0.77104635, 0.92586179])

In [None]:
# Turn the class-1 probabilities into 0/1 labels with a 0.5 cut-off.
(pred > 0.5).astype(int)

array([1, 1, 1])

In [None]:
# Class-1 probabilities for every row of df (second predict_proba column);
# display only the first ten values to keep the output short.
pred = model.predict_proba(df.iloc[:, :2])[:, 1]
pred[:10]

array([0.89272024, 0.77104635, 0.92586179, 0.92738323, 0.94126096,
       0.91436651, 0.97058885, 0.89484454, 0.93034007, 0.82210603])

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC of the predicted class-1 probabilities against the true indicator.
# NOTE(review): `pred` is expected to come from the preceding cell, which
# scores the same rows the model was fit on, so this is an in-sample score.
roc_auc_score(y_true = df["is_setosa"],
              y_score = pred)

0.9999999999999999

In [None]:
# Accuracy of the labels obtained by thresholding the probabilities.
# NOTE(review): the cut-off here is 0.9, whereas the earlier cells threshold
# at 0.5 — confirm the stricter threshold is deliberate.
accuracy_score(y_true = df["is_setosa"],
               y_pred = (pred > 0.9) + 0)

0.8333333333333334

In [1]:
import numpy as np

In [2]:
# Draw a single Poisson sample using the function's default arguments.
# NOTE(review): no seed is set in the visible notebook, so the value changes
# between runs, and the result is not assigned or used by any later cell —
# this looks like leftover scratch work.
np.random.poisson()

0

## 1번

In [None]:
# Load the diabetes data set (this rebinds `df`, replacing the iris frame)
# and preview the first two rows.
df = pd.read_csv("diabetes.csv")
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the rows 80% train / 20% test; random_state is fixed so the split is
# reproducible. Preview the first two training rows.
df_train, df_test = train_test_split(df, train_size = 0.8, random_state = 123)
df_train.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
318,3,115,66,39,140,38.1,0.15,28,0
313,3,113,50,10,85,29.5,0.626,25,0


In [None]:
# Fit a statsmodels logistic regression of Outcome on four predictors using
# the training rows only.
# NOTE(review): exog holds only the four predictor columns, with no constant
# column, so the model is fit without an intercept — confirm this is intended.
model = Logit(endog = df_train["Outcome"],
              exog = df_train.loc[:, ["BloodPressure", "Glucose", "BMI", "Insulin"]]).fit()

Optimization terminated successfully.
         Current function value: 0.626579
         Iterations 5


In [None]:
# Predicted probability of Outcome == 1 for the held-out test rows, using the
# same four columns (in the same order) as in the fit. Show the first four.
pred = model.predict(exog = df_test.loc[:, ["BloodPressure", "Glucose", "BMI", "Insulin"]])
pred[:4]

236    0.462956
395    0.507051
36     0.359735
210    0.314389
dtype: float64

In [None]:
# Convert the test-set probabilities into 0/1 class labels (0.5 cut-off)
# and show the first four.
pred_class = (pred > 0.5).astype(int)
pred_class[:4]

236    0
395    1
36     0
210    0
dtype: int32

In [None]:
# Accuracy of the thresholded predictions on the held-out test rows.
accuracy_score(y_pred = pred_class, y_true = df_test["Outcome"]) 

0.7012987012987013

## 2번

In [None]:
# Reload the diabetes data so this section starts from the full, unsplit frame.
df = pd.read_csv("diabetes.csv")
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [None]:
# Fit a statsmodels logistic regression of Outcome on Glucose, BMI and Age
# using all rows, then show the fitted coefficients (log-odds scale).
# NOTE(review): no constant column is included in exog, so there is no
# intercept term; the coefficients must absorb the baseline rate — confirm
# this is intended before interpreting their signs.
model = Logit(endog = df["Outcome"],
              exog = df.loc[:, ["Glucose", "BMI", "Age"]]).fit()
model.params

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


Glucose    0.009368
BMI       -0.035639
Age       -0.012898
dtype: float64

In [None]:
# Exponentiate the log-odds coefficients to express them as odds ratios
# (multiplicative change in the odds per one-unit increase of each predictor).
np.exp(model.params)

Glucose    1.009412
BMI        0.964989
Age        0.987184
dtype: float64

## 3번

In [None]:
# Reload the diabetes data so this section starts from the full, unsplit frame.
df = pd.read_csv("diabetes.csv")
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [None]:
# Fit a statsmodels logistic regression of Outcome on Glucose, BMI and Age
# using all rows.
# NOTE(review): exog contains no constant column, so the model has no
# intercept — confirm this is intended.
model = Logit(endog = df["Outcome"],
              exog = df.loc[:, ["Glucose", "BMI", "Age"]]).fit()

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


In [None]:
# Predicted probabilities for the same rows the model was fit on; show the
# first five.
model.predict(df.loc[:, ["Glucose", "BMI", "Age"]])[:5]

0    0.387961
1    0.365506
2    0.615678
3    0.392087
4    0.336654
dtype: float64

In [None]:
# ROC AUC of the predicted probabilities against the true Outcome, computed
# on the same rows used for fitting (an in-sample score).
# Relies on roc_auc_score having been imported in an earlier cell.
roc_auc_score(y_true = df["Outcome"],
              y_score = model.predict(df.loc[:, ["Glucose", "BMI", "Age"]]))

0.5414253731343283