# Binary Logistic Regression

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [102]:
admit=pd.read_csv('admission.csv')
admit.head()

Unnamed: 0,admit,gre,gpa,rank
0,0.0,380.0,3.61,3.0
1,1.0,660.0,3.67,3.0
2,1.0,800.0,4.0,1.0
3,1.0,640.0,3.19,4.0
4,0.0,520.0,2.93,4.0


In [103]:
admit.describe()

Unnamed: 0,admit,gre,gpa,rank
count,400.0,400.0,400.0,400.0
mean,0.3175,587.7,3.3899,2.485
std,0.466087,115.516536,0.380567,0.94446
min,0.0,220.0,2.26,1.0
25%,0.0,520.0,3.13,2.0
50%,0.0,580.0,3.395,2.0
75%,1.0,660.0,3.67,3.0
max,1.0,800.0,4.0,4.0


In [104]:
lr=LogisticRegression()
x_train,x_test,y_train,y_test=train_test_split(admit.drop('admit',axis=1),admit['admit'],test_size=0.2,random_state=42)

In [105]:
lr.fit(x_train,y_train)

In [107]:
lr.score(x_train,y_train)

0.721875

In [108]:
lr.coef_,lr.intercept_

(array([[ 0.00185267,  0.78309647, -0.60238049]]), array([-3.13755105]))

# Logit 계산
$$ logit(P)=log(\frac{P}{1-P})=-3.13+ 0.00185*gre+ 0.783*gpa + -0.602*rank$$



In [109]:
lr.predict_proba(x_train[:5])

array([[0.8656795 , 0.1343205 ],
       [0.48090025, 0.51909975],
       [0.33418352, 0.66581648],
       [0.84970142, 0.15029858],
       [0.71822265, 0.28177735]])

In [110]:
print(lr.score(x_test, y_test))

0.6625


In [111]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

In [113]:
lr.fit(x_train,y_train)
lr.score(x_train,y_train)

0.725

In [114]:
lr.score(x_test,y_test)

0.675

In [22]:
from sklearn.model_selection import cross_validate
scores=cross_validate(lr,x_train,y_train)
scores

{'fit_time': array([0.02408957, 0.03468227, 0.0320003 , 0.0240407 , 0.01690435]),
 'score_time': array([0.00800776, 0.02152348, 0.        , 0.00800657, 0.        ]),
 'test_score': array([0.78125 , 0.671875, 0.734375, 0.703125, 0.65625 ])}

In [27]:
from sklearn.model_selection import GridSearchCV

params={'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'penalty': ['l2']}

gs=GridSearchCV(lr,param_grid=params)

In [28]:
gs.fit(x_train,y_train)

In [29]:
gs.best_params_

{'C': 10, 'penalty': 'l2'}

In [32]:
gs.cv_results_['mean_test_score']

array([0.6875  , 0.696875, 0.70625 , 0.709375, 0.71875 , 0.71875 ])

# Nominal Logistic Regrssion

In [65]:

from sklearn.datasets import load_iris

iris=load_iris()

In [66]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [67]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [68]:
iris_data=pd.DataFrame(columns=iris['feature_names'],data=iris.data)
iris_data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [69]:
iris_data['target']=iris.target
iris_data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [70]:
iris_dic = {key: value for key, value in enumerate(iris.target_names)}
iris_dic

{0: 'setosa', 1: 'versicolor', 2: 'virginica'}

In [71]:
iris_data['target']=iris_data['target'].map(iris_dic)
iris_data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [72]:
iris_data.to_csv('iris.csv',index=False)

In [73]:
iris=pd.read_csv('iris.csv')
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [78]:
x_train,x_test,y_train,y_test=train_test_split(iris.drop(['target'],axis=1),iris.target,test_size=0.2,stratify=iris['target'])
regr=LogisticRegression(multi_class='multinomial',max_iter=1000)

In [79]:
regr.fit(x_train,y_train)

In [83]:
print(f" train score: {regr.score(x_train,y_train)} \n test score: {regr.score(x_test,y_test)}")

 train score: 0.975 
 test score: 0.9666666666666667


# Ordinal Logistic Regression

In [186]:
diamond=pd.read_csv('diamonds.csv')
diamond.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [187]:
diamond.cut.unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [188]:
from pandas.api.types import CategoricalDtype
cat=['Fair','Good','Very Good','Ideal','Premium']
cat_type=CategoricalDtype(categories=cat)
diamond.cut=diamond.cut.astype(cat_type)

In [189]:
diamond.cut.dtype

CategoricalDtype(categories=['Fair', 'Good', 'Very Good', 'Ideal', 'Premium'], ordered=False)

In [190]:
diamond['V']=diamond['x']*diamond['y']*diamond['z']
diamond.drop(['x','y','z'],axis=1,inplace=True)
diamond.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,V
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,38.20203
1,2,0.21,Premium,E,SI1,59.8,61.0,326,34.505856
2,3,0.23,Good,E,VS1,56.9,65.0,327,38.076885
3,4,0.29,Premium,I,VS2,62.4,58.0,334,46.72458
4,5,0.31,Good,J,SI2,63.3,58.0,335,51.91725


In [178]:
from statsmodels.miscmodels.ordinal_model import OrderedModel

In [179]:
diamond=diamond[['V','price','carat','cut']]
x_train,x_test,y_train,y_test=train_test_split(diamond.drop(['cut'],axis=1),diamond['cut'],test_size=0.2,random_state=42)

In [180]:
# v,price,carat을 독립변수로 설정, distr= probit or logit
# probit의 경우 누적분포 함수 역함수
om=OrderedModel(y_train,x_train,disr='logit')



In [181]:
cut_logit=om.fit()
cut_logit.summary()

  retvals = optimize.fmin(f, start_params, args=fargs, xtol=xtol,


0,1,2,3
Dep. Variable:,cut,Log-Likelihood:,-58940.0
Model:,OrderedModel,AIC:,117900.0
Method:,Maximum Likelihood,BIC:,118000.0
Date:,"Mon, 10 Jul 2023",,
Time:,16:43:25,,
No. Observations:,43152,,
Df Residuals:,43145,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
V,0.0032,0.001,5.104,0.000,0.002,0.004
price,6.442e-05,3.36e-06,19.173,0.000,5.78e-05,7.1e-05
carat,-1.0044,0.102,-9.874,0.000,-1.204,-0.805
Fair/Good,-2.0442,0.017,-117.355,0.000,-2.078,-2.010
Good/Very Good,-0.3348,0.015,-22.495,0.000,-0.364,-0.306
Very Good/Ideal,-0.2102,0.009,-23.722,0.000,-0.228,-0.193
Ideal/Premium,0.0570,0.006,8.851,0.000,0.044,0.070


In [182]:
cut_logit.params

V                  0.003153
price              0.000064
carat             -1.004445
Fair/Good         -2.044178
Good/Very Good    -0.334812
Very Good/Ideal   -0.210167
Ideal/Premium      0.056966
dtype: float64

In [184]:
# 0부터 4까지 class에 해당하는 확률
y_pred=cut_logit.predict(x_test)
y_pred[:5]

Unnamed: 0,0,1,2,3,4
1388,0.024834,0.081303,0.22496,0.401845,0.267057
50052,0.028294,0.088531,0.234939,0.399369,0.248868
41645,0.026678,0.085208,0.230445,0.400633,0.257036
42377,0.027281,0.086459,0.232156,0.400181,0.253923
17244,0.041909,0.113508,0.264141,0.384331,0.196111


In [185]:
import numpy as np
y_pred_cls=np.argmax(y_pred,axis=1)
y_pred_cls[:10]

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int64)

In [208]:
cat=['Fair','Good','Very Good','Ideal','Premium']
cat_dic={key:value for key,value in enumerate(cat) }
y_pred_cls=pd.DataFrame(y_pred_cls)[0].map(cat_dic)
y_pred_cls.unique()

array(['Ideal', 'Premium', 'Very Good'], dtype=object)

In [210]:
from sklearn.metrics import accuracy_score

# 최악.
acc=accuracy_score(y_test,y_pred_cls)
acc

0.39562476826103077