# Logistic Regression with statsmodels

In [1]:
import pandas as pd
import numpy as np
import statsmodels.discrete.discrete_model as sm

In [20]:
# Load the penguin measurements and fill gaps in the numeric columns with the
# column mean. numeric_only=True is required: the frame also holds text columns
# (species, island, sex), and pandas >= 2.0 raises a TypeError when asked to
# average those.
df = pd.read_csv('penguins.csv')
df = df.fillna(df.mean(numeric_only=True))

# Pure-noise feature, kept as a baseline to compare the real predictors against.
# Seeded so the notebook reproduces on Restart & Run All, and sized from the
# frame itself rather than a hard-coded row count.
rng = np.random.default_rng(42)
df['random'] = rng.random(size=len(df))

# Constant column: statsmodels does not add an intercept term on its own.
df['intercept'] = 1

df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,random,intercept
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,0.971723,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,0.529477,1
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,0.928401,1
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,,0.691416,1
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,0.138673,1


In [21]:
# Work on a 100-row subsample. A fixed random_state keeps the fitted
# coefficients stable between runs; without it every execution of the notebook
# fits a different subset and the printed results stop matching each other.
df = df.sample(n=100, random_state=42)

In [22]:
# Design matrix: two body measurements, the noise column, and the constant term.
X = df.loc[:, [
    'bill_length_mm',
    'flipper_length_mm',
    'random',
    'intercept',
]]

In [23]:
# Binary target: 1 when the penguin is an Adelie, 0 for any other species.
y = df['species'].eq('Adelie').astype(int)

In [24]:
# Build the model first: statsmodels takes the data in the constructor
# (target as `endog`, predictors as `exog`).
logit = sm.Logit(endog=y, exog=X)

# Fitting is the step that corresponds to scikit's .fit(); it returns a
# separate results object instead of updating the model in place.
f = logit.fit()

Optimization terminated successfully.
         Current function value: 0.109668
         Iterations 10


In [25]:
# All fitted coefficients in a single Series. The intercept shows up here as
# well, because it was supplied as an ordinary column of X.
print(f.params)  # in scikit-learn: model.coef_ and model.intercept_

bill_length_mm       -1.081021
flipper_length_mm    -0.098822
random                1.415279
intercept            64.881148
dtype: float64


In [14]:
# Full regression table: per-coefficient estimate, standard error, z-score,
# p-value and 95% confidence interval, plus overall fit statistics.
print(f.summary())

                           Logit Regression Results                           
Dep. Variable:                species   No. Observations:                  100
Model:                          Logit   Df Residuals:                       96
Method:                           MLE   Df Model:                            3
Date:                Fri, 22 Jan 2021   Pseudo R-squ.:                  0.8061
Time:                        11:49:29   Log-Likelihood:                -13.344
converged:                       True   LL-Null:                       -68.814
Covariance Type:            nonrobust   LLR p-value:                 6.890e-24
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
bill_length_mm       -0.7492      0.206     -3.640      0.000      -1.153      -0.346
flipper_length_mm    -0.1584      0.065     -2.438      0.015      -0.286      -0.031
random                1.

In [None]:
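# scikit-learn: a1 * x1 + a2 * x2 + a3 * x3 + b   (the intercept b is fitted automatically)

# statsmodels : a1 * x1 + a2 * x2 + a3 * x3       (no implicit b -- hence the constant 'intercept' column in X)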