In [21]:
# math 
import numpy as np

# dataframes
import pandas as pd

# plotting
import matplotlib.pyplot as plt
import seaborn

# linear regression two ways
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

# for choosing covariates to include in model
from patsy import dmatrices 

# model selection
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

In [22]:
diabetes = pd.read_csv('diabetes.csv')
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [23]:
y = diabetes['Outcome']
features = diabetes.columns[:-1]
X = diabetes[features]
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [24]:
model = sm.Logit(y,X).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.608498
         Iterations 5


0,1,2,3
Dep. Variable:,Outcome,No. Observations:,768.0
Model:,Logit,Df Residuals:,760.0
Method:,MLE,Df Model:,7.0
Date:,"Wed, 15 Apr 2020",Pseudo R-squ.:,0.05922
Time:,20:28:23,Log-Likelihood:,-467.33
converged:,True,LL-Null:,-496.74
,,LLR p-value:,2.583e-10

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Pregnancies,0.1284,0.029,4.484,0.000,0.072,0.185
Glucose,0.0129,0.003,4.757,0.000,0.008,0.018
BloodPressure,-0.0303,0.005,-6.481,0.000,-0.039,-0.021
SkinThickness,0.0002,0.006,0.032,0.974,-0.012,0.012
Insulin,0.0007,0.001,0.942,0.346,-0.001,0.002
BMI,-0.0048,0.011,-0.449,0.653,-0.026,0.016
DiabetesPedigreeFunction,0.3203,0.240,1.335,0.182,-0.150,0.790
Age,-0.0156,0.008,-1.852,0.064,-0.032,0.001


## model selection on training set, then model fit on test set

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [46]:
def minAIC(X,y):
    variables = X.columns
    model = sm.Logit(y_train,X_train[variables]).fit()
    while True:
        maxp = np.max(model.pvalues)
        variables = variables[model.pvalues < maxp]
        newmodel = sm.Logit(y_train,X_train[variables]).fit()
        if newmodel.aic < model.aic:
            model = newmodel
        else:
            break
    return model,variables

In [47]:
# select on training set, fit on test set 
model,variables = minAIC(X_train, y_train)
model = sm.Logit(y_test,X_test[variables]).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.625281
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.625281
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.625287
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.625404
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.627663
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.630760
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.592294
         Iterations 5


0,1,2,3
Dep. Variable:,Outcome,No. Observations:,384.0
Model:,Logit,Df Residuals:,381.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 15 Apr 2020",Pseudo R-squ.:,0.07713
Time:,20:53:22,Log-Likelihood:,-227.44
converged:,True,LL-Null:,-246.45
,,LLR p-value:,5.561e-09

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Pregnancies,0.1205,0.035,3.445,0.001,0.052,0.189
Glucose,0.0131,0.003,4.319,0.000,0.007,0.019
BloodPressure,-0.0378,0.006,-6.714,0.000,-0.049,-0.027


In [82]:
ingot = pd.read_csv('ingots.csv')
# NR means not ready
ingot = ingot.head(15)

In [79]:
ingot['failure'] = ingot['NR']
ingot['success'] = ingot['n'] - ingot['NR']

In [80]:
y = ingot[['success', 'failure']]
X = ingot[['heat', 'soak']]

In [81]:
model = sm.GLM(y, X, family=sm.families.Binomial(), intercept=True).fit()
model.summary()

0,1,2,3
Dep. Variable:,"['success', 'failure']",No. Observations:,15
Model:,GLM,Df Residuals:,13
Model Family:,Binomial,Df Model:,1
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-23.336
Date:,"Thu, 16 Apr 2020",Deviance:,34.281
Time:,09:49:19,Pearson chi2:,45.8
No. Iterations:,7,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
heat,0.0017,0.019,0.092,0.927,-0.035,0.038
soak,2.1039,0.371,5.665,0.000,1.376,2.832
