In [1]:
import numpy as np 
import pandas as pd 
import statsmodels.api as sm

In [196]:
df = sm.datasets.fair.load_pandas().data

In [198]:
df.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666


In [200]:
def affair_check(x):
    if x != 0:
        return 1
    else:
        return 0
df['had_affair'] = df['affairs'].apply(affair_check)


In [202]:
df.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs,had_affair
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111,1
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769,1
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4,1
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273,1
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666,1


In [204]:
df['had_affair'].value_counts()

had_affair
0    4313
1    2053
Name: count, dtype: int64

In [206]:
df.groupby('had_affair').mean()

Unnamed: 0_level_0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
had_affair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,4.329701,28.390679,7.989335,1.238813,2.504521,14.322977,3.405286,3.833758,0.0
1,3.647345,30.537019,11.15246,1.728933,2.261568,13.972236,3.463712,3.884559,2.187243


In [209]:
# Create column names for the new DataFrames
occ_dummy = pd.get_dummies(df['occupation']).astype(int)
hus_dummy = pd.get_dummies(df['occupation_husb']).astype(int)

In [211]:
# Set X as new DataFrame without the occupation columns or the Y target
X = df.drop(columns=['occupation', 'occupation_husb', 'affairs'], axis=1)

In [213]:
# Concat the dummy DataFrames Together
occ_dummy.columns = ['occ1','occ2','occ3','occ4','occ5', 'occ6']
hus_dummy.columns = ['hocc1','hocc2','hocc3','hocc4','hocc5','hocc6']
dummies = pd.concat([occ_dummy, hus_dummy], axis=1)

In [216]:
# Now Concat the X DataFrame with the dummy variables
X = pd.concat([X, dummies],axis=1)
X.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,had_affair,occ1,occ2,occ3,occ4,occ5,occ6,hocc1,hocc2,hocc3,hocc4,hocc5,hocc6
0,3.0,32.0,9.0,3.0,3.0,17.0,1,0,1,0,0,0,0,0,0,0,0,1,0
1,3.0,27.0,13.0,3.0,1.0,14.0,1,0,0,1,0,0,0,0,0,0,1,0,0
2,4.0,22.0,2.5,0.0,1.0,16.0,1,0,0,1,0,0,0,0,0,0,0,1,0
3,4.0,37.0,16.5,4.0,3.0,16.0,1,0,0,0,0,1,0,0,0,0,0,1,0
4,5.0,27.0,9.0,1.0,1.0,14.0,1,0,0,1,0,0,0,0,0,0,1,0,0


In [218]:
# Set Y as Target class, Had Affair
y = X['had_affair']

In [220]:
# Dropping one column of each dummy variable set to avoid multicollinearity
X.drop('occ1', axis=1, inplace=True)
X.drop('hocc1', axis=1, inplace=True)
# Drop affairs column so Y target makes sense
X.drop('had_affair', axis=1, inplace=True)


In [6]:
# This adds a column of 1's to the dataframe. 
# The model will not run without, but if 
# it could every model would try to pass through the origin


In [222]:
logit = sm.Logit(y, sm.add_constant(X))

In [224]:
#result of preliminary run
result.summary()

0,1,2,3
Dep. Variable:,had_affair,No. Observations:,6366.0
Model:,Logit,Df Residuals:,6360.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 25 Feb 2025",Pseudo R-squ.:,0.1062
Time:,21:16:56,Log-Likelihood:,-3577.5
converged:,True,LL-Null:,-4002.5
Covariance Type:,nonrobust,LLR p-value:,1.764e-181

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
rate_marriage,-0.4990,0.027,-18.767,0.000,-0.551,-0.447
age,0.0523,0.006,9.395,0.000,0.041,0.063
yrs_married,0.0219,0.007,3.364,0.001,0.009,0.035
religious,-0.2494,0.033,-7.603,0.000,-0.314,-0.185
occ5,0.4622,0.085,5.416,0.000,0.295,0.630
occ6,0.3145,0.214,1.469,0.142,-0.105,0.734


In [228]:
X.drop(['children', 'educ', 'occ2', 'occ3', 'occ4','occ6', 
        'hocc2','hocc3','hocc4','hocc5','hocc6'], axis=1, inplace=True)

In [230]:
logit = sm.Logit(y, X)
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.562140
         Iterations 5


In [232]:
preds = result.pred_table()
preds

array([[3982.,  331.],
       [1522.,  531.]])

In [234]:
confusion = pd.DataFrame(preds, index=['Did Not Affair', 'Had Affair'], columns=['predicted not', 'predicted affair'])
confusion

Unnamed: 0,predicted not,predicted affair
Did Not Affair,3982.0,331.0
Had Affair,1522.0,531.0


In [236]:
TP = preds[1,1]
FP = preds[0,1]
TN = preds[0,0]
FN = preds[1,0]

In [241]:
TPR=(float(TP) / (TP + FN))
TPN=(float(TN) / (TN + FP)) 
PPV=(float(TP) / (TP + FP)) 
NPV=(float(TN) / (TN + FN)) 
FNR=(float(FN) / (FN + TP))
FPR=(float(FP) / (FP + TN))
FDR=(float(FP) / (FP + TP))
FOR=(float(FN) / (FN + TN))
TS=(float(TP) / (TP+FN + FP))
ACC=(float(TP+TN) / (TP+FP+FN + TN))  #print((TP + TN) / float(len(y_test)))

print (f"sensitivity, recall, hit rate, or true positive rate (TPR): {TPR:.3f} (# positives correctly identified)")
print (f"specificity, selectivity or true negative rate (TNR): {TPN:.3f}")
print (f"precision or positive predictive value (PPV): {PPV:.3f} (rate of correct positive predictions)")
print (f"negative predictive value (NPV): {NPV:.3f}")
print (f"miss rate or false negative rate (FNR): {FNR:.3f}")
print (f"fall-out or false positive rate (FPR): {FPR:.3f}")
print (f"false discovery rate (FDR): {FDR:.3f}")
print (f"false omission rate (FOR): {FOR:.3f}")
print("")
print (f"accuracy (ACC): {ACC:.3f} (really only useful if classes are equally represented)")


sensitivity, recall, hit rate, or true positive rate (TPR): 0.259 (# positives correctly identified)
specificity, selectivity or true negative rate (TNR): 0.923
precision or positive predictive value (PPV): 0.616 (rate of correct positive predictions)
negative predictive value (NPV): 0.723
miss rate or false negative rate (FNR): 0.741
fall-out or false positive rate (FPR): 0.077
false discovery rate (FDR): 0.384
false omission rate (FOR): 0.277

accuracy (ACC): 0.709 (really only useful if classes are equally represented)
