In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [2]:
# --- Data -------------------------------------------------------------------
# Pull the "fair" dataset that ships with statsmodels as a pandas DataFrame.
dta = sm.datasets.fair.load_pandas().data

# Binary target: 1 when the reported `affairs` value is positive, otherwise 0.
dta['affair'] = dta['affairs'].gt(0).astype(int)

# --- Design matrices --------------------------------------------------------
# patsy expands the formula into a target frame and a feature frame; the
# C(...) terms are treated as categorical and expanded into indicator columns.
y, X = dmatrices('affair ~ rate_marriage + age + yrs_married + children + \
                 religious + educ + C(occupation) + C(occupation_husb)',
                 data=dta, return_type="dataframe")

# Shorter labels for the patsy-generated indicator columns, kept as two maps
# (one per categorical term) and merged for a single rename call.
occ_names = {
    'C(occupation)[T.2.0]': 'occ_2',
    'C(occupation)[T.3.0]': 'occ_3',
    'C(occupation)[T.4.0]': 'occ_4',
    'C(occupation)[T.5.0]': 'occ_5',
    'C(occupation)[T.6.0]': 'occ_6',
}
occ_husb_names = {
    'C(occupation_husb)[T.2.0]': 'occ_husb_2',
    'C(occupation_husb)[T.3.0]': 'occ_husb_3',
    'C(occupation_husb)[T.4.0]': 'occ_husb_4',
    'C(occupation_husb)[T.5.0]': 'occ_husb_5',
    'C(occupation_husb)[T.6.0]': 'occ_husb_6',
}
X = X.rename(columns={**occ_names, **occ_husb_names})

# scikit-learn expects a 1-D target, so collapse the single-column frame.
y = np.asarray(y).ravel()

# --- Hold-out split ---------------------------------------------------------
# 70% train / 30% test; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# --- Model ------------------------------------------------------------------
# `fit` returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class labels for the threshold-based metrics, and the probability of
# the second class (column 1 of predict_proba) for the ranking metric.
y_pred = model.predict(X_test)
test_proba = model.predict_proba(X_test)[:, 1]

# --- Hold-out evaluation ----------------------------------------------------
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
precision = metrics.precision_score(y_true=y_test, y_pred=y_pred)
recall = metrics.recall_score(y_true=y_test, y_pred=y_pred)
f1_score = metrics.f1_score(y_true=y_test, y_pred=y_pred)
roc_auc = metrics.roc_auc_score(y_true=y_test, y_score=test_proba)

# One line per metric, rounded to two decimals.
print(
    f'Accuracy: {accuracy:.2f}',
    f'Precision: {precision:.2f}',
    f'Recall: {recall:.2f}',
    f'F1 Score: {f1_score:.2f}',
    f'ROC-AUC Score: {roc_auc:.2f}',
    sep='\n',
)

# --- Cross-validation -------------------------------------------------------
# 10-fold accuracy over the full X / y as a check on the single-split figure.
cv_scores = cross_val_score(model, X, y, scoring='accuracy', cv=10)
print(f'Cross-validated Accuracy: {np.mean(cv_scores):.2f}')


Accuracy: 0.72
Precision: 0.64
Recall: 0.38
F1 Score: 0.48
ROC-AUC Score: 0.76
Cross-validated Accuracy: 0.72
