In [2]:
import numpy as np

import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim

import statsmodels.api as sm

## Generate Data

In [3]:
# Generate a synthetic binary-classification problem (5 features, 3 of them
# informative). Fixed random_state keeps the data identical across runs.
X, y = make_classification(n_samples=5000, n_features=5, n_informative=3, n_redundant=0, random_state=42)

# Hold out 20% of the rows for evaluation; the split is seeded as well.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column labels feature1..featureN. They are derived from the width of X so
# they cannot drift out of sync with n_features above. Kept as a pandas
# Series because the coefficient table later calls fnames.tolist().
fnames = 'feature' + pd.Series(range(1, X.shape[1] + 1)).astype(str)

# Labelled DataFrame views of each split, with the label in a 'target' column.
# NOTE: the statsmodels cell below fits on the raw X_train/y_train arrays, so
# these frames are only a convenience for inspection.
df_train = pd.DataFrame(X_train, columns=fnames).assign(target=y_train)
df_test = pd.DataFrame(X_test, columns=fnames).assign(target=y_test)


# Prepare data for PyTorch
# Features become float32 tensors. Labels are also cast to float32 and
# reshaped to a column (N, 1) so they line up with the single-output logits
# that the loss compares them against.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

## Logistic Regression with PyTorch

In [4]:
# Seed torch's RNG so the randomly initialised layer weights are repeatable
torch.manual_seed(42)


class LogisticRegressionModel(nn.Module):
    """Logistic regression written as a single linear layer.

    Parameters
    ----------
    input_dim : int
        Number of input features (columns of the design matrix).
    """

    def __init__(self, input_dim):
        super().__init__()
        # One output unit: weight row + bias give the linear predictor.
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return the raw logits for ``x``; no sigmoid is applied here."""
        logits = self.linear(x)
        return logits


# Initialize the model, loss function, and optimizer
input_dim = X_train.shape[1]  # one weight per feature column
model = LogisticRegressionModel(input_dim)
# BCEWithLogitsLoss takes raw logits (it applies the sigmoid itself), which is
# why the model's forward() does not apply one.
objective = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Train the model
num_epochs = 2000  # depending on settings, this may need to be large

# Switch to training mode once up front: nothing inside the loop changes the
# mode, so re-issuing the call every epoch was redundant work.
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()                      # clear gradients from the previous step
    outputs = model(X_train_tensor)            # full batch: every training row, every step
    loss = objective(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

# Make predictions
# Evaluation mode + no_grad: no autograd graph is recorded for inference.
model.eval()
with torch.no_grad():
    y_pred_train_logits = model(X_train_tensor)
    y_pred_test_logits = model(X_test_tensor)
    # sigmoid maps logits to probabilities; round() turns them into 0/1 labels
    y_pred_train = torch.sigmoid(y_pred_train_logits).round()
    y_pred_test = torch.sigmoid(y_pred_test_logits).round()


## Logistic Regression with statsmodels

In [5]:
# Logit only estimates an intercept if the design matrix carries a constant
# column, so add one to both splits.
X_train_sm = sm.add_constant(X_train)
X_test_sm = sm.add_constant(X_test)

# Fit the logit model on the training split
model_sm = sm.Logit(y_train, X_train_sm).fit()

# Predicted probabilities for each split ...
train_probs_sm = model_sm.predict(X_train_sm)
test_probs_sm = model_sm.predict(X_test_sm)

# ... rounded to hard 0/1 class labels
y_pred_train_sm = train_probs_sm.round()
y_pred_test_sm = test_probs_sm.round()

# Optional: uncomment for the full coefficient table
# print(model_sm.summary())

Optimization terminated successfully.
         Current function value: 0.262415
         Iterations 8


##  Compare Results

### Accuracy

In [6]:
# Accuracy of the PyTorch model on each split (tensor predictions are
# converted to NumPy before scoring)
train_accuracy, test_accuracy = (
    accuracy_score(true_labels, predicted.numpy())
    for true_labels, predicted in ((y_train, y_pred_train), (y_test, y_pred_test))
)

In [7]:
# Accuracy of the statsmodels fit on each split
train_accuracy_sm, test_accuracy_sm = (
    accuracy_score(true_labels, predicted)
    for true_labels, predicted in ((y_train, y_pred_train_sm), (y_test, y_pred_test_sm))
)

In [8]:
# One row per (model, split) pair so the two libraries can be compared side by side
accuracy_rows = [
    ('PyTorch', 'Train', train_accuracy),
    ('PyTorch', 'Test', test_accuracy),
    ('statsmodels', 'Train', train_accuracy_sm),
    ('statsmodels', 'Test', test_accuracy_sm),
]
accuracy_df = pd.DataFrame(accuracy_rows, columns=['Model', 'Dataset', 'Accuracy'])

accuracy_df

Unnamed: 0,Model,Dataset,Accuracy
0,PyTorch,Train,0.891
1,PyTorch,Test,0.899
2,statsmodels,Train,0.891
3,statsmodels,Test,0.899


### Coefficients

In [9]:
# Pull the fitted parameters out of the torch layer as NumPy arrays
ptcoef = model.linear.weight.detach().numpy()     # weight matrix with a single row
ptintercept = model.linear.bias.detach().numpy()  # one-element bias vector

# Intercept first, then the feature weights: the same order as
# model_sm.params, where the constant column comes first
pt_params = np.concatenate((ptintercept, ptcoef[0]))

# Side-by-side table of the two fits
coefficients_df = pd.DataFrame({
    'Coefficient': ['Intercept', *fnames.tolist()],
    'PyTorch': pt_params,
    'statsmodels': model_sm.params,
})

coefficients_df.round(3)

Unnamed: 0,Coefficient,PyTorch,statsmodels
0,Intercept,0.198,0.198
1,feature1,0.035,0.035
2,feature2,0.04,0.04
3,feature3,2.399,2.399
4,feature4,0.392,0.392
5,feature5,0.639,0.64
