In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [6]:
df = pd.read_csv('/Users/Devyani/msads/ml2/finalproj/mlp_files/compas_sample_200.csv')

In [8]:
df['prison_in'] = pd.to_datetime(df['prison_in'], errors='coerce')
df['prison_out'] = pd.to_datetime(df['prison_out'], errors='coerce')
df['prison_in_year'] = df['prison_in'].dt.year
df['prison_out_year'] = df['prison_out'].dt.year
df = df.drop(columns=['person_id', 'prison_in', 'prison_out'])

In [10]:
y = df['is_recid'].values
X = df.drop(columns=['is_recid'])

In [12]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

In [14]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
])

In [16]:
X_transformed = preprocessor.fit_transform(X)

In [18]:
X_tensor = torch.tensor(X_transformed, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1)

In [20]:
class CompasDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [22]:
dataset = CompasDataset(X_tensor, y_tensor)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [24]:
class MLP(nn.Module):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

In [26]:
model = MLP(X_tensor.shape[1])
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [28]:
for epoch in range(10):
    for batch_X, batch_y in train_loader:
        pred = model(batch_X)
        loss = criterion(pred, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 1, Loss: 0.5848
Epoch 2, Loss: 0.5793
Epoch 3, Loss: 0.6519
Epoch 4, Loss: 0.5632
Epoch 5, Loss: 0.6388
Epoch 6, Loss: 0.4057
Epoch 7, Loss: 0.7179
Epoch 8, Loss: 0.4183
Epoch 9, Loss: 0.3828
Epoch 10, Loss: 0.6757


### evaluation

In [31]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

In [33]:
model.eval()


MLP(
  (model): Sequential(
    (0): Linear(in_features=279, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

In [35]:
with torch.no_grad():
    y_pred_probs = model(X_tensor).numpy().flatten()
    y_preds = (y_pred_probs >= 0.5).astype(int)
    y_true = y_tensor.numpy().flatten().astype(int)

In [37]:
accuracy = accuracy_score(y_true, y_preds)
print(f"\nAccuracy: {accuracy:.4f}")


Accuracy: 0.7600


In [39]:
print("\nClassification Report:")
print(classification_report(y_true, y_preds))


Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        48
           1       0.76      1.00      0.86       152

    accuracy                           0.76       200
   macro avg       0.38      0.50      0.43       200
weighted avg       0.58      0.76      0.66       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [41]:
cm = confusion_matrix(y_true, y_preds)
print("\nConfusion Matrix:")
print(cm)


Confusion Matrix:
[[  0  48]
 [  0 152]]
