# Binary Classification with Neural Networks
## Circles Dataset

### 1. Data Retrieval and Inspection

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from sklearn.model_selection import train_test_split

# Load the data
# The path is relative, so the CSV must sit in the notebook's working directory.
# Later cells read the 'X1', 'X2' and 'label' columns from this frame.
df = pd.read_csv('circles_binary_classification.csv')
# Trailing expression: preview the first rows as the cell output.
df.head()

In [None]:
# Summary statistics of the loaded frame, shown as the cell output.
df.describe()

In [None]:
# Report the table dimensions and how many rows carry each label value,
# which shows whether the classes are balanced.
print(f"Shape: {df.shape}")
print(f"\nClass distribution:\n{df['label'].value_counts()}")

### 2. Data Cleaning & Feature Design

The data is clean, so no preprocessing is needed.

In [None]:
# Extract features (X) and labels (y)
# .values returns NumPy arrays: X holds the two coordinate columns,
# y holds the single label column.
X = df[['X1', 'X2']].values
y = df['label'].values

# Sanity check: X should have two columns and y should be one-dimensional.
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

In [None]:
# Convert to PyTorch tensors
# Both arrays are cast to float32. The labels are floats too because they are
# later passed as targets to the loss function together with float logits.
# Note: X and y are rebound here from NumPy arrays to tensors.
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

print(f"X tensor shape: {X.shape}")
print(f"y tensor shape: {y.shape}")

### 3. Visualize Data

In [None]:
# Plot the data
# One scatter call per class: the boolean mask on y selects the matching rows
# of X, and the second index picks the feature column (0 = X1, 1 = X2).
plt.figure(figsize=(8, 6))
plt.scatter(X[y == 0, 0], X[y == 0, 1], c='blue', label='Class 0', alpha=0.6)
plt.scatter(X[y == 1, 0], X[y == 1, 1], c='red', label='Class 1', alpha=0.6)
plt.xlabel('X1')
plt.ylabel('X2')
plt.title('Circles Dataset')
plt.legend()
plt.show()

### 4. Train/Test Split

In [None]:
# Split data (80% train, 20% test)
# random_state pins the shuffle so the same rows land in each split on every run.
# NOTE(review): the split is not stratified (no stratify=y), so class
# proportions in train and test may differ slightly.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

### 5. Device & Dtype Setup

In [None]:
# Check if GPU is available
# Falls back to the CPU when CUDA is not available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Move data to device
# Models created later are moved to the same device with .to(device), so
# inputs and weights end up on matching devices.
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

### 6. Implement Models

In [None]:
# ModelV0: 2 → 5 → 1 (no activation)
class ModelV0(nn.Module):
    """Two stacked linear layers (2 -> 5 -> 1) with no non-linearity between them."""

    def __init__(self):
        super().__init__()
        # Layers are created in forward order so seeded initialisation is stable.
        self.layer_1 = nn.Linear(2, 5)
        self.layer_2 = nn.Linear(5, 1)

    def forward(self, x):
        """Map a batch of 2-feature inputs to one raw logit per sample."""
        hidden = self.layer_1(x)
        logits = self.layer_2(hidden)
        return logits

# Instantiate the model and move its parameters to the selected device.
model_0 = ModelV0().to(device)
# Bare trailing expression: in a notebook this displays the module's layer summary.
model_0

In [None]:
# ModelV1: 2 → 15 → 15 → 1 (no activation)
class ModelV1(nn.Module):
    """Three stacked linear layers (2 -> 15 -> 15 -> 1) with no non-linearity."""

    def __init__(self):
        super().__init__()
        # Layers are created in forward order so seeded initialisation is stable.
        self.layer_1 = nn.Linear(2, 15)
        self.layer_2 = nn.Linear(15, 15)
        self.layer_3 = nn.Linear(15, 1)

    def forward(self, x):
        """Map a batch of 2-feature inputs to one raw logit per sample."""
        out = x
        for layer in (self.layer_1, self.layer_2, self.layer_3):
            out = layer(out)
        return out

# Instantiate the model and move its parameters to the selected device.
model_1 = ModelV1().to(device)
# Bare trailing expression: in a notebook this displays the module's layer summary.
model_1

In [None]:
# ModelV2: 2 → 64 → 64 → 10 → 1 (with ReLU)
class ModelV2(nn.Module):
    """Four linear layers (2 -> 64 -> 64 -> 10 -> 1) with ReLU after each hidden layer."""

    def __init__(self):
        super().__init__()
        # Layers are created in forward order so seeded initialisation is stable.
        self.layer_1 = nn.Linear(2, 64)
        self.layer_2 = nn.Linear(64, 64)
        self.layer_3 = nn.Linear(64, 10)
        self.layer_4 = nn.Linear(10, 1)
        # One stateless ReLU module is shared by all hidden layers.
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of 2-feature inputs to one raw logit per sample."""
        out = x
        # ReLU follows every hidden layer; the output layer stays linear so the
        # model emits raw logits.
        for hidden in (self.layer_1, self.layer_2, self.layer_3):
            out = self.relu(hidden(out))
        return self.layer_4(out)

# Instantiate the model and move its parameters to the selected device.
model_2 = ModelV2().to(device)
# Bare trailing expression: in a notebook this displays the module's layer summary.
model_2

### 7. Loss Function, Optimizer & Metrics

In [None]:
# Loss function (BCEWithLogitsLoss has sigmoid built-in)
# It therefore takes raw logits, which is why the models in this notebook end
# in a plain nn.Linear layer rather than a sigmoid.
loss_fn = nn.BCEWithLogitsLoss()

# Accuracy function
def accuracy_fn(y_true, y_pred):
    """Return the percentage of predictions that equal their labels.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted labels holding the same number of
            elements as ``y_true`` (any shape, e.g. ``(N,)`` or ``(N, 1)``).

    Returns:
        Accuracy as a float between 0 and 100. Empty input yields 0.0.

    Raises:
        ValueError: If the two tensors hold a different number of elements.
    """
    # Flatten both sides so (N,) vs (N, 1) is compared element by element.
    # Unflattened, torch.eq would broadcast them to an (N, N) grid and the
    # "accuracy" could exceed 100%. Flattening also makes 0-d tensors work.
    y_true = y_true.reshape(-1)
    y_pred = y_pred.reshape(-1)

    total = y_pred.numel()
    if y_true.numel() != total:
        raise ValueError(
            f"y_true has {y_true.numel()} elements but y_pred has {total}"
        )
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct = torch.eq(y_true, y_pred).sum().item()
    return (correct / total) * 100

### 8. Training Loop

In [None]:
# Training and testing function
def train_and_test_loop(
    model: nn.Module,
    epochs: int,
    X_train: torch.Tensor,
    y_train: torch.Tensor,
    X_test: torch.Tensor,
    y_test: torch.Tensor,
    loss_fn: nn.Module,
    optimizer: torch.optim.Optimizer
):
    """Train ``model`` with full-batch updates and evaluate it after every epoch.

    Each epoch runs one forward/backward pass over the whole training set,
    takes one optimizer step, then scores the test set without gradients.

    Args:
        model: Network emitting raw logits; it is expected to produce one logit
            per sample in a trailing dimension of size 1.
        epochs: Number of full passes over the training data.
        X_train: Training features.
        y_train: Training targets, one per sample.
        X_test: Test features.
        y_test: Test targets, one per sample.
        loss_fn: Loss called as ``loss_fn(logits, targets)``; it receives raw
            logits, not probabilities.
        optimizer: Optimizer already bound to ``model``'s parameters.

    Returns:
        A 4-tuple of per-epoch lists, each of length ``epochs``:
        ``(train_loss, train_accuracy, test_loss, test_accuracy)``.
        Losses are Python floats; accuracies are percentages.
    """
    # Lists to store results
    loss_list = []
    acc_list = []
    test_losses = []
    test_acc_list = []

    # Training loop
    for epoch in range(epochs):
        ### Training
        model.train()

        # 1. Forward pass (model outputs raw logits).
        # Squeeze only the trailing size-1 dimension. A bare .squeeze() would
        # also drop the batch dimension of a one-sample batch, giving a 0-d
        # tensor that no longer matches the shape of the targets.
        y_logits = model(X_train).squeeze(dim=-1)
        # Thresholded predictions are used for the metric only, so detach them
        # from the autograd graph.
        y_pred = torch.round(torch.sigmoid(y_logits.detach()))

        # 2. Calculate loss and accuracy
        loss = loss_fn(y_logits, y_train)
        acc = accuracy_fn(y_true=y_train, y_pred=y_pred)

        # 3. Zero gradients
        optimizer.zero_grad()

        # 4. Backpropagation
        loss.backward()

        # 5. Update weights
        optimizer.step()

        ### Testing
        model.eval()
        with torch.inference_mode():
            # 1. Forward pass
            test_logits = model(X_test).squeeze(dim=-1)
            test_pred = torch.round(torch.sigmoid(test_logits))

            # 2. Calculate loss and accuracy
            test_loss = loss_fn(test_logits, y_test)
            test_acc = accuracy_fn(y_true=y_test, y_pred=test_pred)

        # Store results (.item() converts the 0-d loss tensors to Python floats)
        loss_list.append(loss.item())
        acc_list.append(acc)
        test_losses.append(test_loss.item())
        test_acc_list.append(test_acc)

        # Print progress every 10 epochs
        if epoch % 10 == 0:
            print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy: {acc:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")

    return loss_list, acc_list, test_losses, test_acc_list

### Helper Functions for Visualization

In [None]:
# Plot decision boundary
def plot_decision_boundary(model, X, y):
    """Draw the model's predicted-probability surface with the data points on top.

    The model is evaluated on a 100 x 100 grid covering the range of ``X``
    padded by 0.5 on every side. The sigmoid of its output is drawn as filled
    contours on the current matplotlib axes, and the samples are overlaid
    (label 0 in blue, label 1 in red).

    Args:
        model: Binary classifier that returns one raw logit per input row.
        X: Feature tensor whose first two columns are plotted.
        y: Label tensor with values 0 and 1, aligned with ``X``.

    Side effects:
        Switches ``model`` to eval mode and draws on the current axes. It
        neither creates a figure nor calls ``plt.show()``.
    """
    # Move data to CPU for plotting
    X_np = X.cpu().numpy()
    y_np = y.cpu().numpy()

    # Create mesh
    x_min, x_max = X_np[:, 0].min() - 0.5, X_np[:, 0].max() + 0.5
    y_min, y_max = X_np[:, 1].min() - 0.5, X_np[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))

    # Put the grid on the device that holds the model's weights rather than
    # relying on a notebook-global `device`. Fall back to X's device for a
    # model without parameters.
    first_param = next(model.parameters(), None)
    grid_device = first_param.device if first_param is not None else X.device
    grid = torch.tensor(np.c_[xx.ravel(), yy.ravel()], dtype=torch.float32).to(grid_device)

    # Make predictions
    model.eval()
    with torch.inference_mode():
        Z = model(grid)
        # Convert logits to probabilities before plotting.
        Z = torch.sigmoid(Z).cpu().numpy()
    Z = Z.reshape(xx.shape)

    # Plot
    plt.contourf(xx, yy, Z, alpha=0.4, cmap='RdYlBu')
    plt.scatter(X_np[y_np == 0, 0], X_np[y_np == 0, 1], c='blue', alpha=0.6, edgecolors='k')
    plt.scatter(X_np[y_np == 1, 0], X_np[y_np == 1, 1], c='red', alpha=0.6, edgecolors='k')

# Plot loss curves
def plot_loss_curves(train_losses, test_losses):
    """Plot per-epoch training and test loss on one new figure and show it.

    Args:
        train_losses: Sequence of training-loss values, one per epoch.
        test_losses: Sequence of test-loss values, one per epoch.
    """
    # The x axis is the epoch index, taken from the length of the training series.
    epoch_axis = range(len(train_losses))

    plt.figure(figsize=(8, 5))
    for series, series_label in ((train_losses, 'Train Loss'), (test_losses, 'Test Loss')):
        plt.plot(epoch_axis, series, label=series_label)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss Curves')
    plt.legend()
    plt.show()

### 9. Training ModelV0

In [None]:
# Set seed for reproducibility
# Seeding before the model is built makes its random weight initialisation repeatable.
torch.manual_seed(42)

# Reset model and optimizer
# A fresh ModelV0 trained with plain SGD over its own parameters.
model = ModelV0().to(device)
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.1)

# Show untrained predictions
# Baseline: decision surface of the randomly initialised model, train split on
# the left and test split on the right.
print("Untrained predictions:")
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title("Train (Untrained)")
plot_decision_boundary(model, X_train, y_train)
plt.subplot(1, 2, 2)
plt.title("Test (Untrained)")
plot_decision_boundary(model, X_test, y_test)
plt.show()

In [None]:
# Train ModelV0
# Uses the `model` and `optimizer` created in the previous cell.
# The four returned lists hold, per epoch: train loss, train accuracy,
# test loss and test accuracy.
print("\nTraining ModelV0 (100 epochs)...")
train_losses, acc_list, test_losses, test_acc = train_and_test_loop(
    model=model,
    epochs=100,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    loss_fn=loss_fn,
    optimizer=optimizer
)

# Plot decision boundaries after training
# Train split on the left, test split on the right.
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title("Train")
plot_decision_boundary(model, X_train, y_train)
plt.subplot(1, 2, 2)
plt.title("Test")
plot_decision_boundary(model, X_test, y_test)
plt.show()

plot_loss_curves(train_losses, test_losses)

**Note:** ModelV0 is **underfitting** - it can't learn the circular pattern because it has no activation functions.

### Training ModelV1

In [None]:
# Reset model and optimizer
# Re-seed so ModelV1's random initialisation is repeatable. `model` and
# `optimizer` are rebound, replacing the ModelV0 objects.
torch.manual_seed(42)
model = ModelV1().to(device)
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.1)

# Train ModelV1
# The result names are reused, so the ModelV0 histories are overwritten here.
print("Training ModelV1 (1000 epochs)...")
train_losses, acc_list, test_losses, test_acc = train_and_test_loop(
    model=model,
    epochs=1000,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    loss_fn=loss_fn,
    optimizer=optimizer
)

# Plot decision boundaries
# Train split on the left, test split on the right.
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title("Train")
plot_decision_boundary(model, X_train, y_train)
plt.subplot(1, 2, 2)
plt.title("Test")
plot_decision_boundary(model, X_test, y_test)
plt.show()

plot_loss_curves(train_losses, test_losses)

**Note:** ModelV1 still struggles. Even with more layers, without activation functions it can only create linear boundaries, because a stack of linear layers collapses into a single linear transformation.

### Training ModelV2 (with ReLU)

In [None]:
# Reset model and optimizer
# Re-seed so ModelV2's random initialisation is repeatable. `model` and
# `optimizer` are rebound, replacing the ModelV1 objects.
torch.manual_seed(42)
model = ModelV2().to(device)
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.1)

# Train ModelV2
# The result names are reused, so after this cell train_losses, acc_list,
# test_losses and test_acc hold the ModelV2 + SGD histories.
print("Training ModelV2 (1500 epochs)...")
train_losses, acc_list, test_losses, test_acc = train_and_test_loop(
    model=model,
    epochs=1500,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    loss_fn=loss_fn,
    optimizer=optimizer
)

# Plot decision boundaries
# Train split on the left, test split on the right.
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title("Train")
plot_decision_boundary(model, X_train, y_train)
plt.subplot(1, 2, 2)
plt.title("Test")
plot_decision_boundary(model, X_test, y_test)
plt.show()

plot_loss_curves(train_losses, test_losses)

**Result:** ModelV2 performs much better! ReLU activation allows it to learn non-linear patterns.

### Extra Credit: Adam vs SGD

In [None]:
# Train with Adam optimizer
# Same seed and architecture as the SGD run of ModelV2. The results are stored
# under separate *_adam names so the SGD histories are not overwritten.
torch.manual_seed(42)
model_adam = ModelV2().to(device)
optimizer_adam = torch.optim.Adam(params=model_adam.parameters(), lr=0.01)

print("Training ModelV2 with Adam (500 epochs)...")
train_losses_adam, acc_list_adam, test_losses_adam, test_acc_adam = train_and_test_loop(
    model=model_adam,
    epochs=500,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    loss_fn=loss_fn,
    optimizer=optimizer_adam
)

# Compare Adam vs SGD
# NOTE: train_losses / test_losses / acc_list / test_acc hold whatever the most
# recently executed SGD training cell assigned. For a like-for-like comparison
# the ModelV2 SGD cell must be the last one run before this cell.
# The SGD histories are cut to their first 500 epochs to match the Adam run.
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(train_losses[:500], label='SGD Train')
plt.plot(test_losses[:500], label='SGD Test')
plt.plot(train_losses_adam, label='Adam Train', linestyle='--')
plt.plot(test_losses_adam, label='Adam Test', linestyle='--')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss: Adam vs SGD')
plt.legend()

# Right panel: the same comparison for accuracy.
plt.subplot(1, 2, 2)
plt.plot(acc_list[:500], label='SGD Train Acc')
plt.plot(test_acc[:500], label='SGD Test Acc')
plt.plot(acc_list_adam, label='Adam Train Acc', linestyle='--')
plt.plot(test_acc_adam, label='Adam Test Acc', linestyle='--')
plt.xlabel('Epochs')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy: Adam vs SGD')
plt.legend()
plt.tight_layout()
plt.show()

**Note:** Adam usually converges faster than SGD.

## Discussion and Conclusion

### What We Learned:

**1. Why Activation Functions Are Important:**
- ModelV0 and ModelV1 (no activation) could only make **straight lines** to separate data
- ModelV2 (with ReLU) could make **curved boundaries** to follow the circles
- Without activation functions, the model can't learn complex patterns

**2. Underfitting:**
- Models without activation are **underfitting** - they're too simple
- They can't capture the circular pattern in our data
- Even adding more layers doesn't help without activation functions

**3. Model Performance:**
- ModelV0: ~50% accuracy (just guessing)
- ModelV1: ~50% accuracy (still just guessing)
- ModelV2: ~95-100% accuracy (actually learning!)

**4. Optimizer Comparison:**
- The Adam optimizer (lr=0.01 here) usually converges faster than SGD (lr=0.1 here)
- Both can reach good results, but Adam typically needs fewer epochs

### Key Takeaway:
**Always use non-linear activation functions (like ReLU) between layers!** Without them, the network reduces to a single linear model and can only draw a straight-line decision boundary, no matter how many layers you add.