In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

print("All imports done")

All imports done


In [2]:
# Load the Part A dataset and separate the label column from the feature columns
df = pd.read_csv('Wisconsin.csv')
y = df['target'].to_numpy()
X = df.drop(columns='target').to_numpy()
feature_names = [column for column in df.columns if column != 'target']

print(f"Data loaded. Shape: X={X.shape}, y={y.shape}")

Data loaded. Shape: X=(569, 30), y=(569,)


In [3]:
# Standardize features - same as Part A
def standardize(X):
    """
    Z-score every feature column of X.

    Args:
        X: array of features; statistics are taken along axis 0

    Returns:
        X_std: (X - column mean) / column std
        mean_vals: per-column means
        std_vals: per-column standard deviations, with zeros replaced by 1
    """
    column_means = np.mean(X, axis=0)
    column_stds = np.std(X, axis=0)
    # A constant column has zero spread; divide by 1 so it maps to 0 instead of NaN
    constant_columns = column_stds == 0
    column_stds[constant_columns] = 1
    centered = X - column_means
    return centered / column_stds, column_means, column_stds

# Scale every feature column and keep the per-column mean/std returned by standardize
X_scaled, mean, std = standardize(X)
print("Features standardized")

Features standardized


In [4]:
# Functions from Part B
def sigmoid(z):
    """
    Logistic function 1 / (1 + exp(-z)), evaluated without floating-point overflow.

    Args:
        z: scalar or array-like of real values

    Returns:
        Element-wise sigmoid of z: a NumPy float scalar for scalar input,
        an ndarray of the same shape otherwise
    """
    z = np.asarray(z, dtype=float)
    # exp is only ever taken of a non-positive number, so it cannot overflow
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d array to a scalar and leaves real arrays as they are
    return result[()]

def hypothesis(X, w, b):
    """
    Predicted probability for each row of X: sigmoid(X . w + b).

    Args:
        X: features
        w: weights
        b: bias

    Returns:
        sigmoid of the linear score np.dot(X, w) + b
    """
    return sigmoid(np.dot(X, w) + b)

def compute_cost(X, y, w, b):
    """
    Mean binary cross-entropy of the logistic model on (X, y).

    Args:
        X: features
        y: labels (0 or 1)
        w: weights
        b: bias

    Returns:
        cost: -mean(y * log(h) + (1 - y) * log(1 - h)), with h = hypothesis(X, w, b)
    """
    epsilon = 1e-15
    # Keep probabilities strictly inside (0, 1) so neither log ever receives 0
    h = np.clip(hypothesis(X, w, b), epsilon, 1 - epsilon)
    cost = -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))
    return cost

def compute_gradients(X, y, w, b):
    """
    Gradients used by gradient descent for the logistic model.

    Args:
        X: features
        y: labels
        w: weights
        b: bias

    Returns:
        dw: X^T (h - y) / m, one entry per weight
        db: mean of (h - y)
    """
    n_samples = X.shape[0]
    residual = hypothesis(X, w, b) - y
    grad_w = np.dot(X.T, residual) / n_samples
    grad_b = np.mean(residual)
    return grad_w, grad_b

print("Functions loaded from Part B")

Functions loaded from Part B


---
# Part C1: Manual Data Partitioning (2 Marks)

Implement a train/test split from scratch without external libraries

In [5]:
def train_test_split(X, y, test_size=0.2, random_state=None):
    """
    Split data into train and test sets manually using numpy

    Args:
        X: features (array-like, samples along the first axis)
        y: target (array-like with one entry per sample)
        test_size: fraction for test set, between 0 and 1 (default 0.2 = 80/20 split)
        random_state: seed for reproducibility; None draws from numpy's global RNG

    Returns:
        X_train, X_test, y_train, y_test

    Raises:
        ValueError: if test_size is outside [0, 1] or X and y differ in length
    """
    X = np.asarray(X)
    y = np.asarray(y)

    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    m = X.shape[0]
    if y.shape[0] != m:
        raise ValueError(f"X and y have different lengths: {m} vs {y.shape[0]}")

    # A private RandomState yields the same permutation as np.random.seed(seed)
    # followed by np.random.shuffle, but leaves the caller's global RNG untouched.
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    # Create shuffled indices
    indices = np.arange(m)
    rng.shuffle(indices)

    # Calculate split point: the first (1 - test_size) share goes to training
    split_idx = int((1 - test_size) * m)

    # Split indices into train and test
    train_indices = indices[:split_idx]
    test_indices = indices[split_idx:]

    # Use indices to split X and y
    X_train = X[train_indices]
    X_test = X[test_indices]
    y_train = y[train_indices]
    y_test = y[test_indices]

    return X_train, X_test, y_train, y_test

print("Train-test split function created")

Train-test split function created


In [6]:
# Hold out 20% of the standardized data for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

n_train = X_train.shape[0]
n_test = X_test.shape[0]
n_total = n_train + n_test
train_pct = n_train / n_total * 100
test_pct = n_test / n_total * 100

print(f"Training set: {n_train} samples")
print(f"Test set: {n_test} samples")
print(f"Total samples: {n_total}")
print(f"\nTrain/Test ratio: {train_pct:.1f}% / {test_pct:.1f}%")

Training set: 455 samples
Test set: 114 samples
Total samples: 569

Train/Test ratio: 80.0% / 20.0%


In [7]:
# Count each class in both partitions to see how balanced the random split turned out
train_class_0 = np.sum(y_train == 0)
train_class_1 = np.sum(y_train == 1)
test_class_0 = np.sum(y_test == 0)
test_class_1 = np.sum(y_test == 1)

for heading, labels, benign, malignant in (
    ("Training set class distribution:", y_train, train_class_0, train_class_1),
    ("\nTest set class distribution:", y_test, test_class_0, test_class_1),
):
    size = len(labels)
    print(heading)
    print(f"  Benign (0): {benign} ({benign/size*100:.1f}%)")
    print(f"  Malignant (1): {malignant} ({malignant/size*100:.1f}%)")

Training set class distribution:
  Benign (0): 290 (63.7%)
  Malignant (1): 165 (36.3%)

Test set class distribution:
  Benign (0): 67 (58.8%)
  Malignant (1): 47 (41.2%)


---
# Part C2: Fitting the Model (3 Marks)

Execute Gradient Descent using the training partition to estimate optimal parameters

In [8]:
def gradient_descent(X, y, iterations=1000, learning_rate=0.01):
    """
    Fit logistic regression parameters by batch gradient descent.

    Args:
        X: training features (2-D)
        y: training labels
        iterations: how many update steps to take
        learning_rate: multiplier applied to each gradient step

    Returns:
        w: learned weights
        b: learned bias
        costs: cost after every update, in order (one entry per iteration)
    """
    _, n_features = X.shape

    # Start from the all-zero model
    w = np.zeros(n_features)
    b = 0
    costs = []

    for step in range(1, iterations + 1):
        dw, db = compute_gradients(X, y, w, b)

        # Move against the gradient
        w = w - learning_rate * dw
        b = b - learning_rate * db

        # The cost is measured after the update, so costs[0] already reflects one step
        cost = compute_cost(X, y, w, b)
        costs.append(cost)

        # Progress line every 200 steps
        if step % 200 == 0:
            print(f"Iteration {step}: Cost = {cost:.6f}")

    return w, b, costs

print("Gradient descent function ready")

Gradient descent function ready


In [None]:
# Fit the logistic regression parameters on the training partition only
print("Starting training on training set...\n")
w_final, b_final, cost_history = gradient_descent(
    X_train, y_train, iterations=3000, learning_rate=0.1
)

# cost_history is recorded after each update, so its first entry is the cost after step 1
first_cost, last_cost = cost_history[0], cost_history[-1]
print("\nTraining complete!")
print(f"Initial cost: {first_cost:.6f}")
print(f"Final cost: {last_cost:.6f}")
print(f"Cost reduction: {first_cost - last_cost:.6f}")

In [None]:
# Learning curve on the training set: full history (left) and the final 500 steps (right)
fig, (ax_full, ax_tail) = plt.subplots(1, 2, figsize=(12, 5))

ax_full.plot(cost_history, 'b-', linewidth=2)
ax_full.set_xlabel('Iteration')
ax_full.set_ylabel('Cost')
ax_full.set_title('Training Cost vs Iterations')
ax_full.grid(True, alpha=0.3)

ax_tail.plot(cost_history[-500:], 'r-', linewidth=2)
ax_tail.set_xlabel('Iteration (Last 500)')
ax_tail.set_ylabel('Cost')
ax_tail.set_title('Training Cost - Last 500 Iterations')
ax_tail.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

print("Training curves plotted")

In [None]:
# Sanity-check the fitted model on the data it was trained on
train_predictions_prob = hypothesis(X_train, w_final, b_final)
prob_min = train_predictions_prob.min()
prob_max = train_predictions_prob.max()
prob_mean = train_predictions_prob.mean()

print("Training set predictions statistics:")
print(f"  Min probability: {prob_min:.6f}")
print(f"  Max probability: {prob_max:.6f}")
print(f"  Mean probability: {prob_mean:.6f}")

# Cost of the final parameters on the training partition
train_cost = compute_cost(X_train, y_train, w_final, b_final)
print(f"\nTraining cost: {train_cost:.6f}")

---
# Part C3: Generating Predictions (1 Mark)

Use the optimized parameters to classify unseen test observations using a decision threshold

In [None]:
def make_predictions(X, w, b, threshold=0.5):
    """
    Turn model probabilities into hard 0/1 class labels.

    Args:
        X: features
        w: weights
        b: bias
        threshold: probabilities at or above this value are labelled 1 (default 0.5)

    Returns:
        predictions: binary class predictions (0 or 1)
        probabilities: predicted probabilities
    """
    probabilities = hypothesis(X, w, b)
    meets_threshold = probabilities >= threshold
    return meets_threshold.astype(int), probabilities

print("Prediction function created")

In [None]:
# Classify the held-out test rows with the fitted parameters
y_pred_test, y_pred_prob_test = make_predictions(X_test, w_final, b_final, threshold=0.5)

predicted_benign = np.sum(y_pred_test == 0)
predicted_malignant = np.sum(y_pred_test == 1)
actual_benign = np.sum(y_test == 0)
actual_malignant = np.sum(y_test == 1)

print("Test set predictions made")
print("\nPredictions summary:")
print(f"  Predicted Benign (0): {predicted_benign}")
print(f"  Predicted Malignant (1): {predicted_malignant}")
print("\nActual distribution:")
print(f"  Actual Benign (0): {actual_benign}")
print(f"  Actual Malignant (1): {actual_malignant}")

In [None]:
# Show some sample predictions
print("Sample predictions on test set (first 10):")
print("\nActual | Predicted | Probability")
print("-" * 40)
# Slicing (rather than indexing 0..9) stays valid when the test set has fewer than 10 rows
n_shown = 10
for actual, pred, prob in zip(y_test[:n_shown], y_pred_test[:n_shown], y_pred_prob_test[:n_shown]):
    match = "✓" if actual == pred else "✗"
    print(f"  {actual}    |     {pred}      |  {prob:.4f}  {match}")

---
# Part C4: Assessment Metrics (3 Marks)

Determine Accuracy, Precision, and Recall from scratch

**Clinical Perspective on Recall:**
- Recall = True Positive / (True Positive + False Negative)
- In oncology screening, overlooking true cases (False Negative) has grave consequences
- High Recall prioritizes case identification
- Better to have unconfirmed suspicious findings than to miss actual pathology

In [None]:
def compute_metrics(y_true, y_pred):
    """
    Compute evaluation metrics manually

    Args:
        y_true: actual labels (array-like of 0/1)
        y_pred: predicted labels (array-like of 0/1, same shape as y_true)

    Returns:
        accuracy: (TP + TN) / all predictions, 0.0 when there are no predictions
        precision: TP / (TP + FP), 0.0 when nothing was predicted positive
        recall: TP / (TP + FN), 0.0 when there are no actual positives
        cm: dict with the confusion-matrix counts under keys 'TP', 'TN', 'FP', 'FN'

    Raises:
        ValueError: if y_true and y_pred do not have the same shape
    """
    # Plain lists compare as a whole with ==, so convert before the element-wise tests
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    actual_pos, actual_neg = (y_true == 1), (y_true == 0)
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)

    # Calculate confusion matrix components
    TP = int(np.sum(actual_pos & pred_pos))  # True Positives
    TN = int(np.sum(actual_neg & pred_neg))  # True Negatives
    FP = int(np.sum(actual_neg & pred_pos))  # False Positives
    FN = int(np.sum(actual_pos & pred_neg))  # False Negatives

    # Accuracy: correct predictions / all predictions
    total = TP + TN + FP + FN
    accuracy = (TP + TN) / total if total else 0.0

    # Precision: correct positive predictions / all positive predictions
    # Measures how many predicted malignant are actually malignant
    precision = TP / (TP + FP) if (TP + FP) else 0.0

    # Recall: correct positive predictions / all actual positives
    # Measures how many actual malignant cases we detected
    recall = TP / (TP + FN) if (TP + FN) else 0.0

    # Also return confusion matrix
    cm = {'TP': TP, 'TN': TN, 'FP': FP, 'FN': FN}

    return accuracy, precision, recall, cm

print("Metrics function created")

In [None]:
# Score the test-set predictions and print them as one report
accuracy, precision, recall, cm = compute_metrics(y_test, y_pred_test)

test_report = [
    "=" * 50,
    "TEST SET EVALUATION METRICS",
    "=" * 50,
    f"\nAccuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)",
    f"Precision: {precision:.4f} ({precision*100:.2f}%)",
    f"Recall:    {recall:.4f} ({recall*100:.2f}%)",
    "\nConfusion Matrix:",
    f"  True Positives (TP):   {cm['TP']}",
    f"  True Negatives (TN):   {cm['TN']}",
    f"  False Positives (FP):  {cm['FP']}",
    f"  False Negatives (FN):  {cm['FN']}",
]
print("\n".join(test_report))

In [None]:
# Same metrics on the training partition, for comparison with the test-set scores
y_pred_train, _ = make_predictions(X_train, w_final, b_final, threshold=0.5)
train_acc, train_prec, train_rec, train_cm = compute_metrics(y_train, y_pred_train)

train_report = [
    "=" * 50,
    "TRAINING SET EVALUATION METRICS",
    "=" * 50,
    f"\nAccuracy:  {train_acc:.4f} ({train_acc*100:.2f}%)",
    f"Precision: {train_prec:.4f} ({train_prec*100:.2f}%)",
    f"Recall:    {train_rec:.4f} ({train_rec*100:.2f}%)",
]
print("\n".join(train_report))

In [None]:
# Visualize confusion matrix
def _draw_confusion_matrix(ax, counts, title, cmap):
    """Render a 2x2 confusion matrix (rows = actual, columns = predicted) on ax."""
    ax.imshow(counts, cmap=cmap, aspect='auto')
    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(['Predicted 0', 'Predicted 1'])
    ax.set_yticklabels(['Actual 0', 'Actual 1'])
    ax.set_title(title)

    # imshow scales colors between the smallest and largest count, so cells below the
    # midpoint are drawn light: use black text there and white text on the dark cells.
    midpoint = (counts.max() + counts.min()) / 2
    for row in range(2):
        for col in range(2):
            value = counts[row, col]
            ax.text(col, row, str(value),
                    ha='center', va='center',
                    color='white' if value > midpoint else 'black',
                    fontsize=14, fontweight='bold')


fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Training set
train_cm_array = np.array([[train_cm['TN'], train_cm['FP']],
                           [train_cm['FN'], train_cm['TP']]])
_draw_confusion_matrix(axes[0], train_cm_array, 'Training Set Confusion Matrix', 'Blues')

# Test set
test_cm_array = np.array([[cm['TN'], cm['FP']],
                          [cm['FN'], cm['TP']]])
_draw_confusion_matrix(axes[1], test_cm_array, 'Test Set Confusion Matrix', 'Greens')

plt.tight_layout()
plt.show()

print("Confusion matrices plotted")

In [None]:
# Grouped bar chart: training vs test score for each metric
metrics = ['Accuracy', 'Precision', 'Recall']
train_scores = [train_acc, train_prec, train_rec]
test_scores = [accuracy, precision, recall]

x = np.arange(len(metrics))
width = 0.35
half_width = width / 2

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(x - half_width, train_scores, width, label='Training Set', color='skyblue', edgecolor='black')
ax.bar(x + half_width, test_scores, width, label='Test Set', color='lightcoral', edgecolor='black')

ax.set(ylabel='Score', title='Model Performance Metrics Comparison', ylim=(0, 1.1))
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
ax.grid(axis='y', alpha=0.3)

# Write each score just above its bar
for scores, offset in ((train_scores, -half_width), (test_scores, half_width)):
    for position, score in enumerate(scores):
        ax.text(position + offset, score + 0.02, f'{score:.3f}',
                ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("Metrics comparison plotted")

## Why Recall is Crucial in Cancer Detection

In our test set, **Recall = TP / (TP + FN)**: the share of actual malignant cases that the model flagged as malignant. The recall value and the TP, FP and FN counts referred to below are printed by the test-set metrics cell in Part C4.

This means we correctly identified **TP out of (TP + FN)** actual malignant cases.

**Why this matters for healthcare:**

1. **False Negatives are Dangerous**: Missing a cancer case (counted in FN) could delay critical treatment
2. **False Positives are Less Harmful**: Extra screening (counted in FP) is inconvenient but safe
3. **Better to Err on the Safe Side**: In medical diagnosis, we'd rather have more false alarms than miss real cases
4. **Recall vs Precision Trade-off**: High recall sometimes means lower precision, but that's acceptable here

**Our Model's Performance:**
- The model caught **recall × 100 %** of actual cancer cases
- This is a strong recall for a medical application
- Only the FN cases were missed (False Negatives)

In [None]:
# Display comprehensive summary
# Split shares and the iteration count are derived from the actual objects so the
# report cannot drift from the run it describes.
n_train = X_train.shape[0]
n_test = X_test.shape[0]
n_total = n_train + n_test
first_cost, last_cost = cost_history[0], cost_history[-1]

print("\n" + "="*60)
print("PART C SUMMARY: MODEL TRAINING & EVALUATION")
print("="*60)

print("\n✓ TRAIN-TEST SPLIT:")
print(f"  Training samples: {n_train} ({n_train / n_total * 100:.0f}%)")
print(f"  Test samples: {n_test} ({n_test / n_total * 100:.0f}%)")

print("\n✓ MODEL TRAINING:")
# gradient_descent appends one cost per iteration, so the history length is the step count
print(f"  Iterations: {len(cost_history)}")
# NOTE: the learning rate is not stored anywhere; keep this in sync with the training call
print(f"  Learning rate: 0.1")
print(f"  Initial cost: {first_cost:.6f}")
print(f"  Final cost: {last_cost:.6f}")
print(f"  Improvement: {(first_cost-last_cost)/first_cost*100:.2f}%")

print("\n✓ TEST SET PREDICTIONS:")
print(f"  Total samples: {len(y_test)}")
print(f"  Threshold: 0.5")

print("\n✓ EVALUATION METRICS (TEST SET):")
print(f"  Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"  Precision: {precision:.4f} ({precision*100:.2f}%)")
print(f"  Recall:    {recall:.4f} ({recall*100:.2f}%)")

print("\n✓ CONFUSION MATRIX (TEST SET):")
print(f"  TP (Correctly identified malignant): {cm['TP']}")
print(f"  TN (Correctly identified benign): {cm['TN']}")
print(f"  FP (False alarms): {cm['FP']}")
print(f"  FN (Missed cancer cases): {cm['FN']}")

print("\n" + "="*60)
print("Ready for Part D: Business & Healthcare Insights")
print("="*60)