# Required Packages Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide plotting defaults: seaborn's white grid theme first, then a
# wide default figure size (applied after the theme so the theme cannot reset it)
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

---

## A1: Loading the Dataset (1 Mark)

Begin by reading the Wisconsin Breast Cancer database and performing initial exploration.

### About the Dataset:
- **Origin**: Wisconsin Breast Cancer Database  
- **Dimensions**: 30 numeric attributes computed from microscopy images
- **Class Variable**: Binary outcome (1 = Malignant, 0 = Benign)  
- **Observations**: Approximately 569 patient cases

In [None]:
# Read the CSV into a DataFrame, then report its dimensions and column labels
dataset_path = 'Wisconsin.csv'
df = pd.read_csv(dataset_path)

n_columns = len(df.columns)
column_list = df.columns.tolist()

print("Dataset Loaded Successfully!")
print(f"Shape: {df.shape}")
print(f"\nColumn Names ({n_columns} features):")
print(column_list)

In [None]:
# Preview the top of the table to sanity-check how the columns were parsed
print("First 5 rows of the dataset:")
head_preview = df.head()
display(head_preview)

In [None]:
# Per-column summary statistics (count, mean, std, quartiles, extremes)
print("Dataset Statistics:")
summary_stats = df.describe()
display(summary_stats)

In [None]:
# List the dtype pandas inferred for every column
print("Data Types:")
dtype_table = df.dtypes
print(dtype_table)

---

## A2: Preprocessing the Data (2 Marks)

Before proceeding with analysis, we need to address missing information and separate input/output variables properly.

### Main Tasks:
1. Inspect for null or incomplete records
2. Remove any non-predictor columns (IDs, etc.)
3. Divide into predictors (X) and target variable (y)

In [None]:
# Count null entries per column, then the grand total across the whole frame
print("Missing Values Report:")
missing_values = df.isna().sum()
print(missing_values)
total_missing = missing_values.sum()
print(f"\nTotal missing values: {total_missing}")

In [None]:
# Drop every row that contains at least one null and report how many were lost.
# When the data has no nulls this is a no-op and df_cleaned equals df.
rows_before = df.shape[0]
print("Dataset before cleaning:")
print(f"Shape: {df.shape}")

df_cleaned = df.dropna(axis=0, how='any')
rows_after = df_cleaned.shape[0]

print(f"\nDataset after removing missing values:")
print(f"Shape: {df_cleaned.shape}")
print(f"Rows removed: {rows_before - rows_after}")

In [None]:
# Separate target from features.
# This notebook treats 'target' as the diagnosis label (1=Malignant, 0=Benign).
# NOTE(review): scikit-learn's copy of this dataset encodes 0=malignant, 1=benign;
# confirm which encoding Wisconsin.csv actually uses before interpreting results.
TARGET_COLUMN = 'target'

# Identifier-style columns carry no predictive signal, so keep them out of X.
# Only columns literally named "id" or pandas' "Unnamed: N" header artifacts
# are treated as identifiers; everything else stays a predictor.
non_predictor_columns = [
    col for col in df_cleaned.columns
    if col != TARGET_COLUMN
    and (str(col).strip().lower() == 'id' or str(col).startswith('Unnamed:'))
]

# Build the predictor frame once so X and feature_names can never drift apart
features_df = df_cleaned.drop(columns=[TARGET_COLUMN] + non_predictor_columns)

y = df_cleaned[TARGET_COLUMN].values  # 1-D numpy array of labels
X = features_df.values                # 2-D numpy array, one column per feature
feature_names = features_df.columns.tolist()

if non_predictor_columns:
    print(f"Dropped non-predictor columns: {non_predictor_columns}")

print(f"Features shape (X): {X.shape}")
print(f"Target shape (y): {y.shape}")
print(f"\nFeature names ({len(feature_names)} features):")
for i, name in enumerate(feature_names, 1):
    print(f"  {i}. {name}")

In [None]:
# Analyze target distribution.
# Counts are looked up by label value rather than by position, so a missing
# class reports 0 samples instead of raising IndexError or mislabeling a count.
unique_classes, class_counts = np.unique(y, return_counts=True)
counts_by_label = dict(zip(unique_classes.tolist(), class_counts.tolist()))
benign_count = counts_by_label.get(0, 0)
malignant_count = counts_by_label.get(1, 0)

print("Class Distribution:")
print(f"Benign (0): {benign_count} samples ({benign_count/len(y)*100:.2f}%)")
print(f"Malignant (1): {malignant_count} samples ({malignant_count/len(y)*100:.2f}%)")
print(f"Total: {len(y)} samples")

---

## A3: Standardizing Features (2 Marks)

It's essential to rescale features to a common range for improved algorithm convergence.

### When to Scale:
- Different attributes have vastly different units and ranges (e.g., area spans 0-2500 while smoothness is 0-0.3)
- Rescaling accelerates the convergence of optimization algorithms
- Prevents high-magnitude attributes from overwhelming the analysis

### Available Approaches:
1. **Range Scaling (Min-Max)**: Compresses values to [0, 1]
2. **Centering and Scaling (Standardization)**: Transforms to mean 0, variance 1

### 3.1: Min-Max Scaling

**The Equation**:
$$X_{normalized} = \frac{X - X_{min}}{X_{max} - X_{min}}$$

This transformation maps each feature independently onto the [0, 1] interval: its smallest value becomes 0 and its largest becomes 1.

In [None]:
def min_max_normalize(X):
    """
    Apply Min-Max normalization to features.

    Each feature (column) is rescaled independently with
    (x - min) / (max - min), so its values land in [0, 1]. A feature with
    no variation (max == min) is divided by 1 instead of 0 and therefore
    maps to all zeros.

    Parameters:
    X (array-like): Feature matrix of shape (m, n) where m=samples, n=features.
        A 1-D array is also accepted and treated as a single feature.

    Returns:
    X_normalized (np.ndarray): Normalized feature matrix, same shape as X
    min_vals (np.ndarray): Minimum values for each feature (for inverse transform)
    max_vals (np.ndarray): Maximum values for each feature (for inverse transform)
    """
    X = np.asarray(X)

    # Column-wise extremes; for 1-D input these reduce to scalars
    min_vals = np.min(X, axis=0)
    max_vals = np.max(X, axis=0)

    # Avoid division by zero. np.where is used instead of in-place masking
    # because masked assignment fails on the scalar produced by 1-D input.
    ranges = max_vals - min_vals
    safe_ranges = np.where(ranges == 0, 1, ranges)

    X_normalized = (X - min_vals) / safe_ranges

    return X_normalized, min_vals, max_vals

# Scale the full feature matrix, then compare the overall value range
# before and after scaling
X_minmax, X_min, X_max = min_max_normalize(X)

scaled_lo, scaled_hi = X_minmax.min(), X_minmax.max()
raw_lo, raw_hi = X.min(), X.max()

print("Min-Max Normalization Applied!")
print(f"\nNormalized Data Range: [{scaled_lo:.4f}, {scaled_hi:.4f}]")
print(f"Original Data Range: [{raw_lo:.4f}, {raw_hi:.4f}]")

In [None]:
# Spot-check the first three features: raw vs. scaled values for five samples,
# then confirm each scaled column really spans 0 to 1
raw_sample = X[:5, :3]
scaled_sample = X_minmax[:5, :3]

print("Verification - First 3 features (first 5 samples):\n")
print("Original Data:")
print(raw_sample)
print("\nNormalized Data:")
print(scaled_sample)
print("\nFeature-wise Min and Max after normalization:")
for col in range(3):
    scaled_col = X_minmax[:, col]
    print(f"Feature {col+1}: Min={scaled_col.min():.6f}, Max={scaled_col.max():.6f}")

### 3.2: Standardization (Z-score Transformation)

**The Equation**:
$$X_{standardized} = \frac{X - \mu}{\sigma}$$

Here:
- $\mu$ = feature mean
- $\sigma$ = feature standard deviation

Result: each feature is centered at 0 and has unit standard deviation.

In [None]:
def standardize(X):
    """
    Apply Standardization (Z-score normalization) to features.

    Each feature (column) is transformed independently with
    (x - mean) / std, using the population standard deviation
    (np.std's default, ddof=0). A feature with zero standard deviation
    is divided by 1 instead of 0 and therefore maps to all zeros.

    Parameters:
    X (array-like): Feature matrix of shape (m, n).
        A 1-D array is also accepted and treated as a single feature.

    Returns:
    X_standardized (np.ndarray): Standardized feature matrix, same shape as X
    mean_vals (np.ndarray): Mean values for each feature
    std_vals (np.ndarray): Standard deviation values for each feature, with
        zeros replaced by 1 (the divisor actually applied)
    """
    X = np.asarray(X)

    # Column-wise statistics; for 1-D input these reduce to scalars
    mean_vals = np.mean(X, axis=0)
    raw_std = np.std(X, axis=0)

    # Avoid division by zero. np.where is used instead of in-place masking
    # because masked assignment fails on the scalar produced by 1-D input.
    std_vals = np.where(raw_std == 0, 1, raw_std)

    X_standardized = (X - mean_vals) / std_vals

    return X_standardized, mean_vals, std_vals

# Standardize the full feature matrix, then compare the global mean and
# standard deviation (over all entries) before and after
X_standard, X_mean, X_std = standardize(X)

scaled_mean, scaled_std = X_standard.mean(), X_standard.std()
raw_mean, raw_std = X.mean(), X.std()

print("Standardization Applied!")
print(f"\nStandardized Data Mean: {scaled_mean:.6f}")
print(f"Standardized Data Std Dev: {scaled_std:.6f}")
print(f"\nOriginal Data Mean: {raw_mean:.6f}")
print(f"Original Data Std Dev: {raw_std:.6f}")

In [None]:
# Spot-check the first three features: raw vs. standardized values for five
# samples, then confirm each standardized column has mean 0 and std 1
raw_sample = X[:5, :3]
standardized_sample = X_standard[:5, :3]

print("Verification - First 3 features (first 5 samples):\n")
print("Original Data:")
print(raw_sample)
print("\nStandardized Data:")
print(standardized_sample)
print("\nFeature-wise Mean and Std after standardization:")
for col in range(3):
    z_col = X_standard[:, col]
    print(f"Feature {col+1}: Mean={z_col.mean():.6f}, Std={z_col.std():.6f}")

### 3.3: Side-by-Side Method Comparison

In [None]:
# Histogram of the first feature under each scaling, side by side.
# The three panels differ only in data, colour and title, so they are
# described in a table and drawn by one loop.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

panels = [
    (X, 'blue', 'Original Data\n(Feature 1: Mean Radius)'),
    (X_minmax, 'green', 'Min-Max Normalized\n(Range: [0, 1])'),
    (X_standard, 'red', 'Standardized\n(Mean≈0, Std≈1)'),
]

for ax, (matrix, shade, heading) in zip(axes, panels):
    ax.hist(matrix[:, 0], bins=30, color=shade, alpha=0.7, edgecolor='black')
    ax.set_title(heading, fontsize=12, fontweight='bold')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('scaling_comparison.png', dpi=100, bbox_inches='tight')
plt.show()

print("Scaling methods comparison plotted!")

In [None]:
# Tabulate min / max / mean / std of the first feature under each scaling
first_feature_by_method = {
    'Original': X[:, 0],
    'Min-Max Normalized': X_minmax[:, 0],
    'Standardized': X_standard[:, 0],
}

comparison_df = pd.DataFrame(
    {
        method: [column.min(), column.max(), column.mean(), column.std()]
        for method, column in first_feature_by_method.items()
    },
    index=['Min', 'Max', 'Mean', 'Std Dev'],
)

print("\nStatistics Comparison (Feature 1: Mean Radius):")
print(comparison_df.round(4))

---

## A4: Additional Analysis - Class Composition and Feature Relationships (Bonus)

### 4.1: Examining Class Balance

In [None]:
# Plot target distribution as a bar chart (absolute counts) and a pie chart (shares)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

classes = ['Benign (0)', 'Malignant (1)']
# np.bincount only accepts integer input, and pandas may have read the labels
# as floats; cast first. minlength=2 guarantees one bin per class label even
# when a class is absent, so the counts always line up with `classes`.
counts = np.bincount(np.asarray(y).astype(int), minlength=2)
colors = ['#2ecc71', '#e74c3c']

# Count plot
axes[0].bar(classes, counts, color=colors, alpha=0.7, edgecolor='black', linewidth=2)
axes[0].set_title('Class Distribution', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Number of Samples')
axes[0].grid(axis='y', alpha=0.3)
for i, count in enumerate(counts):
    # Write the exact count just above each bar
    axes[0].text(i, count + 5, str(count), ha='center', fontweight='bold')

# Pie chart
axes[1].pie(counts, labels=classes, colors=colors, autopct='%1.1f%%', startangle=90,
            textprops={'fontsize': 11, 'fontweight': 'bold'})
axes[1].set_title('Class Balance Percentage', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('class_distribution.png', dpi=100, bbox_inches='tight')
plt.show()

print("Class distribution visualization completed!")

### 4.2: Looking at Individual Feature Patterns

In [None]:
# One histogram per feature for the first 12 features, on a 3x4 grid
fig, axes = plt.subplots(3, 4, figsize=(16, 10))
axes = axes.ravel()

# The flattened grid holds exactly 12 axes, so axis position == feature column
for col, ax in enumerate(axes):
    ax.hist(X[:, col], bins=30, color='steelblue', alpha=0.7, edgecolor='black')
    ax.set_title(f'{feature_names[col]}', fontsize=10, fontweight='bold')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('feature_distributions.png', dpi=100, bbox_inches='tight')
plt.show()

print("Feature distributions plotted!")

### 4.3: Relationships Between Attributes and Outcome

In [None]:
# Create correlation matrix with target variable
df_with_target = np.column_stack([X, y])
column_names = feature_names + ['target']
df_corr = pd.DataFrame(df_with_target, columns=column_names)

# Signed Pearson correlation of every feature with the target, highest first.
# This signed ordering is kept in correlations_with_target for later use.
correlations_with_target = df_corr.corr()['target'].drop('target').sort_values(ascending=False)

# "Most" and "least" correlated are about strength, i.e. |r|. The tail of the
# signed ordering holds the most NEGATIVE correlations, which can be strong,
# so rank by absolute value for the report. NaN correlations sort last.
strength_order = np.argsort(-correlations_with_target.abs().to_numpy(), kind='stable')
ranked_by_strength = correlations_with_target.iloc[strength_order]

print("Top 10 Features Most Correlated with Target (Malignancy):")
print(ranked_by_strength.head(10))
print("\nBottom 10 Features Least Correlated with Target:")
print(ranked_by_strength.tail(10))

In [None]:
# Horizontal bar chart of each feature's correlation with the target;
# negative correlations are drawn red, everything else green
fig, ax = plt.subplots(figsize=(12, 8))

bar_positions = range(len(correlations_with_target))
colors = np.where(correlations_with_target.values < 0, 'red', 'green').tolist()

ax.barh(
    bar_positions,
    correlations_with_target.values,
    color=colors,
    alpha=0.7,
    edgecolor='black',
)
ax.set_yticks(bar_positions)
ax.set_yticklabels(correlations_with_target.index, fontsize=9)
ax.set_xlabel('Correlation Coefficient', fontweight='bold')
ax.set_title('Feature Correlations with Target (Malignancy)', fontsize=12, fontweight='bold')
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)  # zero reference line
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('feature_correlations.png', dpi=100, bbox_inches='tight')
plt.show()

print("Feature correlations plotted!")

In [None]:
# Pairwise correlation heatmap restricted to the first 15 columns of df_corr
fig, ax = plt.subplots(figsize=(12, 10))

first_15_columns = df_corr.iloc[:, :15]
corr_matrix = first_15_columns.corr()

heatmap_options = dict(
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
sns.heatmap(corr_matrix, **heatmap_options)

ax.set_title('Correlation Matrix - First 15 Features', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=100, bbox_inches='tight')
plt.show()

print("Correlation heatmap plotted!")

### 4.4: Class-Stratified Comparisons

In [None]:
# Box plots for the 6 features with the largest absolute correlation to the
# target, split by class
top_features_indices = correlations_with_target.abs().nlargest(6).index
# The index labels already are the feature names
top_features_names = list(top_features_indices)

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.ravel()

for ax, feature in zip(axes, top_features_indices):
    # Map the feature name back to its column position in X
    feature_idx = feature_names.index(feature)

    benign_data = X[y == 0, feature_idx]
    malignant_data = X[y == 1, feature_idx]

    ax.boxplot([benign_data, malignant_data],
               patch_artist=True, boxprops=dict(facecolor='lightblue'),
               medianprops=dict(color='red', linewidth=2))
    # Label the boxes via the axis rather than boxplot's `labels=` keyword,
    # which Matplotlib 3.9 deprecated in favour of `tick_labels`; setting the
    # tick labels directly works on both older and newer releases.
    ax.set_xticklabels(['Benign', 'Malignant'])
    ax.set_title(feature, fontsize=11, fontweight='bold')
    ax.set_ylabel('Value')
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('class_distributions.png', dpi=100, bbox_inches='tight')
plt.show()

print("Class-wise distributions plotted!")

---

## Completion of Phase A: Data Readiness

The data exploration and preparation stage is finished successfully.

In [None]:
# Final summary of Part A, printed from the live notebook state
banner = "=" * 70

benign_total = (y == 0).sum()
malignant_total = (y == 1).sum()

# df_cleaned still contains the target column, so X should be exactly one
# column narrower. Any further difference means extra (ID-style) columns were
# removed. Computed rather than hard-coded so the report cannot claim a step
# that did not happen.
id_columns_dropped = X.shape[1] < df_cleaned.shape[1] - 1

print(banner)
print("PART A SUMMARY: DATA EXPLORATION & PREPROCESSING")
print(banner)

print("\n✓ DATA LOADING:")
print(f"  - Dataset shape: {df.shape}")
print(f"  - Features: {X.shape[1]}")
print(f"  - Samples: {X.shape[0]}")

print("\n✓ DATA CLEANING:")
print(f"  - Missing values: {df_cleaned.isnull().sum().sum()}")
print(f"  - ID columns dropped: {'Yes' if id_columns_dropped else 'No'}")
print(f"  - Final clean samples: {len(y)}")

print("\n✓ TARGET DISTRIBUTION:")
print(f"  - Benign (0): {benign_total} samples ({benign_total/len(y)*100:.1f}%)")
print(f"  - Malignant (1): {malignant_total} samples ({malignant_total/len(y)*100:.1f}%)")

print("\n✓ FEATURE SCALING IMPLEMENTED:")
print("  - Min-Max Normalization: ✓")
print("    * Range: [0, 1]")
print(f"    * Status: X_minmax array created (shape: {X_minmax.shape})")
print("\n  - Standardization (Z-score): ✓")
print("    * Mean ≈ 0, Std ≈ 1")
print(f"    * Status: X_standard array created (shape: {X_standard.shape})")

print("\n✓ EXPLORATORY DATA ANALYSIS (BONUS):")
print("  - Class balance visualization: ✓")
print("  - Feature distributions: ✓")
print("  - Feature correlations: ✓")
print("  - Class-wise comparisons: ✓")

print("\n" + banner)
print("READY FOR PART B: THE MATHEMATICS OF LOGISTIC REGRESSION")
print(banner)

---

## Important Observations from Data Preparation

### 1. Data Quality Assessment
- Wisconsin Breast Cancer dataset has complete records (no gaps)
- Class composition: roughly 63% Benign, 37% Malignant  

### 2. Feature Value Ranges
- Significant variation exists in feature magnitudes (area: 0-2500 vs smoothness: 0-0.3)
- This heterogeneity makes rescaling necessary prior to optimization algorithms

### 3. Attribute-Outcome Correlations
- Strong associations between specific measurements and disease classification
- Features like "worst radius" and "worst area" are particularly predictive
- These attributes will drive the predictive model performance

### 4. Scaling Strategy Selection
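- **Recommendation for this work**: Use **Standardization (Z-score)** because:
  - Zero-centered, unit-variance features give Gradient Descent a better-conditioned loss surface, so it converges in fewer iterations
  - It is less sensitive to outliers than Min-Max scaling, where a single extreme value compresses all other values into a narrow band
  - Easier interpretation in biological context (each value is a number of standard deviations from the population mean)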

---

## Proceeding to Phase B

Next, we'll create the logistic regression algorithm components:
1. Sigmoid activation function
2. Prediction function
3. Error measure (Binary Cross Entropy)
4. Gradient computation
5. Iterative optimization
6. Training dynamics visualization