## 1. Import Libraries

In [None]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn modules
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)

# Model persistence
import joblib
import json

# Suppress warnings
# NOTE(review): no category or module is given, so this filter hides every
# warning raised for the rest of the session, not only cosmetic ones.
import warnings
warnings.filterwarnings('ignore')

# Simple marker that the import cell ran to completion
print("‚úÖ Libraries imported successfully!")

## 2. Load and Explore Dataset

In [None]:
# Fetch the Wisconsin Breast Cancer dataset that ships with scikit-learn
data = load_breast_cancer()

# One column per feature, with the class label appended as `target`
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

print("üìä Dataset Shape:", df.shape)
print("\nüìã Feature Names:")
# Number the features from 1 for readability
for position, feature_name in enumerate(data.feature_names, start=1):
    print(f"  {position}. {feature_name}")

# Show how the integer labels map onto the dataset's own class names
label_zero_name, label_one_name = data.target_names[0], data.target_names[1]
print("\nüéØ Target Classes:")
print(f"  0 = {label_zero_name} (Malignant)")
print(f"  1 = {label_one_name} (Benign)")

In [None]:
# Peek at the first five records; the bare last expression gets rich display
print("\nüìù First 5 rows of the dataset:")
df.head(n=5)

In [None]:
# Dataset statistics
# `describe()` is the cell's last expression, so the notebook renders the
# summary table (including the `target` column) as rich output.
print("üìà Dataset Statistics:")
df.describe()

In [None]:
# Total number of missing cells across the whole frame
total_missing = df.isnull().sum().sum()
print("\nüîç Missing Values:")
print(total_missing, "missing values found")

# Class balance: raw counts first, then count and percentage per class
is_benign = df['target'] == 1
is_malignant = df['target'] == 0

print("\nüìä Target Distribution:")
print(df['target'].value_counts())
print(f"\nBenign: {is_benign.sum()} ({is_benign.mean()*100:.1f}%)")
print(f"Malignant: {is_malignant.sum()} ({is_malignant.mean()*100:.1f}%)")

## 3. Data Visualization

In [None]:
# Target distribution plot (pie + bar).
#
# The counts are pulled in an explicit class order (1 = benign, 0 = malignant)
# instead of relying on value_counts() happening to sort the majority class
# first, so the hard-coded labels below always line up with the data.
# The palette follows the per-class histograms further down the notebook:
# malignant = red (#ff6b6b), benign = teal (#4ecdc4).
class_order = [1, 0]
class_counts = df['target'].value_counts().reindex(class_order, fill_value=0)
colors = ['#4ecdc4', '#ff6b6b']  # same order as class_order: benign, malignant

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Pie chart: share of each class
axes[0].pie(class_counts, labels=['Benign', 'Malignant'],
            autopct='%1.1f%%', colors=colors, explode=(0.05, 0))
axes[0].set_title('Target Distribution', fontsize=14, fontweight='bold')

# Bar chart: absolute number of samples per class
class_counts.plot(kind='bar', ax=axes[1], color=colors)
axes[1].set_title('Target Class Count', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Class')
axes[1].set_ylabel('Count')
axes[1].set_xticklabels(['Benign (1)', 'Malignant (0)'], rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the "mean" features and the class label
mean_features = [column for column in df.columns if 'mean' in column]
correlation_matrix = df[mean_features + ['target']].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='RdYlBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap (Mean Features)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Overlaid histograms of each "mean" feature, one panel per feature,
# split by diagnosis so class separation is visible at a glance
class_styles = [(0, '#ff6b6b', 'Malignant'), (1, '#4ecdc4', 'Benign')]

fig, axes = plt.subplots(2, 5, figsize=(20, 8))
axes = axes.flatten()

for idx, feature in enumerate(mean_features):
    panel = axes[idx]
    for target_val, color, label in class_styles:
        class_values = df.loc[df['target'] == target_val, feature]
        panel.hist(class_values, alpha=0.6, bins=20, color=color, label=label)
    panel.set_title(feature.replace('_', ' ').title(), fontsize=10)
    panel.legend(fontsize=8)

# y > 1 lifts the figure title above the top row of panel titles
plt.suptitle('Feature Distributions by Target Class', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

## 4. Data Preprocessing

In [None]:
# The label column is the target; every other column is a feature
y = df['target']
X = df.drop(columns='target')

# Hold out 20% for testing; stratifying on y keeps the class ratio in both splits
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"üìä Training set size: {n_train} samples")
print(f"üìä Testing set size: {n_test} samples")

In [None]:
# Standardise features. The scaler learns its statistics from the training
# split only and is then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check over the whole scaled training matrix
scaled_mean = X_train_scaled.mean()
scaled_std = X_train_scaled.std()

print("‚úÖ Feature scaling completed!")
print(f"\nüìà Scaled feature statistics:")
print(f"  Mean: {scaled_mean:.6f} (should be ~0)")
print(f"  Std:  {scaled_std:.6f} (should be ~1)")

## 5. Model Training & Comparison

In [None]:
# Candidate classifiers, all seeded with the same random_state
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(probability=True, random_state=42)
}

# name -> fitted estimator plus its test-set and cross-validation scores
results = {}

print("üîÑ Training and evaluating models...\n")
print("=" * 70)

for name, model in models.items():
    # Fit on the scaled training split
    model.fit(X_train_scaled, y_train)

    # Hard labels and class-1 probabilities for the held-out split
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Test-set metrics; AUC is computed from probabilities, the rest from labels
    test_scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_pred_proba),
    }

    # 5-fold cross-validation on the scaled training split
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)

    entry = {
        'model': model,
        **test_scores,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std()
    }
    results[name] = entry

    print(f"\nüìä {name}")
    print("-" * 40)
    print(f"  Accuracy:  {entry['accuracy']:.4f}")
    print(f"  Precision: {entry['precision']:.4f}")
    print(f"  Recall:    {entry['recall']:.4f}")
    print(f"  F1 Score:  {entry['f1']:.4f}")
    print(f"  AUC-ROC:   {entry['auc']:.4f}")
    # Spread is reported as two standard deviations
    print(f"  CV Score:  {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

print("\n" + "=" * 70)

In [None]:
# Tabulate the test-set metrics: display column name -> key in `results`
metric_keys = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1 Score': 'f1',
    'AUC-ROC': 'auc',
}
metrics_df = pd.DataFrame({
    'Model': list(results.keys()),
    **{column: [r[key] for r in results.values()] for column, key in metric_keys.items()}
})

# Grouped bar chart: one group per model, one bar per metric
metrics = list(metric_keys)
colors = ['#3498db', '#2ecc71', '#f39c12', '#e74c3c', '#9b59b6']
width = 0.15
x = np.arange(len(metrics_df))

fig, ax = plt.subplots(figsize=(12, 6))
for slot, (metric, color) in enumerate(zip(metrics, colors)):
    # Each metric is shifted right by one bar width within the group
    ax.bar(x + slot * width, metrics_df[metric], width, label=metric, color=color)

ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
# Centre the tick under the middle (third) bar of each five-bar group
ax.set_xticks(x + width * 2)
ax.set_xticklabels(metrics_df['Model'], rotation=15)
# The y-axis is cut at 0.8 to make small differences visible
ax.set_ylim(0.8, 1.02)
ax.grid(axis='y', alpha=0.3)
ax.legend(loc='lower right')

plt.tight_layout()
plt.show()

In [None]:
# ROC curve for every fitted model on the held-out split
colors = ['#3498db', '#2ecc71', '#f39c12', '#e74c3c']

fig, ax = plt.subplots(figsize=(10, 8))

for (name, result), color in zip(results.items(), colors):
    model = result['model']
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    # The AUC in the legend is the value stored when the model was evaluated
    curve_label = f'{name} (AUC = {result["auc"]:.4f})'
    ax.plot(fpr, tpr, color=color, lw=2, label=curve_label)

# Diagonal reference line for a classifier with no skill
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier')
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curves Comparison', fontsize=14, fontweight='bold')
ax.legend(loc='lower right')
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## 6. Select Best Model & Hyperparameter Tuning

In [None]:
# Rank the candidates by test-set F1; on ties the first one inserted wins
best_model_name, best_result = max(results.items(), key=lambda item: item[1]['f1'])
print(f"üèÜ Best performing model: {best_model_name}")
print(f"   F1 Score: {best_result['f1']:.4f}")

In [None]:
# Final model: a fresh Logistic Regression with the same settings used in the
# comparison above. No hyperparameter search is run in this notebook.
print("üèÜ Using Logistic Regression as the final model (best performer)")

# fit() returns the estimator itself, so construction and training can be chained
final_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

print("‚úÖ Model trained successfully!")

In [None]:
# Evaluate the final model on the held-out test set.
#
# `final_model` is the Logistic Regression fitted in the previous cell. This
# notebook never runs a grid search, so the earlier assignment from
# `grid_search.best_estimator_` raised NameError on a fresh kernel and has
# been removed.
y_pred_final = final_model.predict(X_test_scaled)
# Column 1 of predict_proba is the probability of class 1
y_pred_proba_final = final_model.predict_proba(X_test_scaled)[:, 1]

print("\nüìä Final Model Performance on Test Set:")
print("=" * 50)
print(f"  Accuracy:  {accuracy_score(y_test, y_pred_final):.4f}")
print(f"  Precision: {precision_score(y_test, y_pred_final):.4f}")
print(f"  Recall:    {recall_score(y_test, y_pred_final):.4f}")
print(f"  F1 Score:  {f1_score(y_test, y_pred_final):.4f}")
# AUC uses the probabilities, not the thresholded labels
print(f"  AUC-ROC:   {roc_auc_score(y_test, y_pred_proba_final):.4f}")

In [None]:
# Confusion matrix for the final model; rows are actual, columns predicted.
# Labels are listed in ascending label order: 0 = Malignant, 1 = Benign.
class_names = ['Malignant', 'Benign']
cm = confusion_matrix(y_test, y_pred_final)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('Actual', fontsize=12)
ax.set_title('Confusion Matrix - Final Model', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Per-class precision / recall / F1 as text
print("\nüìã Classification Report:")
print(classification_report(y_test, y_pred_final, target_names=class_names))

## 7. Feature Importance Analysis

In [None]:
# Rank features by the absolute size of their Logistic Regression coefficient.
# The sign is discarded, so this shows strength of influence, not direction.
coefficient_magnitudes = np.abs(final_model.coef_[0])
feature_importance = (
    pd.DataFrame({
        'feature': data.feature_names,
        'importance': coefficient_magnitudes
    })
    .sort_values('importance', ascending=False)
)

# Horizontal bar chart of the 15 largest coefficients
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))
colors = plt.cm.RdYlGn(np.linspace(0.2, 0.8, len(top_features)))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_features['importance'], color=colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Coefficient Magnitude', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.set_title('Top 15 Feature Importances (Logistic Regression)', fontsize=14, fontweight='bold')
# Flip the axis so the most important feature sits at the top
ax.invert_yaxis()
plt.tight_layout()
plt.show()

## 8. Save Model and Artifacts

In [None]:
# Collect per-feature statistics alongside the fitted scaler's parameters.
# The feature_* entries are computed over the full feature matrix X, whereas
# scaler_mean / scaler_scale come from the scaler fitted on the training split.
feature_names = list(data.feature_names)
dataset_stats = dict(
    feature_names=feature_names,
    feature_means=X.mean().to_dict(),
    feature_stds=X.std().to_dict(),
    feature_mins=X.min().to_dict(),
    feature_maxs=X.max().to_dict(),
    scaler_mean=scaler.mean_.tolist(),
    scaler_scale=scaler.scale_.tolist(),
)

print("üìä Dataset Statistics Calculated")
print(f"   Number of features: {len(feature_names)}")

In [None]:
# Persist the fitted estimator to the working directory
joblib.dump(final_model, 'breast_cancer_model.joblib')
print("‚úÖ Model saved: breast_cancer_model.joblib")

# Persist the fitted scaler so inference can apply the same transformation
joblib.dump(scaler, 'scaler.joblib')
print("‚úÖ Scaler saved: scaler.joblib")

# Write the statistics dictionary as indented JSON
with open('dataset_stats.json', 'w') as stats_file:
    stats_file.write(json.dumps(dataset_stats, indent=2))
print("‚úÖ Dataset statistics saved: dataset_stats.json")

In [None]:
# Round-trip check: reload the saved artifacts and score one test sample
print("\nüîç Verifying saved model...")

loaded_model = joblib.load('breast_cancer_model.joblib')
loaded_scaler = joblib.load('scaler.joblib')

# A one-row slice keeps the sample two-dimensional, as the scaler expects
test_sample = X_test.iloc[:1]
test_sample_scaled = loaded_scaler.transform(test_sample)
prediction = loaded_model.predict(test_sample_scaled)
probability = loaded_model.predict_proba(test_sample_scaled)

predicted_label = 'Benign' if prediction[0] == 1 else 'Malignant'
actual_label = 'Benign' if y_test.iloc[0] == 1 else 'Malignant'
# Confidence is the larger of the two class probabilities
confidence_pct = max(probability[0]) * 100

print(f"\nüìù Test Prediction:")
print(f"   Prediction: {predicted_label}")
print(f"   Confidence: {confidence_pct:.2f}%")
print(f"   Actual: {actual_label}")

print("\n‚úÖ Model verification successful!")

## 9. Download Model Files

Run the cell below to download the model files to your local machine.

In [None]:
# Offer the saved artifacts as browser downloads when running in Google Colab;
# outside Colab the import fails and the cell just lists where the files are.
try:
    from google.colab import files

    print("üì• Downloading model files...")
    for artifact_name in ('breast_cancer_model.joblib', 'scaler.joblib', 'dataset_stats.json'):
        files.download(artifact_name)
    print("\n‚úÖ All files downloaded successfully!")
    print("\nüìÅ Place these files in your backend/model/ directory")
except ImportError:
    print("‚ö†Ô∏è Not running in Google Colab.")
    print("üìÅ Files are saved in the current directory:")
    print("   - breast_cancer_model.joblib")
    print("   - scaler.joblib")
    print("   - dataset_stats.json")

## 10. Model Summary

In [None]:
# Closing report: dataset size, model type, test-set metrics and saved files
banner = "="*60

# Recompute the final test-set metrics once, up front
final_accuracy = accuracy_score(y_test, y_pred_final)
final_precision = precision_score(y_test, y_pred_final)
final_recall = recall_score(y_test, y_pred_final)
final_f1 = f1_score(y_test, y_pred_final)
final_auc = roc_auc_score(y_test, y_pred_proba_final)

n_samples = len(df)
n_features = len(data.feature_names)

print(banner)
print("           ü©∫ BREAST CANCER DETECTION MODEL SUMMARY")
print(banner)
print(f"\nüìä Dataset: Wisconsin Breast Cancer Dataset")
print(f"   - Total samples: {n_samples}")
print(f"   - Features: {n_features}")
print(f"   - Classes: Benign (1), Malignant (0)")
print(f"\nü§ñ Model: Logistic Regression")
print(f"\nüìà Performance Metrics:")
print(f"   - Accuracy:  {final_accuracy:.4f}")
print(f"   - Precision: {final_precision:.4f}")
print(f"   - Recall:    {final_recall:.4f}")
print(f"   - F1 Score:  {final_f1:.4f}")
print(f"   - AUC-ROC:   {final_auc:.4f}")
print(f"\nüíæ Saved Files:")
print(f"   - breast_cancer_model.joblib (trained model)")
print(f"   - scaler.joblib (feature scaler)")
print(f"   - dataset_stats.json (feature statistics)")
print("\n" + banner)
print("‚úÖ Training complete! Model ready for deployment.")
print(banner)