# Breast Cancer Prediction Model Development
This notebook develops a machine learning model to predict whether a tumor is benign or malignant.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings('ignore')

## 2. Load the Breast Cancer Wisconsin Dataset

In [None]:
# Fetch the Breast Cancer Wisconsin data bundled with scikit-learn and expose
# it as a DataFrame: one column per feature plus a 'diagnosis' column that
# holds the target labels.
data = load_breast_cancer()
df = pd.DataFrame(data=data.data, columns=data.feature_names).assign(
    diagnosis=data.target
)

# Quick sanity check of what was loaded
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
df.head()

In [None]:
# Count the NaN entries column by column once, then reuse that Series for
# both the per-column listing and the grand total.
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column)
print("\nTotal missing values:", missing_per_column.sum())

In [None]:
# Class balance of the target column: number of rows carrying each label
diagnosis = df['diagnosis']
print("Target Distribution:")
print("0 (Malignant):", diagnosis.eq(0).sum())
print("1 (Benign):", diagnosis.eq(1).sum())

## 3. Feature Selection
Select five input features (all mean-value measurements) from the specified list of candidate features.

In [None]:
# The five input columns the model is trained on
selected_features = [
    'mean radius',
    'mean texture',
    'mean perimeter',
    'mean area',
    'mean smoothness',
]

# Feature matrix (selected columns only) and target vector (diagnosis labels)
X = df.loc[:, selected_features]
y = df['diagnosis']

print("Selected features:", selected_features)
print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)

## 4. Data Preprocessing

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions approximately equal in both partitions, and the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))

In [None]:
# Standardise the features. The scaler's statistics are learned from the
# training rows only and then applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully")
print("Mean of scaled features (should be ~0):", X_train_scaled.mean(axis=0))
print("Std of scaled features (should be ~1):", X_train_scaled.std(axis=0))

## 5. Model Training

In [None]:
# Fit a logistic-regression classifier on the standardised training data.
# random_state is fixed for repeatability; max_iter raises the solver's
# iteration cap to 10000.
model = LogisticRegression(max_iter=10000, random_state=42).fit(
    X_train_scaled, y_train
)

print("Model trained successfully")

## 6. Model Evaluation

In [None]:
def _summarise(y_true, y_pred):
    """Return (accuracy, precision, recall, F1) for one set of predictions."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )


# Predict on both partitions so training and testing scores can be compared
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# NOTE: precision/recall/F1 are called with their defaults, so they are
# computed with label 1 (Benign in this dataset) as the positive class.
train_accuracy, train_precision, train_recall, train_f1 = _summarise(
    y_train, y_train_pred
)
test_accuracy, test_precision, test_recall, test_f1 = _summarise(
    y_test, y_test_pred
)

banner = "=" * 50

print(banner)
print("TRAINING SET PERFORMANCE")
print(banner)
print(f"Accuracy:  {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall:    {train_recall:.4f}")
print(f"F1-Score:  {train_f1:.4f}")

print("\n" + banner)
print("TESTING SET PERFORMANCE")
print(banner)
print(f"Accuracy:  {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall:    {test_recall:.4f}")
print(f"F1-Score:  {test_f1:.4f}")

In [None]:
# Per-class precision / recall / F1 on the held-out test set. target_names
# supplies the display names for labels 0 and 1 respectively.
print("\nDetailed Classification Report:")
test_report = classification_report(
    y_test,
    y_test_pred,
    target_names=['Malignant', 'Benign'],
)
print(test_report)

In [None]:
# Confusion matrix on the test set (rows = actual label, columns = predicted)
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_test_pred)
print(cm)

# Flatten the 2x2 matrix row by row into its four cells. Label 0 (Malignant)
# plays the "negative" role and label 1 (Benign) the "positive" role here.
true_neg, false_pos, false_neg, true_pos = cm.ravel()

print("\nInterpretation:")
print(f"True Negatives (Malignant correctly predicted): {true_neg}")
print(f"False Positives (Malignant predicted as Benign): {false_pos}")
print(f"False Negatives (Benign predicted as Malignant): {false_neg}")
print(f"True Positives (Benign correctly predicted): {true_pos}")

## 7. Save the Model

In [None]:
# Bundle everything needed at inference time -- the fitted classifier, the
# fitted scaler and the ordered feature names -- into one artifact on disk.
model_data = dict(model=model, scaler=scaler, features=selected_features)

joblib.dump(model_data, 'breast_cancer_model.pkl')
print("Model saved successfully as 'breast_cancer_model.pkl'")

## 8. Demonstrate Model Reloading and Prediction

In [None]:
# Read the saved bundle back from disk and unpack its three entries.
# NOTE: joblib.load unpickles the file, so only load artifacts that come from
# a trusted source.
loaded_model_data = joblib.load('breast_cancer_model.pkl')
loaded_model, loaded_scaler, loaded_features = (
    loaded_model_data[key] for key in ('model', 'scaler', 'features')
)

print("Model reloaded successfully")
print("Features used:", loaded_features)

In [None]:
# Test prediction with sample data
sample_data = X_test.iloc[0:5]  # Take first 5 samples from test set

# Look the true labels up by row index so they stay aligned with sample_data
# even if the slice above is changed to a different set of rows.
sample_actual = y_test.loc[sample_data.index]

print("\nSample Input Data:")
print(sample_data)

# Scale the sample data, selecting the columns in the order that was stored
# alongside the model so the scaler sees the layout it was fitted on.
sample_data_scaled = loaded_scaler.transform(sample_data[loaded_features])

# Make predictions
predictions = loaded_model.predict(sample_data_scaled)
prediction_proba = loaded_model.predict_proba(sample_data_scaled)

# predict_proba columns follow loaded_model.classes_; map each class label to
# its column instead of assuming the label value is itself the column index.
proba_column = {label: col for col, label in enumerate(loaded_model.classes_)}

print("\nPredictions:")
for number, (pred, proba, actual_label) in enumerate(
    zip(predictions, prediction_proba, sample_actual), start=1
):
    result = "Benign" if pred == 1 else "Malignant"
    confidence = proba[proba_column[pred]] * 100
    actual = "Benign" if actual_label == 1 else "Malignant"
    print(f"Sample {number}: Predicted = {result} (Confidence: {confidence:.2f}%), Actual = {actual}")

## Summary

**Algorithm Used:** Logistic Regression

**Selected Features:**
- mean radius
- mean texture
- mean perimeter
- mean area
- mean smoothness

**Model Persistence:** Joblib

The model has been successfully trained, evaluated, and saved. It can be reloaded and used for making predictions on new data.