In [None]:
# Step 1 ‚Äì Install Required Libraries
# Purpose:
# Install essential ML packages to handle data, train the model, and build a Streamlit app.
# We'll use:
# - scikit-learn for ML algorithms
# - pandas for handling tabular data
# - joblib for saving and loading the trained model
# - streamlit for building an interactive app
# - pyngrok for hosting the app publicly (optional)

# Install required Python packages quietly (no verbose output).
!pip install scikit-learn pandas joblib streamlit pyngrok --quiet


In [None]:
# Step 2 – Import Libraries and Load Dataset
# Purpose:
# Pull in pandas and scikit-learn's bundled Breast Cancer dataset, wrap the data
# in a DataFrame, and take a first look at it.

import pandas as pd
from sklearn.datasets import load_breast_cancer

# Read the bundled dataset (feature matrix, feature names and labels).
data = load_breast_cancer()

# Build the feature table and append the labels as a trailing 'target' column
# (0 = malignant, 1 = benign).
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Report (rows, columns).
print("Dataset Shape:", df.shape)

# Preview the first rows as the cell's rich output.
df.head()


In [None]:
# Step 3 – Exploratory Data Analysis (EDA)
# Purpose:
# Inspect summary statistics, missing values and the class balance of `df`.

# Per-column summary statistics (count, mean, std, min, quartiles, max).
summary_stats = df.describe()
print(summary_stats)

# Number of missing entries in each column.
missing_per_column = df.isnull().sum()
print("\nMissing Values:\n", missing_per_column)

# How many rows carry each label.
print("\nClass Distribution (0 = Malignant, 1 = Benign):")
class_counts = df['target'].value_counts()
print(class_counts)


In [None]:
# Step 4 – Pick five mean-value features and create the train/test split.
# Purpose:
# Restrict the notebook's model to a small, fixed set of input columns and hold
# out 20% of the rows for evaluation.

from sklearn.model_selection import train_test_split

# Columns the notebook's model is trained on.
selected_features = [
    'mean radius',
    'mean texture',
    'mean perimeter',
    'mean area',
    'mean smoothness',
]

# Feature matrix (only the selected columns) and label vector.
X = df[selected_features]
y = df['target']

# 80/20 split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Confirm the shapes of both partitions.
for label, part in (("Training set shape:", X_train), ("Test set shape:", X_test)):
    print(label, part.shape)



In [None]:
# Step 5 – Standardize numeric features.
# Purpose:
# Scale the features to zero mean and unit standard deviation, which helps the
# model converge faster.

# Import StandardScaler for normalization.
from sklearn.preprocessing import StandardScaler

# Initialize the scaler.
scaler = StandardScaler()

# Fit the scaler on the training data only, then transform it.
X_train_scaled = scaler.fit_transform(X_train)

# Apply the training-set statistics to the test data (no refitting).
X_test_scaled = scaler.transform(X_test)

# Print confirmation message (the check-mark was previously mis-encoded).
print("✅ Feature scaling completed.")



In [None]:
# Step 6 – Train and Evaluate Model
# Purpose:
# Fit a Logistic Regression model to the scaled training data and evaluate its performance.

# Import LogisticRegression from scikit-learn.
from sklearn.linear_model import LogisticRegression

# Initialize the model with a higher iteration limit to ensure convergence.
model = LogisticRegression(max_iter=500, random_state=42)

# Train the model on the scaled training data.
model.fit(X_train_scaled, y_train)

# Use the model to predict labels for the held-out test set.
y_pred = model.predict(X_test_scaled)

# Import metrics used to check performance.
from sklearn.metrics import accuracy_score, classification_report

# Fraction of test rows predicted correctly.
accuracy = accuracy_score(y_test, y_pred)

# Print performance results (the check-mark was previously mis-encoded).
print("✅ Model training completed successfully!")
print("Accuracy on test data:", round(accuracy, 4))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
# Step 7 – Persist the trained model and its scaler.
# Purpose:
# Write the fitted Logistic Regression model and the fitted scaler to disk so
# they can be reloaded later.

import joblib

# Target file name -> fitted object, written in this order.
artifacts = {
    "breast_cancer_model.pkl": model,
    "scaler.pkl": scaler,
}
for path, fitted in artifacts.items():
    joblib.dump(fitted, path)

# Print confirmation.
print("Model and scaler saved successfully!")
print("Model file: breast_cancer_model.pkl")
print("Scaler file: scaler.pkl")



In [None]:
# Step 8 ‚Äì Build Streamlit App for Prediction
# Purpose:
# Create an interactive app where users can enter input features and get a prediction of cancer type.
# Write everything into a file named app.py
%%writefile app.py

# Import Streamlit for building the web app UI
import streamlit as st

# Import numpy for numeric array handling
import numpy as np

# Import pickle for loading model and scaler files
import pickle

# Import dataset loader to access the breast cancer dataset
from sklearn.datasets import load_breast_cancer

# Import StandardScaler for feature normalization
from sklearn.preprocessing import StandardScaler

# Import LogisticRegression for classification
from sklearn.linear_model import LogisticRegression


# --------------------------
# TRAIN MODEL (cached: fitted once, reused on every rerun)
# --------------------------

@st.cache_resource
def load_artifacts():
    """Load the dataset and fit the scaler and classifier used by the app.

    Streamlit re-executes this script on every widget interaction, so the
    fitting is wrapped in ``st.cache_resource``: the first run does the work
    and later reruns receive the same cached objects.

    Nothing is written to disk. An earlier version dumped the fitted objects to
    ``breast_cancer_model.pkl`` / ``scaler.pkl`` and immediately reloaded them;
    those are the file names the notebook's Step 7 uses for its own model and
    scaler (fitted on 5 features), so running the app silently replaced them
    with objects fitted on all 30 features.

    Returns:
        tuple: ``(dataset, fitted_scaler, fitted_model)`` where ``dataset`` is
        the object returned by ``load_breast_cancer()``, ``fitted_scaler`` is a
        ``StandardScaler`` fitted on every feature column, and ``fitted_model``
        is a ``LogisticRegression`` fitted on the scaled features.
    """
    dataset = load_breast_cancer()

    # Normalize all feature columns, then fit the classifier on the result.
    fitted_scaler = StandardScaler()
    features_scaled = fitted_scaler.fit_transform(dataset.data)

    fitted_model = LogisticRegression(max_iter=10000)
    fitted_model.fit(features_scaled, dataset.target)

    return dataset, fitted_scaler, fitted_model


# --------------------------
#  LOAD MODEL AND SCALER
# --------------------------

# Same module-level names as before, so the rest of the script is unaffected.
data, scaler, model = load_artifacts()

# Split data into X (features) and y (target labels)
X, y = data.data, data.target


# --------------------------
# Streamlit UI
# --------------------------

# App title
st.title(" Breast Cancer Prediction App")

# Short intro text (the apostrophe in the second sentence was previously mis-encoded)
st.write("This app predicts whether a breast tumor is **Benign (1)** or **Malignant (0)** based on cell measurements.")
st.write("Enter your values below you’ll also see which range is considered **safe** or **risky** based on medical data patterns.")

# Add a section header for inputs
st.markdown("###  Enter The Values:")


# --------------------------
# Helper function to check safe ranges
# --------------------------

def check_range(value, safe_min, safe_max, feature_name):
    """Render a line saying whether ``value`` lies inside ``[safe_min, safe_max]``.

    Args:
        value: The number entered for the feature.
        safe_min: Lower bound of the range treated as safe (inclusive).
        safe_max: Upper bound of the range treated as safe (inclusive).
        feature_name: Label shown in bold at the start of the message.

    Returns:
        None. The message is emitted through ``st.markdown``.
    """
    # Strictly below or strictly above the bounds counts as risky; the bounds themselves are safe.
    if value < safe_min or value > safe_max:
        st.markdown(f" **{feature_name}** is outside safe range ({safe_min}–{safe_max})🔴 *Risky*")
    else:
        st.markdown(f" **{feature_name}** is within safe range ({safe_min}–{safe_max})🟢 *Safe*")


# --------------------------
# User Inputs (10 key features)
# --------------------------

# One row per measurement:
# (label, input minimum, input maximum, default value, safe-range low, safe-range high)
_INPUT_SPECS = [
    ("Mean Radius", 5.0, 30.0, 14.0, 10, 15),
    ("Mean Texture", 5.0, 40.0, 20.0, 10, 22),
    ("Mean Perimeter", 40.0, 200.0, 90.0, 60, 110),
    ("Mean Area", 100.0, 2500.0, 600.0, 400, 800),
    ("Mean Smoothness", 0.05, 0.2, 0.10, 0.07, 0.10),
    ("Mean Compactness", 0.0, 1.0, 0.15, 0.05, 0.18),
    ("Mean Concavity", 0.0, 1.0, 0.20, 0.00, 0.18),
    ("Mean Concave Points", 0.0, 0.5, 0.10, 0.00, 0.10),
    ("Mean Symmetry", 0.1, 0.4, 0.18, 0.14, 0.22),
    ("Mean Fractal Dimension", 0.02, 0.1, 0.06, 0.05, 0.07),
]


def _ask_for(label, lowest, highest, default, safe_low, safe_high):
    """Show one numeric input followed by its safe-range message; return the entered value."""
    entered = st.number_input(label, lowest, highest, default)
    check_range(entered, safe_low, safe_high, label)
    return entered


# Widgets are created in table order; the values are bound to the same names as before.
(
    mean_radius,
    mean_texture,
    mean_perimeter,
    mean_area,
    mean_smoothness,
    mean_compactness,
    mean_concavity,
    mean_concave_points,
    mean_symmetry,
    mean_fractal_dimension,
) = [_ask_for(*spec) for spec in _INPUT_SPECS]


# --------------------------
# Prepare full 30-feature input
# --------------------------

# The 10 values entered above, in dataset column order (columns 0-9).
user_features = [
    mean_radius, mean_texture, mean_perimeter, mean_area, mean_smoothness,
    mean_compactness, mean_concavity, mean_concave_points, mean_symmetry, mean_fractal_dimension
]

# Columns 10-29 are not collected from the user; use each column's dataset mean instead.
remaining_means = data.data[:, 10:].mean(axis=0)

# Join both parts into a single row: 1 sample × 30 features.
input_data = np.concatenate(
    [np.asarray(user_features, dtype=float), remaining_means]
).reshape(1, 30)

# Scale the row with the fitted StandardScaler.
scaled_data = scaler.transform(input_data)


# --------------------------
# Predict Button and Output
# --------------------------

# When Predict button is clicked
if st.button("🔍 Predict"):
    # Class probabilities for the single input row; columns follow model.classes_
    # (label 0 = malignant, label 1 = benign, matching the branches below).
    prob = model.predict_proba(scaled_data)[0]
    # Predict the most likely class (0 or 1)
    prediction = model.predict(scaled_data)[0]
    # Probability of the predicted class, as a percentage
    confidence = round(float(max(prob)) * 100, 2)
    # Probability (in %) of the malignant class. The risk tiers below are keyed on
    # this value rather than on `confidence`: `confidence` is high for a confident
    # *benign* prediction too, which used to be reported as "High Risk".
    malignant_pct = float(prob[list(model.classes_).index(0)]) * 100

    # Show result: if class = 1 → Benign
    if prediction == 1:
        st.success(f"🟩 Prediction: **Benign (Non-cancerous)** — Confidence: {confidence}%")
        st.write("""
        ### Meaning:
        - Tumor cells look **normal and non-invasive**.
        - Usually **not life-threatening**, but follow-up tests are a good practice.
        - The cell size and texture fall mostly within safe biological ranges.
        """)
    # Else class = 0 → Malignant
    else:
        st.error(f"🟥 Prediction: **Malignant (Cancerous)** — Confidence: {confidence}%")
        st.write("""
        ### Meaning:
        - Tumor cells appear **abnormal, larger, and irregular**.
        - Indicates **possible cancerous growth**.
        - Please consult a doctor for **biopsy and further medical evaluation**.
        """)

    # --------------------------
    # Confidence-level interpretation
    # --------------------------

    # Add subheader for interpretation
    st.subheader("Confidence Level Analysis:")

    # Progress bar showing the confidence of the predicted class (0-100)
    st.progress(int(confidence))

    # Risk tier from the malignant-class probability (same 60 / 80 thresholds as before)
    if malignant_pct < 60:
        st.info("🟢 **Low Risk:** Model is not strongly confident of malignancy. Likely benign or safe zone.")
    elif malignant_pct < 80:
        st.warning("🟠 **Medium Risk:** Some patterns slightly match malignant cases — borderline, uncertain.")
    else:
        st.error("🔴 **High Risk:** Strong malignant pattern detected. Please seek medical consultation immediately.")

    # Divider for readability
    st.markdown("---")

    # --------------------------
    #  Educational section
    # --------------------------
    st.markdown("""
    ### How this AI works:
    - Dataset: **Breast Cancer Wisconsin Dataset** (569 samples, 30 features)
    - Algorithm: **Logistic Regression**
    - The model analyzes statistical measurements like cell size, texture, compactness, etc.
    - Output: probability (0–100%) of being **malignant or benign**.
    """)

# Add a horizontal line and disclaimer (the warning sign was previously mis-encoded)
st.write("---")
st.caption("⚠️ Note: This tool is for **educational purposes only** and not a substitute for medical diagnosis.")







In [None]:
# Cell purpose:
# Start a Streamlit app and open it publicly with ngrok.
# A new tunnel is created on every run, so the public URL changes each time.
#
# SECURITY: the ngrok authtoken is a credential and must not be stored in the notebook.
# It is read from the NGROK_AUTHTOKEN environment variable, or prompted for if unset.
# A token that was previously committed in this cell should be revoked in the
# ngrok dashboard and replaced.

import os
import time
from getpass import getpass

from pyngrok import ngrok

# Your ngrok authtoken (from the environment, otherwise entered interactively without echo)
NGROK_AUTHTOKEN = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
if not NGROK_AUTHTOKEN:
    raise RuntimeError("An ngrok authtoken is required: set NGROK_AUTHTOKEN or enter it when prompted.")

# Authenticate ngrok session
ngrok.set_auth_token(NGROK_AUTHTOKEN)

# Stop any existing tunnels and Streamlit processes (errors are ignored if none are running)
ngrok.kill()
os.system("pkill streamlit >/dev/null 2>&1")
os.system("fuser -k 8501/tcp >/dev/null 2>&1")

# Start Streamlit on port 8501 in the background
os.system("streamlit run app.py --server.port 8501 &")

# Give Streamlit time to boot
time.sleep(8)

# Create a new public tunnel to port 8501 (random URL each run)
public_url = ngrok.connect(8501)

# Show the link
print("✅ Your Streamlit app is live at:")
print(public_url.public_url)



| Feature             | Description                                             | Higher Value Means                         |
| ------------------- | ------------------------------------------------------- | ------------------------------------------ |
| **Mean Radius**     | Average size of the cell nuclei in the tumor            | Larger radius → possibly cancerous         |
| **Mean Texture**    | Variation in the gray-scale texture (cell irregularity) | Higher → more irregular cells              |
| **Mean Perimeter**  | Total distance around the tumor cell cluster            | Higher → cells are bigger/less uniform     |
| **Mean Area**       | Total cell area                                         | Larger → more abnormal cell growth         |
| **Mean Smoothness** | Variation in radius lengths (surface irregularity)      | Higher → less smooth, more irregular cells |
