In [2]:
# ==========================================
# END-TO-END ML PIPELINE (CUSTOM DATASET)
# ==========================================

import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# ---------------------------
# 1. Load Dataset
# ---------------------------
df = pd.read_csv("/content/breast-cancer.csv")

print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())

# ---------------------------
# 2. Detect Target Column
# ---------------------------
possible_targets = ["target","label","diagnosis","class","output"]

target_col = None
for col in possible_targets:
    if col in df.columns:
        target_col = col
        break

# if not found â†’ assume last column is target
if target_col is None:
    target_col = df.columns[-1]

print("Target Column Detected:", target_col)

# ---------------------------
# 3. Split Features & Target
# ---------------------------
X = df.drop(columns=[target_col])
y = df[target_col]

# ---------------------------
# 4. Detect Column Types
# ---------------------------
num_features = X.select_dtypes(include=["int64","float64"]).columns.tolist()
cat_features = X.select_dtypes(include=["object","category","bool"]).columns.tolist()

print("Numerical:", num_features)
print("Categorical:", cat_features)

# ---------------------------
# 5. Preprocessing Pipelines
# ---------------------------
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features)
])

# ---------------------------
# 6. Full Pipeline
# ---------------------------
pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])

# ---------------------------
# 7. Train Test Split
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------
# 8. Train Model
# ---------------------------
pipeline.fit(X_train, y_train)

# ---------------------------
# 9. Predictions
# ---------------------------
y_pred = pipeline.predict(X_test)

# ---------------------------
# 10. Evaluation
# ---------------------------
print("\nMODEL PERFORMANCE")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="weighted"))
print("Recall   :", recall_score(y_test, y_pred, average="weighted"))
print("F1 Score :", f1_score(y_test, y_pred, average="weighted"))

print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ---------------------------
# 11. Save Model
# ---------------------------
with open("trained_pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)

print("\nPipeline saved as trained_pipeline.pkl")

Dataset Shape: (569, 32)
Columns: ['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']
Target Column Detected: diagnosis
Numerical: ['id', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se',