In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset that ships with scikit-learn.
data = load_breast_cancer()

# sklearn encodes the target as integers: 0 = malignant, 1 = benign.
# Store readable labels instead; a later cell encodes them explicitly.
diagnosis_labels = {0: 'Malignant', 1: 'Benign'}

# One column per feature, plus the labelled diagnosis column.
df = pd.DataFrame(data.data, columns=data.feature_names)
df['diagnosis'] = pd.Series(data.target).map(diagnosis_labels)

# Replace spaces with underscores in the column names
# (e.g. 'mean radius' -> 'mean_radius') so they are easier to reference.
df = df.rename(columns=lambda col: col.replace(' ', '_'))

print("Dataset loaded directly from sklearn!")
df.head()

Dataset loaded directly from sklearn!


Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,Malignant
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,Malignant
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,Malignant
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,Malignant
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,Malignant


In [2]:
# Restrict the model inputs to five of the "mean_*" measurements.
# The names follow the underscore-separated columns of `df`.
selected_features = [
    f'mean_{measurement}'
    for measurement in ('radius', 'texture', 'perimeter', 'area', 'smoothness')
]

# Feature matrix: only the selected columns, in the order listed above.
X = df.loc[:, selected_features]

# Encode the string labels as integers. LabelEncoder assigns codes in sorted
# class order, so 'Benign' -> 0 and 'Malignant' -> 1.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df['diagnosis'])

print(f"Selected Features: {X.columns.tolist()}")

Selected Features: ['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area', 'mean_smoothness']


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows BEFORE fitting any preprocessing, so that no
# test-set statistics leak into the scaler. The fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling parameters from the training rows only...
scaler = StandardScaler()
scaler.fit(X_train)

# ...then apply that same fitted transformation to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data split and scaling successful!")
print(f"Training set shape: {X_train_scaled.shape}")
print(f"Testing set shape: {X_test_scaled.shape}")

NameError: name 'train_test_split' is not defined

In [None]:
# Everything this cell needs is imported here so it runs on a fresh kernel;
# none of these names are imported elsewhere in the notebook.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# 5. Implement Logistic Regression
# Train on the scaled training features produced by the split/scale cell.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# 6. Evaluate the model
# Predict on the held-out, scaled test features.
y_pred = model.predict(X_test_scaled)

# precision/recall/F1 use sklearn's default positive label (1), which is
# 'Malignant' under the LabelEncoder encoding of the diagnosis column.
print("--- Model Evaluation ---")
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall:    {recall_score(y_test, y_pred):.4f}")
print(f"F1-score:  {f1_score(y_test, y_pred):.4f}")

# Visualizing the results
# Rows are the actual classes, columns the predicted classes.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Imported here so the cell runs on a fresh kernel; neither name is imported
# elsewhere in the notebook.
import os

import joblib

# Create the model directory if it doesn't exist. exist_ok=True makes this a
# single idempotent call instead of a separate check followed by a create.
os.makedirs('model', exist_ok=True)

# 7. Save the trained model and the scaler
# Both are persisted: inference must apply the same fitted scaling before
# calling the model.
joblib.dump(model, 'model/breast_cancer_model.pkl')
joblib.dump(scaler, 'model/scaler.pkl')

print("Model and Scaler saved successfully in /model/ folder!")

In [None]:
# Imported here so the cell runs on a fresh kernel; joblib is not imported
# elsewhere in the notebook.
import joblib

# Load the saved objects.
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts
# from a trusted source (here, the files written by this notebook).
loaded_model = joblib.load('model/breast_cancer_model.pkl')
loaded_scaler = joblib.load('model/scaler.pkl')

# Mock input for testing: the five mean_* measurements of the first row of
# the dataset (labelled Malignant). The values are wrapped in a DataFrame with
# the column names the scaler was fitted on, because passing a bare ndarray to
# a scaler fitted on a DataFrame triggers sklearn's feature-name UserWarning.
sample_input = pd.DataFrame(
    np.array([[17.99, 10.38, 122.8, 1001.0, 0.1184]]),
    columns=loaded_scaler.feature_names_in_,
)
sample_scaled = loaded_scaler.transform(sample_input)

# The model was trained on LabelEncoder output, where 1 = 'Malignant' and
# 0 = 'Benign' (classes are encoded in sorted order).
prediction = loaded_model.predict(sample_scaled)
result = "Malignant" if prediction[0] == 1 else "Benign"

print(f"Reloaded model test prediction: {result}")