In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
df = pd.read_csv("datasets/breast_cancer_dataset.csv")
df.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [3]:
## Independent and dependent features
X = df.drop(labels=['target'],axis=1)
y = df[['target']]

In [4]:
y

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
564,0
565,0
566,0
567,0


In [5]:
# Define which columns should be should be scaled
# Here all are numerical columns 
numerical_cols = X.select_dtypes(exclude='object').columns
categorical_cols = X.select_dtypes(include='object').columns

In [6]:
numerical_cols

Index(['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area',
       'mean_smoothness', 'mean_compactness', 'mean_concavity',
       'mean_concave_points', 'mean_symmetry', 'mean_fractal_dimension',
       'radius_error', 'texture_error', 'perimeter_error', 'area_error',
       'smoothness_error', 'compactness_error', 'concavity_error',
       'concave_points_error', 'symmetry_error', 'fractal_dimension_error',
       'worst_radius', 'worst_texture', 'worst_perimeter', 'worst_area',
       'worst_smoothness', 'worst_compactness', 'worst_concavity',
       'worst_concave_points', 'worst_symmetry', 'worst_fractal_dimension'],
      dtype='object')

In [7]:
categorical_cols

Index([], dtype='object')

In [8]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [9]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])


In [10]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25)

In [11]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [12]:
X_train.head()

Unnamed: 0,num_pipeline__mean_radius,num_pipeline__mean_texture,num_pipeline__mean_perimeter,num_pipeline__mean_area,num_pipeline__mean_smoothness,num_pipeline__mean_compactness,num_pipeline__mean_concavity,num_pipeline__mean_concave_points,num_pipeline__mean_symmetry,num_pipeline__mean_fractal_dimension,...,num_pipeline__worst_radius,num_pipeline__worst_texture,num_pipeline__worst_perimeter,num_pipeline__worst_area,num_pipeline__worst_smoothness,num_pipeline__worst_compactness,num_pipeline__worst_concavity,num_pipeline__worst_concave_points,num_pipeline__worst_symmetry,num_pipeline__worst_fractal_dimension
0,1.672478,0.092236,1.575466,1.743546,-1.155393,-0.388776,0.308821,0.696888,0.385896,-1.654663,...,1.370805,-0.436768,1.24508,1.355167,-1.173657,-0.666127,-0.083911,0.394324,0.228388,-1.365427
1,-1.190891,-0.411908,-1.16964,-1.018491,0.307257,-0.628567,-0.836067,-0.839173,0.250321,0.086205,...,-1.094246,0.196134,-1.086601,-0.915688,0.345398,-0.752389,-0.836893,-0.780619,-0.023614,-0.406564
2,-0.56214,-0.3196,-0.593369,-0.573159,-0.680924,-0.783521,-0.747404,-0.612545,0.521471,-0.948449,...,-0.485233,0.535011,-0.579248,-0.512179,-0.361748,-0.501272,-0.680568,-0.320859,0.85448,-0.831987
3,-0.655603,0.352592,-0.615597,-0.645135,-0.738003,-0.089328,-0.321443,-0.318807,-0.827142,0.699385,...,-0.66338,0.270886,-0.483858,-0.626524,-0.903021,0.149203,-0.200912,-0.014959,-0.695099,0.876905
4,-0.11465,0.078035,-0.170634,-0.210412,-0.681637,-0.821485,-0.984133,-0.705467,-0.944878,-0.841318,...,-0.168298,0.39049,-0.224517,-0.2644,-0.894291,-0.919161,-1.118326,-0.872935,-0.619967,-0.823678


In [13]:
## Model Training

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [14]:
regression = LogisticRegression()
regression.fit(X_train,y_train)

In [15]:
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    confusionmat = confusion_matrix(true, predicted)
    report = classification_report(true, predicted)
    return accuracy,confusionmat,report

In [16]:
y_pred = regression.predict(X_test)
accuracyy,confusionmatt,reportt = evaluate_model(y_test,y_pred)

In [22]:
print('Model Training Performance')
print("\nConfusion Matrix:\n",confusionmatt)
print("\nClassification Report: \n\n",reportt)
print("\nAccuracy:",accuracyy)

Model Training Performance

Confusion Matrix:
 [[46  1]
 [ 1 95]]

Classification Report: 

               precision    recall  f1-score   support

           0       0.98      0.98      0.98        47
           1       0.99      0.99      0.99        96

    accuracy                           0.99       143
   macro avg       0.98      0.98      0.98       143
weighted avg       0.99      0.99      0.99       143


Accuracy: 0.986013986013986


In [23]:
models={
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(kernel='linear'),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Neural Network (MLP)": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
}
trained_model_list=[]
model_list=[]
accuracy_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    accuracy,confusionmat,report =evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    #print('Model Training Performance')
    print("\nConfusion Matrix:\n",confusionmat)
    print("\nClassification Report: \n\n",report)
    print("\nAccuracy:",accuracy*100)

    accuracy_list.append(accuracy)
    
    print('='*35)


Logistic Regression

Confusion Matrix:
 [[46  1]
 [ 1 95]]

Classification Report: 

               precision    recall  f1-score   support

           0       0.98      0.98      0.98        47
           1       0.99      0.99      0.99        96

    accuracy                           0.99       143
   macro avg       0.98      0.98      0.98       143
weighted avg       0.99      0.99      0.99       143


Accuracy: 98.6013986013986
Support Vector Machine

Confusion Matrix:
 [[46  1]
 [ 0 96]]

Classification Report: 

               precision    recall  f1-score   support

           0       1.00      0.98      0.99        47
           1       0.99      1.00      0.99        96

    accuracy                           0.99       143
   macro avg       0.99      0.99      0.99       143
weighted avg       0.99      0.99      0.99       143


Accuracy: 99.3006993006993
Naive Bayes

Confusion Matrix:
 [[39  8]
 [ 5 91]]

Classification Report: 

               precision    recall  f1

In [25]:
my_dict = dict(zip(model_list, accuracy_list))
my_dict

{'Logistic Regression': 0.986013986013986,
 'Support Vector Machine': 0.993006993006993,
 'Naive Bayes': 0.9090909090909091,
 'Random Forest': 0.9790209790209791,
 'Gradient Boosting': 0.965034965034965,
 'Decision Tree': 0.9440559440559441,
 'Neural Network (MLP)': 1.0}

In [26]:
'''
import pickle

# Find the index of the model with the highest R-squared score
best_model_index = accuracy_list.index(max(accuracy_list))

# Get the best model from the list of trained models
best_model = list(models.values())[best_model_index]

# Save the best model to a file using pickle
with open('bestt_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Loading the best model
with open('bestt_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file) 

'''
print()




In [27]:
# Create a dictionary of classifiers and their respective hyperparameter grids for tuning
classifiers = {
    "Logistic Regression": {
        "model": LogisticRegression(),
        "params": {
            "penalty": ["l1", "l2"],
            "C": [0.01, 0.1, 1, 10]
        }
    },
    "Support Vector Machine": {
        "model": SVC(),
        "params": {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf"]
        }
    },
    "Naive Bayes": {
        "model": GaussianNB(),
        "params": {}
    },
    "Random Forest": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4]
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(),
        "params": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 4, 5]
        }
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(),
        "params": {
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4]
        }
    },
    "Neural Network (MLP)": {
        "model": MLPClassifier(),
        "params": {
            "hidden_layer_sizes": [(50, 50), (100, 50), (100, 100)],
            "alpha": [0.0001, 0.001, 0.01],
            "max_iter": [1000]
        }
    }
}

In [28]:
from sklearn.model_selection import GridSearchCV
best_params_list = []
# Perform hyperparameter tuning and evaluation for each classifier
for name, clf_info in classifiers.items():
    model = clf_info["model"]
    params = clf_info["params"]

    grid_search = GridSearchCV(model, params, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Make predictions with the best model
    y_pred = best_model.predict(X_test)

    # Evaluate the model and store results
    accuracy, confusionmat, report = evaluate_model(y_test, y_pred)
    
    # Append best params and accuracy to lists
    best_params_list.append((name, best_params))
    accuracy_list.append((name, accuracy))

    print(f"Classifier: {name}")
    print(f"Best Parameters: {best_params}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    print("=" * 50)

Classifier: Logistic Regression
Best Parameters: {'C': 1, 'penalty': 'l2'}
Accuracy: 0.986013986013986
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        47
           1       0.99      0.99      0.99        96

    accuracy                           0.99       143
   macro avg       0.98      0.98      0.98       143
weighted avg       0.99      0.99      0.99       143

Classifier: Support Vector Machine
Best Parameters: {'C': 0.1, 'kernel': 'linear'}
Accuracy: 0.986013986013986
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        47
           1       0.98      1.00      0.99        96

    accuracy                           0.99       143
   macro avg       0.99      0.98      0.98       143
weighted avg       0.99      0.99      0.99       143

Classifier: Naive Bayes
Best Parameters: {}
Accuracy: 0.9090909090909091
Classificatio

In [30]:
# Print the best parameters and accuracy for each classifier
print("Best Parameters for Each Classifier:")
for name, best_params in best_params_list:
    print(f"Classifier: {name}")
    print(f"Best Parameters: {best_params}")
    print("=" * 50)


Best Parameters for Each Classifier:
Classifier: Logistic Regression
Best Parameters: {'C': 1, 'penalty': 'l2'}
Classifier: Support Vector Machine
Best Parameters: {'C': 0.1, 'kernel': 'linear'}
Classifier: Naive Bayes
Best Parameters: {}
Classifier: Random Forest
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Classifier: Gradient Boosting
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Classifier: Decision Tree
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
Classifier: Neural Network (MLP)
Best Parameters: {'alpha': 0.001, 'hidden_layer_sizes': (50, 50), 'max_iter': 1000}
