Dataset Link : https://www.kaggle.com/datasets/marshuu/breast-cancer?select=breast_cancer.csv

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the dataset from a CSV in the working directory and preview the first rows.
df = pd.read_csv('breast_cancer.csv')
df.head()

In [None]:
# Normalise every column name to lower case so later cells can refer to 'class'.
df.columns = [column.lower() for column in df.columns]

# EDA

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for each column.
df.describe()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Correlation of every feature with the target, strongest first.
# The target's self-correlation is removed by label rather than by position,
# so a feature that ties with it can never be dropped by mistake.
df.corr()['class'].drop('class').sort_values(ascending=False)

In [None]:
# Annotated heatmap of the pairwise correlation matrix.
sns.heatmap(df.corr(),annot=True,cmap=sns.cubehelix_palette(as_cmap=True));

In [None]:
# Class balance. The column is passed by keyword: recent seaborn releases treat
# the first positional argument as `data`, not `x`.
sns.countplot(x='class',data=df);

In [None]:
# Pairwise scatter plots of all columns, coloured by class.
sns.pairplot(df,hue='class');

In [None]:
# One box per column.
sns.boxplot(data=df);

In [None]:
# Share of each class. Labels come from the value_counts() index so every wedge
# is always paired with its own class code, whatever the frequency order is.
class_counts = df['class'].value_counts()
plt.pie(class_counts,labels=class_counts.index,textprops={'size':12},autopct='%.2f%%',startangle=60)
plt.title('Class',size = 14)
plt.show()

## Create Independent and Dependent Variables

In [None]:
# Feature matrix: every column except the target, as a NumPy array.
X = df.drop(columns='class').values
# Target vector: the 'class' column, as a NumPy array.
y = df.loc[:, 'class'].values

## Split the Data Into the Train and Test Sets

In [None]:
# Hold out 30% of the rows as the test set; random_state=0 fixes the split.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

## Machine Learning Models

In [None]:
def display_result(y_pred,modelName):
    """Print accuracy, classification report and confusion matrix for a model.

    Predictions are scored against the module-level ``y_test``.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels for the test set.
    modelName : str
        Title printed in the first banner line.
    """
    def banner(title):
        # Centre the title in a 75-character line padded with underscores.
        print(title.center(75, '_'), end='\n\n')

    banner(modelName)
    accuracy = round(accuracy_score(y_test, y_pred), 3)
    print('Accuracy Score:', accuracy, end='\n\n')

    banner('Classification Report')
    print(classification_report(y_test, y_pred), end='\n\n')

    banner('Confusion Matrix')
    print(confusion_matrix(y_test, y_pred))

In [None]:
def grid_search(estimator,param_grid):
    """Run a 10-fold grid search on the training split and print the winner.

    The search is fitted on the module-level ``X_train`` / ``y_train`` using
    all available cores (``n_jobs=-1``).

    Parameters
    ----------
    estimator : estimator object
        Model whose hyper-parameters are searched.
    param_grid : dict
        Mapping of parameter name to the list of values to try.
    """
    search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=10, n_jobs=-1)
    best_params = search.fit(X_train, y_train).best_params_
    print('Best Parameters:', best_params)

### Random Forest Classifier

In [None]:
# Baseline forest with default hyper-parameters, fitted on the training split.
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(X_train, y_train)

# Candidate values explored by the grid search.
rf_params = dict(
    n_estimators=[100, 200, 500, 1000],
    criterion=['entropy', 'gini'],
    max_depth=[None, 3, 4, 5],
)
grid_search(rf_model, rf_params)

In [None]:
# Refit with the chosen hyper-parameters and score on the held-out test set.
rf_model_tuned = RandomForestClassifier(
    n_estimators=500,
    criterion='gini',
    max_depth=3,
    random_state=0,
)
rf_model_tuned.fit(X_train, y_train)
rf_pred = rf_model_tuned.predict(X_test)
display_result(rf_pred, 'Random Forest')

### Decision Tree Classifier

In [None]:
# Baseline tree with default hyper-parameters, fitted on the training split.
dt_model = DecisionTreeClassifier(random_state=0)
dt_model.fit(X_train, y_train)

# Candidate values explored by the grid search.
dt_params = dict(
    criterion=['entropy', 'gini'],
    max_depth=[None, 3, 4, 5],
    min_samples_split=[2, 5, 10],
)
grid_search(dt_model, dt_params)

In [None]:
# Refit with the chosen hyper-parameters and score on the held-out test set.
dt_model_tuned = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_split=2,
    random_state=0,
)
dt_model_tuned.fit(X_train, y_train)
dt_pred = dt_model_tuned.predict(X_test)
display_result(dt_pred, 'Decision Tree')

### K-Nearest Neighbors Classifier

In [None]:
# Baseline k-NN with default settings, fitted on the training split.
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

# Try k = 1..49 with both weighting schemes.
knn_params = dict(
    n_neighbors=np.arange(1, 50),
    weights=['uniform', 'distance'],
)
grid_search(knn_model, knn_params)

In [None]:
# Refit with the chosen hyper-parameters and score on the held-out test set.
knn_model_tuned = KNeighborsClassifier(n_neighbors=3, weights='uniform')
knn_model_tuned.fit(X_train, y_train)
knn_pred = knn_model_tuned.predict(X_test)
display_result(knn_pred, 'K-Nearest Neighbors')

### Naive Bayes Classifier

In [None]:

# Gaussian Naive Bayes is used with its defaults; no grid search is run for it.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)
display_result(nb_pred, 'Naive Bayes')


### Neural Network Classifier


In [None]:
# Baseline MLP with default hyper-parameters, fitted on the training split.
mlp_model = MLPClassifier(random_state=0).fit(X_train,y_train)
mlp_params = {
    # One hidden layer of 100 units is written (100,). The previous (100,0)
    # declared a second layer of width 0, which is not a valid architecture.
    'hidden_layer_sizes':[(100,),(10,10),(10,5)],
    'solver':['adam','lbfgs','sgd'],
    'activation':['relu','logistic'],
    'alpha':[0.0001,0.001,0.01]
}
grid_search(mlp_model,mlp_params)

In [None]:
# Refit with the chosen hyper-parameters and score on the held-out test set.
mlp_model_tuned = MLPClassifier(
    random_state=0,
    activation='relu',
    alpha=0.001,
    hidden_layer_sizes=(10, 5),
    solver='adam',
).fit(X_train, y_train)

mlp_pred = mlp_model_tuned.predict(X_test)
display_result(mlp_pred, 'Multilayer Perceptron')



### Logistic Regression Classifier


In [None]:
# Baseline logistic regression with default hyper-parameters.
lr_model = LogisticRegression(random_state=0).fit(X_train,y_train)
lr_params = {
    'C':np.arange(1,6),
    'tol':[0.0001,0.001,0.01],
    # Only solvers that LogisticRegression implements. 'adam' and 'sgd' are
    # MLPClassifier solvers, so every fit using them failed.
    'solver':['lbfgs','liblinear','newton-cg','sag','saga']
}
grid_search(lr_model,lr_params)


In [None]:
# Refit with the chosen hyper-parameters and score on the held-out test set.
lr_model_tuned = LogisticRegression(C=1, solver='lbfgs', tol=0.0001)
lr_model_tuned.fit(X_train, y_train)
lr_pred = lr_model_tuned.predict(X_test)
display_result(lr_pred, 'Logistic Regression')


### XGBoost Classifier

In [None]:
# Learn the label -> integer mapping from the training targets only, then apply
# that same mapping to both splits.
le = LabelEncoder()
le.fit(y_train)

y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

In [None]:
# Baseline booster, fitted on the label-encoded training targets.
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train_encoded)

# Candidate values explored by the grid search.
xgb_params = dict(
    n_estimators=[100, 200, 500, 1000],
    max_depth=[None, 3, 4, 5],
    learning_rate=[0.1, 0.01, 0.02],
    subsample=[0.6, 0.8, 1.0],
)

# Same 10-fold search as grid_search(), but run against the encoded labels.
gridSearch = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_params,
    cv=10,
    n_jobs=-1,
)
gridSearch.fit(X_train, y_train_encoded)
print('Best Parameters:', gridSearch.best_params_)

In [None]:
# Refit with the chosen hyper-parameters on the label-encoded targets.
xgb_model_tuned = XGBClassifier(learning_rate = 0.1, max_depth = None, n_estimators = 200, subsample = 0.8)
xgb_model_tuned.fit(X_train,y_train_encoded)
# Map the encoded predictions back to the original class codes with the fitted
# encoder, so display_result() can compare them with y_test and the mapping
# cannot drift from the one `le` learned.
xgb_pred = le.inverse_transform(xgb_model_tuned.predict(X_test))
display_result(xgb_pred,'XGBoost')


### LightGBM Classifier


# Baseline LightGBM model. subsample_freq=1 enables row bagging at every
# iteration: LightGBM ignores `subsample` while subsample_freq is 0 (its
# default), which made the 'subsample' axis of the grid below a no-op.
# With the default subsample of 1.0 this baseline fit itself is unaffected.
lgbm_model = LGBMClassifier(random_state=0,subsample_freq=1).fit(X_train,y_train)
lgbm_params = {
    'n_estimators': [100, 500, 1000],
    'subsample': [0.6, 0.8, 1.0],
    'max_depth': [-1, 3, 4, 5],
    'learning_rate': [0.1,0.01,0.02,0.05],
    "min_child_samples": [5,10,20]
}
grid_search(lgbm_model,lgbm_params)

# Refit with the chosen hyper-parameters and score on the held-out test set.
# NOTE(review): LightGBM applies `subsample` only when subsample_freq > 0;
# confirm whether bagging was intended for this model.
lgbm_model_tuned = LGBMClassifier(
    random_state=0,
    learning_rate=0.05,
    max_depth=4,
    min_child_samples=20,
    n_estimators=500,
    subsample=0.6,
)
lgbm_model_tuned.fit(X_train, y_train)
lgbm_pred = lgbm_model_tuned.predict(X_test)
display_result(lgbm_pred, 'LightGBM')

# Predict single value


# Tuned estimators in menu order: list index = menu id shown by select_model() - 1.
models = [
    rf_model_tuned,
    dt_model_tuned,
    knn_model_tuned,
    nb_model,
    mlp_model_tuned,
    lr_model_tuned,
    xgb_model_tuned,
    lgbm_model_tuned
]

# Display names for the dataset's class codes (spelling of 'Malignant' fixed).
class_names = {4:'Malignant',2:'Benign'}

def select_model():
    """Show the model menu and prompt until a valid model id is entered.

    Input that is not an integer prints the conversion error and offers the
    choice to retry or exit. An integer outside 1-8 prints a hint and prompts
    again.

    Returns
    -------
    int
        The selected 1-based menu id (1-8), or 0 if the user chose to exit.
    """
    print('\n1. Random Forest\n2. Decision Tree\n3. KNN\n4. Naive Bayes\n5. MLP\n6. Logistic Regression\n7. XGBoost\n8. LightGBM')
    while True:
        try:
            model_id = int(input('Select machine learning model id: '))
        except ValueError as ex:
            # Only a failed int() conversion is expected here; anything else
            # should surface instead of being swallowed.
            print(ex)
            choice = input("\nPress any key to continue or press 'e' to exit: ").lower()
            if choice == 'e':
                return 0
            continue

        if 1 <= model_id <= 8:
            return model_id

        # Tell the user why the prompt is repeating.
        print('Please enter a number between 1 and 8.')

# Prompt labels for the nine integer features, asked in this order.
# NOTE(review): assumed to match the column order of X — confirm against the CSV.
feature_prompts = [
    'Clump Thickness',
    'Uniformity of cell size',
    'Uniformity of cell shape',
    'Marginal adhesion',
    'Single epithelial cell size',
    'Bare nuclei',
    'Bland chromatin',
    'Normal nucleoli',
    'Mitoses',
]

i = 1
while True:
    try:
        print('\n')
        print(f'Prediction {i}'.center(50,'_'),end='\n\n')

        # One sample as a 2-D list, as expected by predict().
        values = [[int(input(f'{label}: ')) for label in feature_prompts]]

        model_id = select_model()-1
        if model_id == -1:
            # The user already chose to exit inside select_model(); leave
            # without asking again.
            break

        model = models[model_id]
        prediction = model.predict(values)[0]
        if model is xgb_model_tuned:
            # XGBoost was trained on encoded labels; map back to the class code.
            prediction = le.inverse_transform([prediction])[0]

        print('\nPredicted class is:',class_names[prediction])
        i+=1

    except Exception as ex:
        # Report bad input or a failed prediction, then fall through to the prompt.
        print(ex)

    # Asked after every completed or failed attempt. Keeping this outside a
    # `finally` block means it is skipped once the user has exited, and a
    # `break` can no longer swallow an in-flight exception (e.g. Ctrl+C).
    choice = input("\nPress any key to continue or press 'e' to exit: ").lower()
    if choice == 'e':
        break