### import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score,f1_score
from sklearn.datasets import load_breast_cancer

In [2]:
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,RandomForestClassifier,BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

### load dataset and EDA

In [3]:
cancer = load_breast_cancer()
x = cancer.data
y = cancer.target


In [4]:
print(x.shape)
print(cancer.target_names)
print(cancer.feature_names)

(569, 30)
['malignant' 'benign']
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [5]:
# print(cancer.DESCR)

In [6]:
dataframe = pd.DataFrame(x,columns = cancer.feature_names)
dataframe['y'] = y
dataframe.head(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,y
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [7]:
dataframe['y'].value_counts()

1    357
0    212
Name: y, dtype: int64

#### so the dataset is balance

### define a function for classification

In [8]:
def classification(x,y):
    x_train, x_test, y_train, y_test = train_test_split (x,y)
    m1 = AdaBoostClassifier ()
    m2 = GradientBoostingClassifier()
    m3 = RandomForestClassifier()
    m4 = LogisticRegression (max_iter = 10000)
    m5 = MultinomialNB ()
    m6 = SVC ()
    models = [m1, m2, m3, m4, m5, m6]

    recalls = []
    f1 = []
    
    for model in models:
        y_pred = model.fit (x_train, y_train).predict (x_test)
        recalls.append(recall_score(y_test,y_pred,average = 'weighted'))
        f1.append(f1_score(y_test,y_pred,average = 'weighted'))
        
        
    print('weighted_recalls = {} \n'.format(recalls))
    print('weighted_f1 = {}  \n' .format (f1))
    print('sorted_f1: {} \n '. format (np.argsort(f1)))
              
    best_model = models[np.argsort(f1)[-1]]      
    print('best model: {} \n'.format(str(best_model)[:-2]))
          
    y_pred = best_model.fit (x_train, y_train).predict (x_test)
    print(classification_report (y_pred,y_test))

    

In [9]:
classification(x,y)

weighted_recalls = [0.965034965034965, 0.951048951048951, 0.9790209790209791, 0.972027972027972, 0.916083916083916, 0.9370629370629371] 

weighted_f1 = [0.9649472329884701, 0.9511652934112828, 0.9789683397930821, 0.972027972027972, 0.9140668854954569, 0.9353183746613278]  

sorted_f1: [4 5 1 0 3 2] 
 
best model: RandomForestClassifier 

              precision    recall  f1-score   support

           0       0.96      0.98      0.97        48
           1       0.99      0.98      0.98        95

    accuracy                           0.98       143
   macro avg       0.97      0.98      0.98       143
weighted avg       0.98      0.98      0.98       143



## feature selection

#### RFE for classification

In [14]:

from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline


rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)
model = RandomForestClassifier()
pipeline = Pipeline(steps=[('s',rfe),('m',model)])

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(pipeline, x, y, scoring='f1_macro', cv=cv, n_jobs=-1, error_score='raise')

print('f1: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.938 (0.037)


### filter method

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [12]:
fs = SelectKBest(score_func=f_classif, k=15)
X_selected = fs.fit_transform(x, y)
print(X_selected.shape)

(569, 15)


In [13]:
classification(X_selected,y)

weighted_recalls = [0.9230769230769231, 0.9230769230769231, 0.9230769230769231, 0.9440559440559441, 0.9020979020979021, 0.9230769230769231] 

weighted_f1 = [0.9232597467891587, 0.9235956443877237, 0.9235956443877237, 0.9440559440559441, 0.8990075580984671, 0.9220055113419646]  

sorted_f1: [4 5 0 1 2 3] 
 
best model: LogisticRegression(max_iter=1000 

              precision    recall  f1-score   support

           0       0.92      0.92      0.92        49
           1       0.96      0.96      0.96        94

    accuracy                           0.94       143
   macro avg       0.94      0.94      0.94       143
weighted avg       0.94      0.94      0.94       143

