# Project Breast Cancer detection in Wisconsin #


In [None]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline
import sys
import matplotlib

# Report the interpreter and library versions this notebook was run with.
for _label, _version in (
    ('python:', sys.version),
    ('Numpy:', np.__version__),
    ('Pandas:', pd.__version__),
    ('Scikitlearn:', sklearn.__version__),
    ('Seaborn: ', sns.__version__),
    ('matplotlib:', matplotlib.__version__),
):
    print(_label, _version)


In [None]:
# Load the breast-cancer data set from a CSV file in the working directory.
Cancer = pd.read_csv(r'breast-cancer-wisconsin.csv')

In [None]:
# Print the per-column overview of the frame, then display its summary
# statistics (the last expression of the cell is what the notebook renders).
Cancer.info()
Cancer.describe()
#Cancer['diagnosis'].value_counts()


In [None]:
# Keep both the pandas Index of column labels and a plain-list copy of it.
col_ = Cancer.columns
col = col_.tolist()
col


In [None]:
plt.figure(figsize=(20, 10))
# Recent pandas (2.0+) no longer drops non-numeric columns inside corr() and
# raises on the string-valued 'diagnosis' column, so the correlation matrix is
# restricted to the numeric columns explicitly.
sns.heatmap(
    Cancer[col].select_dtypes(include='number').corr(),
    annot=True,
    fmt='.3f',
)


## 1) Preprocessing ##

In [None]:

# Columns that describe() leaves out of its summary (by default the
# non-numeric ones).
col_categorical = list(set(col).difference(Cancer.describe().columns))
col_categorical


In [None]:
# Encode the diagnosis labels as integers in a new 'Diag' column:
# 'B' -> 0 and 'M' -> 1.
key = {'B': 0, 'M': 1}
Cancer['Diag'] = Cancer['diagnosis'].map(key)


In [None]:
# Work on a copy without the empty trailing column, the record id and the raw
# text label (the encoded 'Diag' column is kept).
Cancer2 = Cancer.drop(columns=['Unnamed: 32', 'id', 'diagnosis'])
Cancer2


In [None]:
# Target: the encoded diagnosis. Features: the first 30 columns of Cancer2.
Y = Cancer2['Diag']
X = Cancer2.iloc[:, 0:30]
X.describe()


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature and keep the original column labels.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split, so test-set statistics leak into the scaling. Consider
# fitting it on the training rows only.
scale = pd.DataFrame(
    data=StandardScaler().fit_transform(X),
    columns=X.columns,
)
scale.describe()


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    scale,
    Y,
    test_size=0.4,
    random_state=69,
)


In [None]:
# Print NumPy's help/description for the held-out labels, then display them.
np.info(Y_test)
#X_train
Y_test


In [None]:
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score

def print_score(clf, X_train, X_test, Y_train, Y_test, train=True):
    """Print an evaluation report for a fitted classifier.

    Prints the accuracy, ROC AUC, classification report and confusion matrix
    of ``clf`` on either the training or the test split. The training report
    additionally prints the mean and standard deviation of a 10-fold
    cross-validated accuracy computed on the training data.

    The ROC AUC is computed from the hard class predictions returned by
    ``clf.predict`` (binarized with a ``LabelBinarizer`` fitted on
    ``Y_train``), not from probability scores.

    Args:
        clf: Fitted estimator exposing ``predict``; it is also handed to
            ``cross_val_score`` when ``train`` is truthy.
        X_train: Training features.
        X_test: Test features.
        Y_train: Training labels.
        Y_test: Test labels.
        train: If truthy, report on the training split; otherwise report on
            the test split.

    Returns:
        None. The report is written to standard output.
    """
    # One binarizer, fitted on the training labels, is used for both splits
    # so that the ROC AUC is computed the same way in either report.
    binarizer = preprocessing.LabelBinarizer()
    binarizer.fit(Y_train)

    if train:
        predicted = clf.predict(X_train)
        print('Train Results:\n')
        print('Accuracy Score: %.4f \n' % accuracy_score(Y_train, predicted))
        print('ROC AUC Score:%.4f' % roc_auc_score(
            binarizer.transform(Y_train), binarizer.transform(predicted)))
        print(f'Classification report:\n {classification_report(Y_train, predicted)}')
        print(f'Confusion Matrix Score: {confusion_matrix(Y_train, predicted)}')

        cv_scores = cross_val_score(clf, X_train, Y_train, cv=10,
                                    scoring='accuracy')
        print('Cross Val. average accuracy \t: %.4f' % np.mean(cv_scores))
        print('Cross Val. accuracy SD \t: %.4f ' % np.std(cv_scores))
    else:
        # Any falsy ``train`` selects the test report; the former
        # ``elif train==False`` printed nothing for values such as None.
        predicted = clf.predict(X_test)
        print('Test Results:\n')
        print('Accuracy Score: %.4f \n' % accuracy_score(Y_test, predicted))
        print('ROC AUC Score: %.4f\n' % roc_auc_score(
            binarizer.transform(Y_test), binarizer.transform(predicted)))
        print(f'Classification report:\n {classification_report(Y_test, predicted)}')
        print(f'Confusion Matrix Score: {confusion_matrix(Y_test, predicted)}')
        # No "Cross Val." lines here: the previous version printed the mean
        # and standard deviation of the predicted labels under that heading,
        # which is not a cross-validation score.




## 2) Model Fitting ##

### a \ Decision Tree: ###

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with a fixed seed so the fit is repeatable.
DT_clf = DecisionTreeClassifier(random_state=24)
# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same
# 1-D label array.
DT_clf.fit(X_train, Y_train.to_numpy())


In [None]:
# print_score writes its report itself and returns None, so it is called
# directly; wrapping it in print(f'...') appended a stray "None" to each report.
# Labels are passed via to_numpy() because Series.ravel() is deprecated.
print_score(DT_clf, X_train, X_test, Y_train.to_numpy(), Y_test.to_numpy(), train=True)
print('\n ----------------------------------------------->\n')
print_score(DT_clf, X_train, X_test, Y_train.to_numpy(), Y_test.to_numpy(), train=False)


### b \ Random Forest: ###

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees. The seed makes the reported scores repeatable
# between runs, as for the other seeded models in this notebook.
RF_clf = RandomForestClassifier(n_estimators=100, random_state=42)
# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same
# 1-D label array.
RF_clf.fit(X_train, Y_train.to_numpy())


In [None]:
# print_score writes its report itself and returns None, so it is called
# directly; wrapping it in print(f'...') appended a stray "None" to each report.
# Labels are passed via to_numpy() because Series.ravel() is deprecated.
print_score(RF_clf, X_train, X_test, Y_train.to_numpy(), Y_test.to_numpy(), train=True)
print('\n ---------------------------------------\n')
print_score(RF_clf, X_train, X_test, Y_train.to_numpy(), Y_test.to_numpy(), train=False)


### c \ Bagging + Decision Tree: ###

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 100 bootstrap-sampled copies of the decision tree,
# trained on all available cores.
bag_clf = BaggingClassifier(estimator=DT_clf, n_estimators=100, bootstrap=True,
                            oob_score=False, n_jobs=-1, random_state=42)
# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same
# 1-D label array.
bag_clf.fit(X_train, Y_train.to_numpy())


In [None]:
# print_score writes its report itself and returns None, so it is called
# directly; wrapping it in print(f'...') appended a stray "None" to each report.
# Labels are passed via to_numpy() because Series.ravel() is deprecated.
print_score(bag_clf, X_train, X_test, Y_train.to_numpy(), Y_test.to_numpy(), train=True)
print('\n ---------------------------------------\n')
print_score(bag_clf, X_train, X_test, Y_train.to_numpy(), Y_test.to_numpy(), train=False)


### d \ AdaBoost + Random Forest: ###

In [None]:
# AdaBoost over random-forest base learners (up to 100 boosting rounds of
# 200 trees each). Both levels are seeded so the reported scores are
# repeatable between runs.
Ada_RF = AdaBoostClassifier(
    RandomForestClassifier(n_estimators=200, random_state=42),
    n_estimators=100,
    random_state=42,
)
# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same
# 1-D label array.
Ada_RF.fit(X_train, Y_train.to_numpy())


In [None]:
# print_score writes its report itself and returns None, so it is called
# directly; wrapping it in print(f'...') appended a stray "None" to each report.
# Labels are passed via to_numpy() because Series.ravel() is deprecated.
print_score(Ada_RF, X_train, X_test, Y_train.to_numpy(), Y_test.to_numpy(), train=True)
print('\n ---------------------------------------\n')
print_score(Ada_RF, X_train, X_test, Y_train.to_numpy(), Y_test.to_numpy(), train=False)


### e \ KNN + Grid Search: ###

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier using the Minkowski metric with p=2.
knn = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
knn.fit(X_train, Y_train)


In [None]:
# print_score writes its report itself and returns None, so it is called
# directly; wrapping it in print(f'...') appended a stray "None" to each report.
# Labels are passed via to_numpy() because Series.ravel() is deprecated.
print_score(knn, X_train, X_test, Y_train.to_numpy(), Y_test.to_numpy(), train=True)
print('\n ---------------------------------------\n')
print_score(knn, X_train, X_test, Y_train.to_numpy(), Y_test.to_numpy(), train=False)


### Grid Search : ###

In [None]:
from sklearn.model_selection import GridSearchCV

# Show the current hyper-parameters of the KNN model before tuning them.
knn.get_params()


In [None]:
# Search over the neighbour count k = 1..10 on all available cores, then
# display the best estimator found.
params = {'n_neighbors': list(range(1, 11))}
grid_search_cv = GridSearchCV(
    KNeighborsClassifier(),
    params,
    verbose=1,
    n_jobs=-1,
)
grid_search_cv.fit(X_train, Y_train)
grid_search_cv.best_estimator_


In [None]:
# print_score writes its report itself and returns None, so it is called
# directly; wrapping it in print(f'...') appended a stray "None" to each report.
# Labels are passed via to_numpy() because Series.ravel() is deprecated.
print_score(grid_search_cv, X_train, X_test, Y_train.to_numpy(), Y_test.to_numpy(), train=True)
print('\n ---------------------------------------\n')
print_score(grid_search_cv, X_train, X_test, Y_train.to_numpy(), Y_test.to_numpy(), train=False)


### In conclusion: ###
The best score is obtained with the model built with the Decision Tree algorithm, which gives the best true-positive prediction:<br>
F1 = 91%