# Breast Cancer Detector using Supervised Learning

## Import the libraries

In [1]:
import numpy as np
import pandas as pd

## Import the dataset

In [2]:
# Read the raw table from disk.
dataset = pd.read_csv('dataset.csv')
# Features: every column except the first and the last (negative index = last column).
X = dataset.iloc[:, 1:-1].values
# Target: the last column.
y = dataset.iloc[:, -1].values

## Handle the missing values

We replace all missing values of a column (feature) with the most common value of that column.

In [3]:
from sklearn.impute import SimpleImputer

# Replace every NaN in a feature column with that column's most frequent value.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X = imputer.fit_transform(X)

## Encode the dependent variable

We use a One Hot Encoder on the dependent variable so that the model treats the values 2 and 4 as two distinct classes (2 for benign, 4 for malignant) rather than as numeric quantities. We keep only the first one-hot column, which flags the value 2, so from now on 1 is for benign and 0 is for malignant.

In [4]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the label column (the encoder expects a 2-D column vector),
# then keep only the first indicator column as the binary target.
# NOTE: `y` is overwritten in place, so re-running this cell on its own output
# would encode the 0/1 values again and flip the labels.
onehotencoder = OneHotEncoder(categories='auto')
y_onehot = onehotencoder.fit_transform(y.reshape(-1, 1)).toarray()
y = y_onehot[:, 0]

## Split the dataset into the Training set and Test set

We use 15% of the dataset for the Test set and 85% for the Training set.

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=0,
)

## Apply feature scaling

We apply feature scaling so that features with large values do not dominate features with small values.

In [6]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training data only, then apply the
# same fitted scaler to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

## Fit classifier to the Training set

There are many classification models to use in this case, such as Logistic Regression, KNN, SVM, Kernel SVM, Naive Bayes, Decision Tree, and Random Forest.

*Logistic Regression:* 

In [7]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the training set. `fit` returns the estimator
# itself, so it can be bound to `classifier`, the name the prediction and
# cross-validation cells read.
classifier = LogisticRegression(solver='liblinear', random_state=0).fit(X_train, y_train)
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

*KNN (K = 5):*

In [8]:
from sklearn.neighbors import KNeighborsClassifier

# 5 nearest neighbours; the Minkowski metric with p = 2 is the Euclidean distance.
knn_settings = {'n_neighbors': 5, 'metric': 'minkowski', 'p': 2}
classifier = KNeighborsClassifier(**knn_settings)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

*Linear SVM:*

In [9]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, fitted on the training set.
classifier = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
classifier

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=0,
  shrinking=True, tol=0.001, verbose=False)

*Kernel SVM (Kernel is Radial Basis Function):*

In [10]:
from sklearn.svm import SVC

# Support vector classifier with a radial basis function (RBF) kernel.
svm_settings = {'kernel': 'rbf', 'random_state': 0}
classifier = SVC(**svm_settings)
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=0,
  shrinking=True, tol=0.001, verbose=False)

*Naive Bayes:*

In [11]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, fitted on the training set.
classifier = GaussianNB().fit(X_train, y_train)
classifier

GaussianNB(priors=None, var_smoothing=1e-09)

*Decision Tree:*

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree; the seed is fixed so repeated runs build the same tree.
classifier = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
classifier

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

*Random Forest (with 10 Trees):*

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Ensemble of 10 Gini-impurity trees with a fixed seed.
forest_settings = {'n_estimators': 10, 'criterion': 'gini', 'random_state': 0}
classifier = RandomForestClassifier(**forest_settings)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

## Predict the Test set results

In [14]:
# Predict labels for the held-out test features.
# `classifier` is reassigned by every model cell above, so this uses whichever
# model was fitted most recently.
y_pred = classifier.predict(X_test)

## Make the Confusion Matrix

Confusion Matrix gives us insight into our results.

In [15]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes and columns the predicted classes, with labels in
# sorted order (0 = malignant, 1 = benign under this notebook's encoding).
cm = confusion_matrix(y_test, y_pred)

# End the cell with the matrix so it is actually displayed; previously it was
# computed but never shown.
cm

## Apply k-Fold Cross Validation 

We apply k-Fold Cross Validation to evaluate the performance of our model. Here we choose k = 10.

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current `classifier` on the training set.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# Report the mean and the spread of the fold accuracies as percentages.
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print(f'Mean Accuracy: {mean_pct:.2f}%')
print(f'Standard deviation of Accuracy: {std_pct:.2f}%')

After testing all models, we finally obtain the following results table:

<center>
    
| Model | Mean Accuracy(%) | Standard deviation(%) |
|:---------------:|:---------------:|:--------------------:|
|Logistic Regression|96.81|2.14|
|KNN|96.48|2.60|
|Linear SVM|96.31|1.91|
|Kernel SVM|96.65|2.76|
|Naive Bayes|96.48|2.14|
|Decision Tree|93.77|2.24|
|Random Forest|95.98|3.23|

</center>
