# Classification Model Comparison

## Importing the pandas library

In [1]:
import pandas as pd

## Importing the dataset

This breast cancer dataset comes from the [UCI ML Repository – Breast Cancer Wisconsin (Original) Data Set](https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)). The data was prepared by Dr. William H. Wolberg of the University of Wisconsin Hospitals, Madison.

In [2]:
# Load the breast cancer records from the CSV file.
dataset = pd.read_csv('Breast_Cancer_Data.csv')

# Every column except the last one is used as a feature; the last column is
# the label.
# NOTE(review): if the CSV carries an identifier column (e.g. a sample code
# number) as its first column, it ends up in X as a feature -- confirm.
feature_columns = dataset.iloc[:, :-1]
label_column = dataset.iloc[:, -1]
X = feature_columns.values
y = label_column.values

## Splitting the original dataset into a training set and test set

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## Applying feature scaling

In [4]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both splits, so the test set plays no part in fitting.
sc = StandardScaler().fit(X_train)

# NOTE: X_train / X_test are overwritten here, so re-running this cell on its
# own would scale arrays that have already been scaled.
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Training the Decision Tree model 

In [5]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree that splits on the entropy criterion; the seed makes
# the fitted tree reproducible.
Dec_Tree_model = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
Dec_Tree_model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

## Training the K-NN model 

In [6]:
from sklearn.neighbors import KNeighborsClassifier

# 5 nearest neighbours; the Minkowski metric with p=2 is the Euclidean
# distance.
K_NNghr_model = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
K_NNghr_model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

## Training the SVM model with rbf kernel

In [7]:
from sklearn.svm import SVC

# Support vector classifier with the RBF kernel.
SVC_rbf_model = SVC(
    kernel='rbf',
    random_state=0,
)
SVC_rbf_model.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

## Training the Logistic Regression model 

In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library defaults, apart from the fixed seed.
Log_Reg_model = LogisticRegression(
    random_state=0,
)
Log_Reg_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Training the Naive Bayes model 

In [9]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings.
N_Bayes_model = GaussianNB()
N_Bayes_model.fit(
    X_train,
    y_train,
)

GaussianNB(priors=None, var_smoothing=1e-09)

## Training the Random Forest model 

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 10 trees, each split on the entropy criterion; the seed makes the
# fitted forest reproducible.
Ran_Forest_model = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
Ran_Forest_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

## Training the SVM model with linear kernel

In [11]:
# The import is repeated here so this cell can be run on its own.
from sklearn.svm import SVC

# Support vector classifier with a linear kernel.
SVC_lin_model = SVC(
    kernel='linear',
    random_state=0,
)
SVC_lin_model.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

## Evaluating model accuracy on the test set

In [12]:
from sklearn.metrics import accuracy_score

# Decision Tree: predict the held-out rows and score the fraction predicted
# correctly.
y_pred_DT = Dec_Tree_model.predict(X_test)
Dec_Tree_model_acc = accuracy_score(y_true=y_test, y_pred=y_pred_DT)

In [13]:
# K-NN model prediction
y_pred_KNN = K_NNghr_model.predict(X_test)
KNN_model_acc = accuracy_score(y_test, y_pred_KNN)

In [14]:
# SVM model with rbf kernel prediction
y_pred_KSVM = SVC_rbf_model.predict(X_test)
SVM_rbf_model_acc = accuracy_score(y_test, y_pred_KSVM)

In [15]:
# Logistic Regression model prediction
y_pred_LR = Log_Reg_model.predict(X_test)
Log_Reg_model_acc = accuracy_score(y_test, y_pred_LR)

In [16]:
# Naive Bayes model prediction
y_pred_NB = N_Bayes_model.predict(X_test)
N_Bayes_model_acc = accuracy_score(y_test, y_pred_NB)

In [17]:
# Random Forest model prediction
y_pred_RF = Ran_Forest_model.predict(X_test)
Ran_Forest_model_acc = accuracy_score(y_test, y_pred_RF)

In [18]:
# SVM model with linear kernel prediction
y_pred_SVC_lin = SVC_lin_model.predict(X_test)
SVC_lin_model_acc = accuracy_score(y_test, y_pred_SVC_lin)

In [19]:
# One (message template, accuracy) pair per model, in the order the models
# were trained; each accuracy is printed rounded to two decimals.
accuracy_report = [
    ("The accuracy of the Decision Tree model is {:.2f}", Dec_Tree_model_acc),
    ("The accuracy of the K-NN model is {:.2f}", KNN_model_acc),
    ("The accuracy of the SVM model with rbf kernel is {:.2f}", SVM_rbf_model_acc),
    ("The accuracy of the Logistic Regression model is {:.2f}", Log_Reg_model_acc),
    ("The accuracy of the Naive Bayes model is {:.2f}", N_Bayes_model_acc),
    ("The accuracy of the Random Forest model is {:.2f}", Ran_Forest_model_acc),
    ("The accuracy of the SVM model with linear kernel is {:.2f}", SVC_lin_model_acc),
]
for message_template, model_accuracy in accuracy_report:
    print(message_template.format(model_accuracy))

The accuracy of the Decision Tree model is 0.96
The accuracy of the K-NN model is 0.95
The accuracy of the SVM model with rbf kernel is 0.95
The accuracy of the Logistic Regression model is 0.95
The accuracy of the Naive Bayes model is 0.94
The accuracy of the Random Forest model is 0.94
The accuracy of the SVM model with linear kernel is 0.94
