# Regression model selection

## Importing the libraries

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, accuracy_score

## Importing the dataset

In [8]:
dataset = pd.read_csv('Data_classification.csv')
X = dataset.iloc[:, :-1].values #get all columns but the last
y = dataset.iloc[:, -1].values  #get the last column

In [9]:
print(dataset)

     Sample code number  Clump Thickness  Uniformity of Cell Size  \
0               1000025                5                        1   
1               1002945                5                        4   
2               1015425                3                        1   
3               1016277                6                        8   
4               1017023                4                        1   
..                  ...              ...                      ...   
678              776715                3                        1   
679              841769                2                        1   
680              888820                5                       10   
681              897471                4                        8   
682              897471                4                        8   

     Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                           1                  1                            2   
1        

## Splitting the dataset into the Training set and Test set

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Scaling the data

In [11]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Training the logistic regression classifier on the Training set

In [12]:
classifier_log_reg = LogisticRegression(random_state = 0)
classifier_log_reg.fit(X_train, y_train)
y_pred_log_reg = classifier_log_reg.predict(X_test)
cm_log_reg = confusion_matrix(y_test, y_pred_log_reg)

## Training the support vector classifier on the Training set

In [13]:
classifier_SVC = SVC(kernel = 'linear', random_state = 0)
classifier_SVC.fit(X_train, y_train)
y_pred_SVC = classifier_SVC.predict(X_test)
cm_SVC = confusion_matrix(y_test, y_pred_SVC)

## Training the kernel support vector classifier on the Training set

In [14]:
classifier_kernel_svm = SVC(kernel = 'rbf', random_state = 0)
classifier_kernel_svm.fit(X_train, y_train)
y_pred_kernel_svm = classifier_kernel_svm.predict(X_test)
cm_kernel_svm = confusion_matrix(y_test, y_pred_kernel_svm)

## Training the KNN classifier on the training set

In [15]:
classifier_KNN = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_KNN.fit(X_train, y_train)
y_pred_KNN = classifier_KNN.predict(X_test)
cm_KNN = confusion_matrix(y_test, y_pred_KNN)

## Training the naive bayes classifier on the training set

In [16]:
classifier_NB = GaussianNB()
classifier_NB.fit(X_train, y_train)
y_pred_NB = classifier_NB.predict(X_test)
cm_NB = confusion_matrix(y_test, y_pred_NB)

## Training the Decision Tree classifier model on the Training set

In [17]:
classifier_DT = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier_DT.fit(X_train, y_train)
y_pred_DT = classifier_DT.predict(X_test)
cm_DT = confusion_matrix(y_test, y_pred_DT)

## Training the Random Forest classifier on the training set

In [18]:
classifier_RF = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier_RF.fit(X_train, y_train)
y_pred_RF = classifier_RF.predict(X_test)
cm_RF = confusion_matrix(y_test, y_pred_DT)

## Evaluating the Model Performance

In [19]:
data = [['Logistic regression', accuracy_score(y_test, y_pred_log_reg), cm_log_reg[0][0], cm_log_reg[0][1], cm_log_reg[1][0], cm_log_reg[1][1]],
['Support vector', accuracy_score(y_test, y_pred_SVC), cm_SVC[0][0], cm_SVC[0][1], cm_SVC[1][0], cm_SVC[1][1]],
['Kernel support vector', accuracy_score(y_test, y_pred_kernel_svm), cm_kernel_svm[0][0], cm_kernel_svm[0][1], cm_kernel_svm[1][0], cm_kernel_svm[1][1]],
['K nearest neighbor', accuracy_score(y_test, y_pred_KNN), cm_KNN[0][0], cm_KNN[0][1], cm_KNN[1][0], cm_KNN[1][1]],
['Naive bayes', accuracy_score(y_test, y_pred_NB), cm_NB[0][0], cm_NB[0][1], cm_NB[1][0], cm_NB[1][1]],
['Decision tree', accuracy_score(y_test, y_pred_DT), cm_DT[0][0], cm_DT[0][1], cm_DT[1][0], cm_DT[1][1]],
['Random forest regression', accuracy_score(y_test, y_pred_RF), cm_RF[0][0], cm_RF[0][1], cm_RF[1][0], cm_RF[1][1]]] 
headers_1=["1", "2", "3", "4", "5", "6", "7"] 
headers_2=["Algorithm", "Accuracy", "TN", "FP", "FN", "TP"] 
print(pd.DataFrame(data, headers_1, headers_2))

                  Algorithm  Accuracy  TN  FP  FN  TP
1       Logistic regression  0.956204  84   3   3  47
2            Support vector  0.956204  83   4   2  48
3     Kernel support vector  0.956204  82   5   1  49
4        K nearest neighbor  0.956204  83   4   2  48
5               Naive bayes  0.948905  80   7   0  50
6             Decision tree  0.970803  85   2   2  48
7  Random forest regression  0.956204  85   2   2  48
