In [1]:
#Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

In [2]:
#Import Dataset
dataset = pd.read_csv('\BreastCancer.csv')

# Shuffle the dataset so that the future splitting into k-fold is random.
#dataset = dataset.sample (frac = 1)

dataset.head()

Unnamed: 0,Serial,Cl.thickness,Cell.size,Cell.shape,Marg.adhesion,Epith.c.size,Bare.nuclei,Bl.cromatin,Normal.nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,0
1,1002945,5,4,4,5,7,10.0,3,2,1,0
2,1015425,3,1,1,1,2,2.0,3,1,1,0
3,1016277,6,8,8,1,3,4.0,3,7,1,0
4,1017023,4,1,1,3,2,1.0,3,1,1,0


In [3]:
dataset = dataset.dropna()
X = dataset.iloc[:,[i for i in range (1, 10)]].values
y = dataset.iloc[:,10].values

In [4]:
def LOOCV (x, y):
    """Leave one out cross validation. Uses logistic regression for classifying two classes."""
    
    #Feature Scaling
    sc_X=StandardScaler()
    x = sc_X.fit_transform(x)
    
    classifier = LogisticRegression(solver='lbfgs')
    mistake = 0
    
    for i in range(x.shape[0]):
        x_train = np.delete(x, i, axis = 0)
        y_train = np.delete(y, i, axis = 0)
        classifier.fit(x_train, y_train)
        
        x_test = x[i]
        # To reshape x_test into a 2-D array
        x_test = x_test.reshape(1, -1)
        
        y_pred = classifier.predict(x_test)
        
        if y_pred != y[i]:
            mistake += 1
            
    print ('mean accuracy for the test data is %0.4f' %(100 - 100*mistake / x.shape[0]), '%')

In [5]:
LOOCV (X, y)

mean accuracy for the test data is 96.6325 %


In [6]:
def kFCV (x, y, k):
    """k-Fold cross validation. Uses logistic regression for classifying two classes."""
    
    #Feature Scaling
    sc_X=StandardScaler()
    x = sc_X.fit_transform(x)
    
    classifier = LogisticRegression(solver='lbfgs')
    mistake = 0
    
    n = x.shape[0]
    d, r = n//k , n % k
    for i in range(0, n-r, d):
        fold_i = [j for j in range (i, i+d)]
        if i//d < r:    
            fold_i.append (n-1+(-i// d))

        x_train = np.delete(x, fold_i, axis = 0)
        y_train = np.delete (y, fold_i, axis = 0)
        classifier.fit(x_train, y_train)
        
        x_test = x[fold_i]
        y_pred = classifier.predict (x_test)
        for j in range (len (fold_i)):
            if y_pred[j] != y[fold_i][j]:
                mistake += 1
            
    print ('mean accuracy for the test data is %0.4f' %(100 - 100 *mistake / x.shape[0]), '%')    

In [7]:
for i in range (2, 11):
    print (kFCV (X, y, i))

mean accuracy for the test data is 96.4861 %
None
mean accuracy for the test data is 96.3397 %
None
mean accuracy for the test data is 96.3397 %
None
mean accuracy for the test data is 96.6325 %
None
mean accuracy for the test data is 96.3397 %
None
mean accuracy for the test data is 96.7789 %
None
mean accuracy for the test data is 96.6325 %
None
mean accuracy for the test data is 96.9253 %
None
mean accuracy for the test data is 96.7789 %
None
