In [62]:
# Record the notebook's start time so later cells can report elapsed time.
import time
# time.clock() was removed in Python 3.8. It is still used where it exists so
# that every timing cell reads the same clock; on newer interpreters fall back
# to time.perf_counter().
start_time = getattr(time, "clock", time.perf_counter)()

In [63]:
import csv
import random
import math
import operator
import pandas as pd
import numpy as np
from operator import itemgetter

#load datasets
def _read_numeric_rows(path):
    """Read one whitespace-delimited text file into a list of numeric rows.

    Each non-blank line is split on whitespace and becomes one row. The number
    of columns is taken from the first row, and every column is converted with
    pd.to_numeric.

    Raises:
        ValueError: if the file contains no non-blank lines.
    """
    # 'utf-8-sig' drops a leading byte-order mark if the file has one; the
    # context manager guarantees the handle is closed.
    with open(path, "r", encoding='utf-8-sig') as handle:
        # Blank lines are skipped: they would otherwise become all-NaN rows.
        rows = [line.split() for line in handle if line.strip()]
    if not rows:
        raise ValueError("no data rows found in %r" % (path,))

    frame = pd.DataFrame(rows, index=None, columns=list(range(len(rows[0]))))
    return frame.apply(pd.to_numeric).values.tolist()


def loadDataset (trainfile, testfile):
    """Load the training and test files as lists of numeric rows.

    Args:
        trainfile: path of the whitespace-delimited training file.
        testfile: path of the whitespace-delimited test file.

    Returns:
        (trainingSet, testSet): two lists of rows, each row a list of numbers
        in the column order of the file.
    """
    trainingSet = _read_numeric_rows(trainfile)
    testSet = _read_numeric_rows(testfile)
    return trainingSet, testSet


In [64]:
#calculate Euclidean distances
def euclidean_distance(test_sample, train_set, length):
    """Return the Euclidean distance over the first `length` positions.

    Args:
        test_sample: indexable sequence of numbers.
        train_set: indexable sequence of numbers (despite the name, the code
            indexes it as a single row).
        length: how many leading positions to compare.

    Returns:
        The square root of the summed squared differences.
    """
    squared_total = sum(
        np.square(test_sample[pos] - train_set[pos]) for pos in range(length)
    )
    return np.sqrt(squared_total)


In [65]:
#select neighbors based on calculation
def neighbors(training_set, test_sample, K):
    """Return the K rows of training_set that lie closest to test_sample.

    The last element of test_sample is excluded from the distance: only the
    first len(test_sample) - 1 positions are compared. Rows are ranked by
    ascending Euclidean distance; sorted() is stable, so rows at equal
    distance keep their relative order from training_set.

    Args:
        training_set: sequence of rows.
        test_sample: the row to find neighbours for.
        K: number of rows to return.

    Returns:
        A list with the K nearest rows, nearest first.
    """
    compared_positions = len(test_sample) - 1

    def distance_to_sample(row):
        return euclidean_distance(test_sample, row, compared_positions)

    ranked_rows = sorted(training_set, key=distance_to_sample)
    # Index explicitly (rather than slice) so asking for more rows than exist
    # still raises IndexError.
    return [ranked_rows[rank] for rank in range(K)]


In [66]:
# Report the time elapsed since start_time was recorded.
# time.clock() was removed in Python 3.8: use it only where it still exists
# (the same clock start_time was taken from on those interpreters), otherwise
# fall back to time.perf_counter().
print(getattr(time, "clock", time.perf_counter)() - start_time, "seconds")

0.07744399999999985 seconds


In [67]:
#classification with variable priors
def classification(neighbor,prior0,training_set):
    """Classify one sample from its nearest neighbours with an adjustable prior.

    For each class c in {0, 1} the score is

        ((k_c / k) * prior_c) / (n_c / n)

    where k_c is the number of neighbours labelled c, k the number of
    neighbours, n_c the number of training rows labelled c, n the number of
    training rows, and prior_1 = 1 - prior0. The label is read from the last
    column of each row.

    Args:
        neighbor: non-empty list of neighbour rows.
        prior0: prior probability assigned to class 0.
        training_set: non-empty list of training rows.

    Returns:
        (cls, prior0): cls is 0 when the class-0 score is strictly larger,
        otherwise 1 (ties go to class 1); prior0 is returned unchanged.
    """
    n = len(training_set)
    train_label_col = len(training_set[0]) - 1
    n0 = sum(1 for row in training_set if row[train_label_col] == 0)
    n1 = sum(1 for row in training_set if row[train_label_col] == 1)

    k = len(neighbor)
    neighbor_label_col = len(neighbor[0]) - 1
    k0 = sum(1 for row in neighbor if row[neighbor_label_col] == 0)
    k1 = sum(1 for row in neighbor if row[neighbor_label_col] == 1)

    prior1 = 1 - prior0
    # A class with no training rows would divide by zero; score it 0 instead
    # so the sample can still be classified.
    prob0 = ((k0/k)*(prior0))/(n0/n) if n0 else 0.0
    prob1 = ((k1/k)*prior1)/(n1/n) if n1 else 0.0

    cls = 0 if prob0 > prob1 else 1
    return cls, prior0


In [68]:
# Report the time elapsed since start_time was recorded.
# time.clock() was removed in Python 3.8: use it only where it still exists
# (the same clock start_time was taken from on those interpreters), otherwise
# fall back to time.perf_counter().
print(getattr(time, "clock", time.perf_counter)() - start_time, "seconds")

0.10956099999999935 seconds


In [69]:
#run KNN altogether for predictions
def KNN(training_set,test_set,K,prior0):
    """Predict a class for every row of test_set.

    For each test row the K nearest training rows are selected with
    neighbors() and passed to classification() together with prior0.

    Args:
        training_set: sequence of training rows.
        test_set: sequence of rows to classify.
        K: number of neighbours per test row.
        prior0: class-0 prior forwarded to classification().

    Returns:
        (prior0, prediction): the prior as handed back by the last
        classification() call (the input value when test_set is empty) and
        the list of predicted labels in test_set order.
    """
    prediction = []
    for sample in test_set:
        nearest_rows = neighbors(training_set, sample, K)
        label, prior0 = classification(nearest_rows, prior0, training_set)
        prediction.append(label)
    return prior0, prediction



In [70]:
# Report the time elapsed since start_time was recorded.
# time.clock() was removed in Python 3.8: use it only where it still exists
# (the same clock start_time was taken from on those interpreters), otherwise
# fall back to time.perf_counter().
print(getattr(time, "clock", time.perf_counter)() - start_time, "seconds")

0.12535199999999946 seconds


In [71]:
#calculate TP,TN,FP,FN values
def performance_measure(test_set, predicted_class):
    """Count confusion-matrix cells for binary predictions.

    The true label of each test row is read from its last column (the column
    index is taken from the first row). Class 1 is treated as positive and
    class 0 as negative; predictions that are neither 0 nor 1 are not counted.

    Args:
        test_set: non-empty sequence of rows whose last column is the label.
        predicted_class: predicted labels, aligned with test_set.

    Returns:
        [TP, FP, TN, FN] as a list of counts (note the order).
    """
    label_col = len(test_set[0]) - 1
    true_class = [row[label_col] for row in test_set]

    TP = TN = FP = FN = 0
    for i, predicted in enumerate(predicted_class):
        actual = true_class[i]
        if predicted == 1:
            if actual == predicted:
                TP += 1
            else:
                FP += 1
        elif predicted == 0:
            if actual == predicted:
                TN += 1
            else:
                FN += 1

    return [TP, FP, TN, FN]


In [72]:
# Run KNN on each of the three dataset variants (nX, pX, fX). Each run uses
# its own number of neighbours (15, 11, 7) and passes 132/200 as prior0.
# The [TP, FP, TN, FN] counts are bound to both the performance_* name and
# its short alias (knn, knnP, knnF).
trainingSet, testSet = loadDataset('nX_PIMA_TR.txt', 'nX_PIMA_TE.txt')
prior0, predictions = KNN(trainingSet, testSet, 15, 132/200)
performance_nX = knn = performance_measure(testSet, predictions)

trainingSetp, testSetp = loadDataset('pX_PIMA_TR.txt', 'pX_PIMA_TE.txt')
prior0p, predictionsp = KNN(trainingSetp, testSetp, 11, 132/200)
performance_pX = knnP = performance_measure(testSetp, predictionsp)

trainingSetf, testSetf = loadDataset('fX_PIMA_TR.txt', 'fX_PIMA_TE.txt')
prior0f, predictionsf = KNN(trainingSetf, testSetf, 7, 132/200)
performance_fX = knnF = performance_measure(testSetf, predictionsf)

# print (knn)
# print (knnP)
# print (knnF)

In [73]:
#calculate accuracy (the probability of a correct decision)
def getAccuracy(performance_list):
    """Return the fraction of correct decisions.

    Args:
        performance_list: counts ordered [TP, FP, TN, FN], as produced by
            performance_measure().

    Returns:
        (TP + TN) divided by the sum of all counts.
    """
    correct = performance_list[0] + performance_list[2]
    total = sum(performance_list)
    return correct / total

In [74]:
# Accuracy of the nX, pX and fX runs, in that order.
acc_knn_N, acc_knn_P, acc_knn_F = map(getAccuracy, (knn, knnP, knnF))

# print (acc_knn_N)
# print (acc_knn_P)
# print (acc_knn_F)

In [75]:
#calculate accuracy for different K
def vary_k(trainingSet,testSet,prior0,k_values=None):
    """Compute test accuracy of KNN for a series of neighbour counts.

    Args:
        trainingSet: sequence of training rows.
        testSet: sequence of test rows (last column is the label).
        prior0: class-0 prior forwarded to KNN().
        k_values: optional iterable of neighbour counts to evaluate. When
            omitted, 1 through 15 are used, as before.

    Returns:
        (K, accuracy): the list of neighbour counts and the accuracy obtained
        for each, in the same order.
    """
    K = list(range(1, 16)) if k_values is None else list(k_values)
    accuracy = []
    for k in K:
        prior0, predictions = KNN(trainingSet, testSet, k, prior0)
        counts = performance_measure(testSet, predictions)
        accuracy.append(getAccuracy(counts))
    return K, accuracy

# vary_k (trainingSet,testSet,1)

In [76]:
#calculate sensitivity,specificity, TPR, FPR
def performance(performance_list):
    """Derive rate metrics from confusion-matrix counts.

    Args:
        performance_list: counts ordered [TP, FP, TN, FN].

    Returns:
        [sensitivity, specificity, TPR, FPR] where
        sensitivity = TP / (TP + FN), specificity = TN / (TN + FP),
        TPR is the sensitivity and FPR is 1 - specificity.
    """
    true_pos = performance_list[0]
    false_pos = performance_list[1]
    true_neg = performance_list[2]
    false_neg = performance_list[3]

    sensitivity = true_pos / (true_pos + false_neg)
    specificity = true_neg / (true_neg + false_pos)
    return [sensitivity, specificity, sensitivity, 1 - specificity]

# print(performance(knn))
# print(performance(knnP))
# print(performance(knnF))

In [77]:
#calculate sensitivity,specificity, TPR, FPR for different prior probabilities
def perf_list(trainingSet,testSet,k):
    """Evaluate KNN over a sweep of class-0 priors.

    The priors run from 0.005 upward in steps of 0.005 and are capped at 1.0.

    Args:
        trainingSet: sequence of training rows.
        testSet: sequence of test rows (last column is the label).
        k: number of neighbours passed to KNN().

    Returns:
        A list with one [sensitivity, specificity, TPR, FPR] entry per prior,
        in increasing-prior order.
    """
    priors = []
    x = 0
    # Stop with `x < 1` (not `<= 1`): the old condition could step on to about
    # 1.005. This is the same accumulation plot_sens_spec uses for its x-axis,
    # so both produce the same number of points.
    while x < 1:
        x += .005
        # Floating-point accumulation can land a hair above 1; cap it so
        # 1 - prior0 never goes negative.
        priors.append(min(x, 1.0))

    perf = []
    for p in priors:
        _, predictions = KNN(trainingSet, testSet, k, p)
        counts = performance_measure(testSet, predictions)
        perf.append(performance(counts))

    return perf

# perf_list(trainingSet,testSet,12)

In [78]:
import matplotlib.pyplot as plt
from sklearn.metrics import auc

#plot ROC curve and calculate the area under the curve
def plot_roc(perf_list):
    """Plot an ROC curve and report its area in the legend.

    Args:
        perf_list: sequence of [sensitivity, specificity, TPR, FPR] entries;
            positions 2 (TPR) and 3 (FPR) are plotted. Note that inside this
            function the parameter shadows the perf_list() function.

    Returns:
        None; the figure is shown with plt.show().
    """
    plt.figure(figsize=(6,5))
    line_width = 2

    TPR = [entry[2] for entry in perf_list]
    FPR = [entry[3] for entry in perf_list]

    # Area under the (FPR, TPR) curve.
    roc_auc = auc(FPR, TPR)

    plt.plot(FPR, TPR, color='darkorange', lw=line_width,
             label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curve')
    plt.legend(loc="lower right")
    plt.show()


In [79]:
#plot sensitivity and specificity curve against different priors
def plot_sens_spec(perf_list):
    """Plot sensitivity and specificity against the prior sweep.

    Args:
        perf_list: sequence of [sensitivity, specificity, TPR, FPR] entries,
            one per prior; positions 0 and 1 are plotted. Assumes the entries
            were computed on a prior grid starting at 0.005 with step 0.005
            (the grid perf_list() builds) - confirm for other inputs. Inside
            this function the parameter shadows the perf_list() function.

    Returns:
        None; the figure is shown with plt.show().
    """
    plt.figure(figsize=(6,5))
    lw = 2

    # One x value per entry (0.005, 0.010, ...). Deriving the axis from the
    # data length guarantees x and y always have the same number of points,
    # which a separately accumulated float loop could not.
    prior_grid = [.005 * (i + 1) for i in range(len(perf_list))]

    sensitivity = [entry[0] for entry in perf_list]
    specificity = [entry[1] for entry in perf_list]

    plt.plot(prior_grid, sensitivity, 'darkorange', lw=lw, label='sensitivity')
    plt.plot(prior_grid, specificity, 'b', lw=lw, label='specificity')

    plt.xlabel('Prior Probabilities')
    plt.legend(loc="lower right")
    plt.title('Sensitivity & Specificity Curve for Different Prior Probabilities (KNN)')
    plt.show()

In [80]:
#plot accuracy for different K
def plot_k(trainingSet,testSet,prior0):
    """Plot test accuracy against the number of neighbours.

    The k values and accuracies come from vary_k(trainingSet, testSet, prior0).

    Returns:
        None; the figure is shown with plt.show().
    """
    k_values, accuracies = vary_k(trainingSet, testSet, prior0)

    plt.figure(figsize=(6,5))
    plt.plot(k_values, accuracies, 'ro-', lw=2)
    plt.xlabel('k values')
    plt.ylabel('Accuracy')
    plt.title('Performance Curve for Different k values')
    plt.show()

In [81]:
#calculate sensitivity and specificity curve against different eigenvalues in PCA
def vary_eigVal(trainfile,testfile,k=11,prior0=132/200):
    """Load one train/test file pair and return its KNN rate metrics.

    Args:
        trainfile: path of the training file, passed to loadDataset().
        testfile: path of the test file, passed to loadDataset().
        k: number of neighbours for KNN(); defaults to 11, the value that was
            previously hard-coded.
        prior0: class-0 prior for KNN(); defaults to 132/200, the value that
            was previously hard-coded.

    Returns:
        [sensitivity, specificity, TPR, FPR] as computed by performance().
    """
    trainingSet, testSet = loadDataset (trainfile,testfile)
    _, predictions = KNN(trainingSet,testSet,k,prior0)
    counts = performance_measure(testSet, predictions)
    return performance(counts)

# Rate metrics for each pX file pair; the file-name suffix runs from 0.01 (p1)
# to 0.7 (p7).
p1, p2, p3, p4, p5, p6, p7 = (
    vary_eigVal(train_path, test_path)
    for train_path, test_path in (
        ('pX_PIMA_TR_0.01.txt', 'pX_PIMA_TE_0.01.txt'),
        ('pX_PIMA_TR_0.05.txt', 'pX_PIMA_TE_0.05.txt'),
        ('pX_PIMA_TR_0.1.txt', 'pX_PIMA_TE_0.1.txt'),
        ('pX_PIMA_TR_0.2.txt', 'pX_PIMA_TE_0.2.txt'),
        ('pX_PIMA_TR_0.4.txt', 'pX_PIMA_TE_0.4.txt'),
        ('pX_PIMA_TR_0.5.txt', 'pX_PIMA_TE_0.5.txt'),
        ('pX_PIMA_TR_0.7.txt', 'pX_PIMA_TE_0.7.txt'),
    )
)


In [82]:
#plot sensitivity and specificity curve against different eigenvalues in PCA
def plot_vary_eigVal(p1,p2,p3,p4,p5,p6,p7):
    """Plot sensitivity and specificity of seven result lists.

    Each argument is indexed at 0 (sensitivity) and 1 (specificity). The
    results are plotted in reverse argument order: x = 1 shows p7 and x = 7
    shows p1. No new figure is created here, so the lines are drawn on the
    current matplotlib figure.

    Returns:
        None; the figure is shown with plt.show().
    """
    results_in_plot_order = (p7, p6, p5, p4, p3, p2, p1)
    e = list(range(1, 8))
    sensitivity = [result[0] for result in results_in_plot_order]
    specificity = [result[1] for result in results_in_plot_order]

    plt.plot(e, sensitivity, 'darkorange', lw=2, label='sensitivity')
    plt.plot(e, specificity, 'b', lw=2, label='specificity')

    plt.xlabel('Number of Eigenvalues')
    plt.legend(loc="lower right")
    plt.title('Sensitivity & Specificity Curve for Different Eigenvalues (KNN)')
    plt.show()

K-fold validation on the training set

In [83]:
#K-fold validation on the training set
#split the training set into train and test
def kfold_split(n_split, x_len):
    """Build train/test index lists for k-fold cross-validation.

    The positions 0 .. x_len-1 are cut into n_split consecutive test folds.
    When x_len is not a multiple of n_split, the first x_len % n_split folds
    get one extra position, so every position appears in exactly one test
    fold. (Previously the leftover positions were never tested.)

    Args:
        n_split: number of folds; must satisfy 1 <= n_split <= x_len.
        x_len: number of rows to split.

    Returns:
        A list of n_split [train_indices, test_indices] pairs; both index
        lists are in ascending order.

    Raises:
        ValueError: if n_split is smaller than 1 or larger than x_len, since
            that would produce empty test folds.
    """
    if n_split < 1 or n_split > x_len:
        raise ValueError(
            "n_split must be between 1 and x_len (got n_split=%r, x_len=%r)"
            % (n_split, x_len)
        )

    base_size, extra = divmod(x_len, n_split)
    index = []
    start = 0
    for fold in range(n_split):
        stop = start + base_size + (1 if fold < extra else 0)
        test_idx = list(range(start, stop))
        # Everything outside [start, stop), kept in ascending order.
        train_idx = list(range(0, start)) + list(range(stop, x_len))
        index.append([train_idx, test_idx])
        start = stop
    return index

# Example split: 10 folds over 200 row positions.
index = kfold_split(n_split=10, x_len=200)
# print(index)

In [84]:
#get mean accuracy of all K-folds for different k in KNN
def cross_validation_score(trainingSet, n_split,prior0):
    """Return the mean cross-validated accuracy for each k tried by vary_k().

    The training rows are split with kfold_split(); for every fold, vary_k()
    is run with the fold's train rows as training data and its test rows as
    test data. The accuracies are then averaged over all folds.

    Args:
        trainingSet: list of training rows (last column is the label).
        n_split: number of folds.
        prior0: class-0 prior forwarded to vary_k().

    Returns:
        A list with one mean accuracy per k, in the order vary_k() returns
        its k values.
    """
    X = pd.DataFrame(trainingSet).values

    fold_accuracies = []
    K = []
    for train_index, test_index in kfold_split(n_split, len(X)):
        K, ac = vary_k(X[train_index], X[test_index], prior0)
        fold_accuracies.append(ac)

    # Average over every fold actually run; the previous version always
    # averaged exactly the first five folds regardless of n_split.
    n_folds = len(fold_accuracies)
    mean_k = [
        sum(ac[pos] for ac in fold_accuracies) / n_folds
        for pos in range(len(K))
    ]
    return mean_k


In [85]:
#plot validation accuracy with classification accuracy
def validation_plot(trainingSet,testSet,n_split,prior0):
    """Plot test-set accuracy and cross-validation accuracy against k.

    The 'classification accuracy' curve comes from
    vary_k(trainingSet, testSet, prior0); the 'validation accuracy' curve
    comes from cross_validation_score(trainingSet, n_split, prior0).

    Returns:
        None; the figure is shown with plt.show().
    """
    k_values, test_accuracy = vary_k(trainingSet, testSet, prior0)
    cv_accuracy = cross_validation_score(trainingSet, n_split, prior0)

    plt.figure(figsize=(6,5))
    plt.plot(k_values, test_accuracy, 'ro-', lw=2, label='classification accuracy')
    plt.plot(k_values, cv_accuracy, 'bo-', lw=2, label='validation accuracy')
    plt.xlabel('k values')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.title('Performance Curve for Different k values')
    plt.show()
    


In [86]:
####check with built in classifier of sklearn for model accuracy
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import KFold # import KFold
# k_range=range(1,15)
# #define a list
# k_results=[]
# train = pd.DataFrame(trainingSet)
# col = len (train.columns)
# X = train.iloc[:,0:col-1].values
# y = train.iloc[:,col-1].values

# for k in k_range:
#     knn=KNeighborsClassifier(n_neighbors=k)
#     #K-Fold Cross Validation
#     from sklearn.model_selection import cross_val_score
#     #define an object named 'results' to store the accuracy scores
#     #no. of iterations=5
#     results=cross_val_score(knn,X,y,cv=5,scoring='accuracy')
#     #print(results)
#     #store the mean value for each k (k_range covers k=1 to 14) in a list
#     k_results.append(results.mean())
# #print all mean values for k=1 to 14
# print(k_results)


output cells

uncomment for output

In [87]:
# plot_vary_eigVal(p1,p2,p3,p4,p5,p6,p7)

In [92]:
# perf = perf_list (trainingSet,testSet,15)
# plot_roc (perf)

In [176]:
# perfp = perf_list (trainingSetp,testSetp,11)
# plot_roc (perfp)

In [93]:
# perff = perf_list (trainingSetf,testSetf,7)
# plot_roc (perff)

In [174]:
# plot_sens_spec(perf)

In [173]:
# plot_sens_spec(perfp)

In [94]:
# plot_sens_spec(perff)

In [171]:
# validation_plot(trainingSet,testSet,10,132/200)

In [170]:
# validation_plot(trainingSetp,testSetp,10,132/200)

In [95]:
# validation_plot(trainingSetf,testSetf,10,132/200)