# Import data and libraries

In [1]:
import pandas as pd, numpy as np
from scipy.io import arff

In [2]:
data = arff.loadarff('veh-prime.arff')
df = pd.DataFrame(data[0])
df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f27,f28,f29,f30,f31,f32,f33,f34,f35,CLASS
0,0.063,0.16,0.509,-0.967,0.058,0.0,0.874,0.271,1.307,-0.011,...,-0.924,-0.077,0.108,-0.003,0.381,-0.314,0.929,0.184,-0.001,b'noncar'
1,-0.037,-0.325,-0.626,-0.029,0.121,-0.409,-0.002,-0.835,-0.595,-0.253,...,0.27,0.533,0.152,-0.978,0.157,0.011,-0.254,0.453,-0.621,b'noncar'
2,-0.0,1.253,0.833,-0.97,1.516,0.014,-0.378,1.197,0.546,-0.402,...,-0.408,1.55,0.01,-0.652,-0.403,-0.151,0.0,0.049,-0.113,b'car'
3,-0.743,-0.082,-0.626,0.723,-0.006,-0.0,-0.08,-0.297,0.166,0.311,...,0.819,-0.077,-0.099,-0.001,-0.291,1.633,0.686,1.528,-0.0,b'noncar'
4,-0.939,-1.054,-0.14,0.036,-0.766,0.0,-0.272,1.077,5.236,-0.366,...,0.676,0.533,-0.003,0.122,-0.179,-1.449,0.024,-1.698,0.083,b'noncar'


In [3]:
df.shape

(846, 37)

### Separate into X and y

In [4]:
X = df.iloc[:,:-1]
y_str = df.iloc[:,-1]
y = pd.get_dummies(y_str).iloc[:,:1]

y_clean = np.array(y.iloc[:,0])

len(y) == len(X) # check

True

# Z-score Normalization

In [5]:
X = (X - X.mean()) / X.std()

# Functions

### Pearson Coefficient

In [6]:
def pearson(x,y):
    
    sum_sq_x = 0
    sum_sq_y = 0 
    sum_coproduct = 0
    mean_x = 0
    mean_y = 0
    
    N = len(x)
    
    for i in range(N):
        sum_sq_x += x[i] * x[i]
        sum_sq_y += y[i] * y[i]
        sum_coproduct += x[i] * y[i]
        mean_x += x[i]
        mean_y += y[i]
        
    mean_x = mean_x / N
    mean_y = mean_y / N
    pop_sd_x = np.sqrt((sum_sq_x/N) - (mean_x * mean_x))
    pop_sd_y = np.sqrt((sum_sq_y / N) - (mean_y * mean_y))
    cov_x_y = (sum_coproduct / N) - (mean_x * mean_y)
    correlation = cov_x_y / (pop_sd_x * pop_sd_y)
    
    return correlation

### Majority Vote

In [7]:
def majority_vote(categories_list):
    
    """
    Takes in a list of votes of length k.
    Returns the value that consitutes the majority.
    """
    
    ones = categories_list.count(1)
    zeroes = categories_list.count(0)

    if ones > zeroes:
        result = 1
    else:
        result = 0
    
    return result

### Predict_Instance

In [8]:
def predict_instance(X_train, y_train, test_instance, k):
    
    """
    Takes in training features, training targets, testing features,
    a value for k, and a value for n, an index value 
    that determines the testing instance against which 
    the training instances are measured in terms of Eucledian distance.
    
    Sorts the list of distances in ascending order,
    keeping only the first k of the them in a new list,
    with which it computes a majority vote. 
    
    Returns the result of the majority vote. 
    """
    
    distances = []
    targets = []
    
    # Get the distances for one instance
    test = np.array(y_train)
    train = np.array(X_train)
    instance = np.array(test_instance)
    
    #1st for loop
    for i in range(len(train)):
    
        dist = np.sqrt(np.sum(np.square(instance - train[i]))) # removed [n] from after instance
        distances.append([dist,i])
     
    distances = sorted(distances)
    
    #2nd for loop
    for i in range(k):
        index = distances[i][1]
        targets.append(y_train[index])
    
    # majority vote
    return majority_vote(targets)


### Accuracy

In [9]:
def KNN_accuracy(pred_X, t_test):
    
    """
    Takes in predicted and actual values for test instances.
    Returns an accuracy score.
    """
    
    test_target = np.array(t_test)
    pred_target = np.array(pred_X)
    
    accuracy_score = 0
    
    for i in range(len(pred_X)):
        if test_target[i] == pred_target[i]:
            accuracy_score = accuracy_score + 1
        else:
            accuracy_score = accuracy_score + 0
        
    return (accuracy_score / len(pred_X))

# Question 3

### Get PCC scores for all features then sort them in descending order

In [10]:
a = y_clean
filt_feat = []

for i in range(X.shape[1]):
    b = np.array(X.iloc[:,i])
    val = np.abs(pearson(a,b)) # absolute value of R
    filt_feat.append([val,i])

filt_sort = sorted(filt_feat, reverse=True)

### Turn results into Data Frame to display 3.a)

In [11]:
rank = np.array(list(range(1,37)))
feat_n = []
r_score = []
for i in range(len(filt_sort)):
    feat_n.append(filt_sort[i][1])
    r_score.append(filt_sort[i][0])

In [12]:
Filter_Ranks = pd.DataFrame(data=np.column_stack((rank, feat_n, r_score)),columns=['Rank','Feature #','R_score'])
Filter_Ranks["Feature #"] = Filter_Ranks["Feature #"].astype(int)
Filter_Ranks["Rank"] = Filter_Ranks["Rank"].astype(int)

# Answer for 3.a.

In [13]:
Filter_Ranks

Unnamed: 0,Rank,Feature #,R_score
0,1,4,0.436922
1,2,13,0.368269
2,3,14,0.368224
3,4,16,0.366025
4,5,7,0.352141
5,6,22,0.35135
6,7,26,0.341043
7,8,1,0.308811
8,9,20,0.299049
9,10,31,0.290783


### Testing LOOCV

Next Step is how to put more than one feature in there 

### LOOCV

In [14]:
def LOOCV(X,y,f):
    
    '''f for feature number'''
    
    pred = []
    
    for i in range(len(y)):
        
        bad_ix = X.index.isin([i])
        
        X_swept = X[~bad_ix]
        y_swept = y[~bad_ix]
        
        y_clean = np.array(y.iloc[:,0])
        y_swept_clean = np.array(y_swept.iloc[:,0])

        Xtrain1 = X_swept.iloc[:,f] # z
        ytrain1 = y_swept_clean # all but z
        
        Xtest1 = X.iloc[i,f]
        ytest1 = y_clean[i] # all but z

        y_pred = predict_instance(Xtrain1, ytrain1, Xtest1, 7)
        
        pred.append(y_pred)
        
    return pred
    

### m = 1 LOOCV

In [15]:
m1 = LOOCV(X,y,4)
KNN_accuracy(m1,y_clean)

0.7222222222222222

### Iteratively

In [20]:
mlist = []
for i in range(1,36):
    m = LOOCV(X,y,Filter_Ranks["Feature #"][:i])
    a = KNN_accuracy(m,y_clean)
    mlist.append([a,i])

In [21]:
sorted(mlist, reverse=True)

[[0.925531914893617, 20],
 [0.9243498817966903, 18],
 [0.91725768321513, 19],
 [0.9113475177304965, 21],
 [0.9101654846335697, 17],
 [0.9089834515366431, 15],
 [0.9042553191489362, 22],
 [0.9042553191489362, 12],
 [0.9018912529550828, 11],
 [0.8971631205673759, 16],
 [0.8947990543735225, 23],
 [0.8947990543735225, 9],
 [0.8912529550827423, 14],
 [0.8888888888888888, 13],
 [0.8853427895981087, 10],
 [0.8841607565011821, 24],
 [0.8794326241134752, 26],
 [0.8782505910165485, 25],
 [0.875886524822695, 27],
 [0.8747044917257684, 8],
 [0.8699763593380615, 28],
 [0.8617021276595744, 7],
 [0.8546099290780141, 29],
 [0.8498817966903073, 30],
 [0.8380614657210402, 31],
 [0.8368794326241135, 6],
 [0.83451536643026, 32],
 [0.83451536643026, 5],
 [0.83451536643026, 4],
 [0.8274231678486997, 34],
 [0.8262411347517731, 33],
 [0.8226950354609929, 3],
 [0.8108747044917257, 35],
 [0.7955082742316785, 2],
 [0.7222222222222222, 1]]

# Answer 3.b.

### Optimal m = 20

In [22]:
m20 = LOOCV(X,y,Filter_Ranks["Feature #"][:20])
KNN_accuracy(m20,y_clean)

0.925531914893617