# Import Data and Necessary Packages

In [6]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, math

In [7]:
# Training split: the first 57 columns are taken as the features and the
# last column as the target.
# NOTE(review): unlike the test split, this file is read without index_col=0;
# confirm 'spam_train.csv' has no leading index column, otherwise the
# 57-column feature slice would be shifted by one.
train = pd.read_csv('spam_train.csv')
f_train, t_train = train.iloc[:, :57], train.iloc[:, -1]

In [8]:
# Test split: column 0 of the CSV becomes the row index; of the remaining
# columns, the first 57 are the features and the last one is the target.
test = pd.read_csv('spam_test.csv', index_col = 0)
f_test, t_test = test.iloc[:, :57], test.iloc[:, -1]

### Z-Score Normalized Features

In [9]:
# Both splits are scaled with the TRAINING mean and standard deviation, so the
# test features are expressed in the same units as the training features.
# NOTE(review): a constant training column would have std 0 and produce
# NaN/inf here; confirm no such column exists.
train_mean, train_std = f_train.mean(), f_train.std()
norm_f_train = (f_train - train_mean) / train_std
norm_f_test = (f_test - train_mean) / train_std

# Functions

### Accuracy

In [10]:
def KNN_accuracy(pred_X, test_data_y):
    
    """
    Computes the fraction of predictions that match the actual targets.

    Parameters
    ----------
    pred_X : sequence
        Predicted class labels, one per test instance.
    test_data_y : sequence
        Actual class labels, in the same order as pred_X.

    Returns
    -------
    float
        Number of positions where prediction and target agree,
        divided by the number of predictions.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same length.
    """
    
    # Score against the targets that were passed in rather than a
    # notebook-level variable, so the function works for any target vector.
    test_target = np.asarray(test_data_y).ravel()
    pred_target = np.asarray(pred_X).ravel()
    
    if len(pred_target) != len(test_target):
        raise ValueError(
            "pred_X and test_data_y must have the same length "
            "(got %d and %d)" % (len(pred_target), len(test_target))
        )
    if len(pred_target) == 0:
        raise ValueError("cannot compute accuracy of an empty prediction list")
    
    # The mean of the element-wise equality mask is matches / total.
    return float(np.mean(pred_target == test_target))

### Majority Vote

In [11]:
def majority_vote(categories_list):
    
    """
    Decides a binary vote.

    Takes in a list of votes (entries equal to 1 or 0 are counted).
    Returns 1 when the 1-votes strictly outnumber the 0-votes,
    otherwise 0 (so a tie, or an empty list, yields 0).
    """
    
    votes_for_one = categories_list.count(1)
    votes_for_zero = categories_list.count(0)
    
    return 1 if votes_for_one > votes_for_zero else 0

### Predict Instance

In [12]:
def predict_instance(X_train, y_train, test_instance, k, n):
    
    """
    Predicts the class of one test instance by k-nearest-neighbour vote.

    Parameters
    ----------
    X_train : array-like
        Training features, one row per training instance.
    y_train : array-like
        Training targets, in the same row order as X_train.
    test_instance : array-like
        Testing features, one row per test instance.
    k : int
        Number of nearest training instances that take part in the vote.
    n : int
        Row position in test_instance of the instance to classify.

    The Euclidean distance from test row n to every training row is
    computed, the k closest training rows are selected (equal distances
    resolve to the lower training row position), and their targets are
    passed to majority_vote.

    Returns
    -------
    The result of the majority vote.

    Raises
    ------
    ValueError
        If k is smaller than 1 or larger than the number of training rows.
    """
    
    train_features = np.asarray(X_train)
    # Targets are converted to a plain array so neighbours are looked up by
    # row POSITION; indexing a pandas Series with [] is label-based and would
    # pick the wrong rows (or fail) when its index is not 0..N-1.
    train_targets = np.asarray(y_train)
    query = np.asarray(test_instance)[n]
    
    n_train = len(train_features)
    if not 1 <= k <= n_train:
        raise ValueError(
            "k must be between 1 and the number of training rows "
            "(%d), got %r" % (n_train, k)
        )
    
    # Distances to all training rows at once; the reshape lets the same
    # code handle single-feature (1-D) training data.
    diffs = (query - train_features).reshape(n_train, -1)
    distances = np.sqrt(np.sum(np.square(diffs), axis=1))
    
    # A stable sort keeps the lower row position first among equal
    # distances, which is what sorting [dist, i] pairs did.
    nearest = np.argsort(distances, kind="stable")[:k]
    targets = train_targets[nearest].tolist()
    
    # majority vote
    return majority_vote(targets)


# KNN Classifier

In [13]:
def KNN_Classifier(X_train, y_train, X_test, k):
    
    """
    Classifies every row of a testing set with k-nearest neighbours.

    Parameters
    ----------
    X_train : array-like
        Training features, one row per training instance.
    y_train : array-like
        Training targets, in the same row order as X_train.
    X_test : array-like
        Testing features, one row per test instance.
    k : int
        Number of neighbours used for each prediction.

    Returns
    -------
    list
        One predicted value per row of X_test, in row order.
    """
    
    # Convert once here instead of once per test row inside predict_instance.
    train_features = np.asarray(X_train)
    # Use the targets that were passed in, not a notebook-level variable.
    train_targets = np.asarray(y_train)
    test_features = np.asarray(X_test)
    
    return [
        predict_instance(train_features, train_targets, test_features, k, n)
        for n in range(len(test_features))
    ]

# Questions 1.a.b

# $K = 1$

In [14]:
# k = 1 predictions on the raw (un-normalized) features.
pred_K1 = KNN_Classifier(X_train=f_train, y_train=t_train, X_test=f_test, k=1)

In [15]:
# Test accuracy of the raw-feature k = 1 predictions.
KNN_accuracy(pred_X=pred_K1, test_data_y=t_test)

0.7522816166883963

### Normalized $K = 1$

In [16]:
# k = 1 predictions on the z-score normalized features.
norm_K1 = KNN_Classifier(X_train=norm_f_train, y_train=t_train, X_test=norm_f_test, k=1)

In [17]:
# Test accuracy of the normalized-feature k = 1 predictions.
KNN_accuracy(pred_X=norm_K1, test_data_y=t_test)

0.8561495002172969

# $K = 5$

In [18]:
# k = 5 predictions on the raw (un-normalized) features.
pred_K5 = KNN_Classifier(X_train=f_train, y_train=t_train, X_test=f_test, k=5)

In [19]:
# Test accuracy of the raw-feature k = 5 predictions.
KNN_accuracy(pred_X=pred_K5, test_data_y=t_test)

0.7548891786179922

### Normalized $K = 5$

In [20]:
# k = 5 predictions on the z-score normalized features.
norm_K5 = KNN_Classifier(X_train=norm_f_train, y_train=t_train, X_test=norm_f_test, k=5)

In [21]:
# Test accuracy of the normalized-feature k = 5 predictions.
KNN_accuracy(pred_X=norm_K5, test_data_y=t_test)

0.8700564971751412

# $K = 11$

In [22]:
# k = 11 predictions on the raw (un-normalized) features.
pred_K11 = KNN_Classifier(X_train=f_train, y_train=t_train, X_test=f_test, k=11)

In [23]:
# Test accuracy of the raw-feature k = 11 predictions.
KNN_accuracy(pred_X=pred_K11, test_data_y=t_test)

0.7648848326814428

### Normalized $K = 11$

In [24]:
# k = 11 predictions on the z-score normalized features.
norm_K11 = KNN_Classifier(X_train=norm_f_train, y_train=t_train, X_test=norm_f_test, k=11)

In [25]:
# Test accuracy of the normalized-feature k = 11 predictions.
KNN_accuracy(pred_X=norm_K11, test_data_y=t_test)

0.8791829639287266

# $K = 21$

In [26]:
# k = 21 predictions on the raw (un-normalized) features.
pred_K21 = KNN_Classifier(X_train=f_train, y_train=t_train, X_test=f_test, k=21)

In [27]:
# Test accuracy of the raw-feature k = 21 predictions.
KNN_accuracy(pred_X=pred_K21, test_data_y=t_test)

0.7466318991742721

### Normalized $K = 21$

In [28]:
# k = 21 predictions on the z-score normalized features.
norm_K21 = KNN_Classifier(X_train=norm_f_train, y_train=t_train, X_test=norm_f_test, k=21)

In [29]:
# Test accuracy of the normalized-feature k = 21 predictions.
KNN_accuracy(pred_X=norm_K21, test_data_y=t_test)

0.8843980877879183

# $K = 41$

In [30]:
# k = 41 predictions on the raw (un-normalized) features.
pred_K41 = KNN_Classifier(X_train=f_train, y_train=t_train, X_test=f_test, k=41)

In [31]:
# Test accuracy of the raw-feature k = 41 predictions.
KNN_accuracy(pred_X=pred_K41, test_data_y=t_test)

0.7522816166883963

### Normalized $K = 41$

In [32]:
# k = 41 predictions on the z-score normalized features.
norm_K41 = KNN_Classifier(X_train=norm_f_train, y_train=t_train, X_test=norm_f_test, k=41)

In [33]:
# Test accuracy of the normalized-feature k = 41 predictions.
KNN_accuracy(pred_X=norm_K41, test_data_y=t_test)

0.8852672750977836

# $K = 61$

In [34]:
# k = 61 predictions on the raw (un-normalized) features.
pred_K61 = KNN_Classifier(X_train=f_train, y_train=t_train, X_test=f_test, k=61)

In [35]:
# Test accuracy of the raw-feature k = 61 predictions.
KNN_accuracy(pred_X=pred_K61, test_data_y=t_test)

0.7375054324206867

### Normalized $K = 61$

In [36]:
# k = 61 predictions on the z-score normalized features.
norm_K61 = KNN_Classifier(X_train=norm_f_train, y_train=t_train, X_test=norm_f_test, k=61)

In [37]:
# Test accuracy of the normalized-feature k = 61 predictions.
KNN_accuracy(pred_X=norm_K61, test_data_y=t_test)

0.8826597131681877

# $K = 81$

In [38]:
# k = 81 predictions on the raw (un-normalized) features.
pred_K81 = KNN_Classifier(X_train=f_train, y_train=t_train, X_test=f_test, k=81)

In [39]:
# Test accuracy of the raw-feature k = 81 predictions.
KNN_accuracy(pred_X=pred_K81, test_data_y=t_test)

0.7266405910473707

### Normalized $K = 81$

In [40]:
# k = 81 predictions on the z-score normalized features.
norm_K81 = KNN_Classifier(X_train=norm_f_train, y_train=t_train, X_test=norm_f_test, k=81)

In [41]:
# Test accuracy of the normalized-feature k = 81 predictions.
KNN_accuracy(pred_X=norm_K81, test_data_y=t_test)

0.877444589308996

# $K = 101$

In [42]:
# k = 101 predictions on the raw (un-normalized) features.
pred_K101 = KNN_Classifier(X_train=f_train, y_train=t_train, X_test=f_test, k=101)

In [43]:
# Test accuracy of the raw-feature k = 101 predictions.
KNN_accuracy(pred_X=pred_K101, test_data_y=t_test)

0.7288135593220338

### Normalized $K = 101$

In [44]:
# k = 101 predictions on the z-score normalized features.
norm_K101 = KNN_Classifier(X_train=norm_f_train, y_train=t_train, X_test=norm_f_test, k=101)

In [45]:
# Test accuracy of the normalized-feature k = 101 predictions.
KNN_accuracy(pred_X=norm_K101, test_data_y=t_test)

0.8752716210343329

# $K = 201$

In [46]:
# k = 201 predictions on the raw (un-normalized) features.
pred_K201 = KNN_Classifier(X_train=f_train, y_train=t_train, X_test=f_test, k=201)

In [47]:
# Test accuracy of the raw-feature k = 201 predictions.
KNN_accuracy(pred_X=pred_K201, test_data_y=t_test)

0.7314211212516297

### Normalized $K = 201$

In [48]:
# k = 201 predictions on the z-score normalized features.
norm_K201 = KNN_Classifier(X_train=norm_f_train, y_train=t_train, X_test=norm_f_test, k=201)

In [49]:
# Test accuracy of the normalized-feature k = 201 predictions.
KNN_accuracy(pred_X=norm_K201, test_data_y=t_test)

0.8600608431116906

# $K = 401$

In [50]:
# k = 401 predictions on the raw (un-normalized) features.
pred_K401 = KNN_Classifier(X_train=f_train, y_train=t_train, X_test=f_test, k=401)

In [51]:
# Test accuracy of the raw-feature k = 401 predictions.
KNN_accuracy(pred_X=pred_K401, test_data_y=t_test)

0.7196870925684485

### Normalized $K = 401$

In [52]:
# k = 401 predictions on the z-score normalized features.
norm_K401 = KNN_Classifier(X_train=norm_f_train, y_train=t_train, X_test=norm_f_test, k=401)

In [53]:
# Test accuracy of the normalized-feature k = 401 predictions.
KNN_accuracy(pred_X=norm_K401, test_data_y=t_test)

0.8396349413298566

# Question 1.c

In [58]:
# Row labels for the 1.c table: "t0" .. "t50" are generated, then "t0" is
# dropped so that ids runs from "t1" to "t50".
idslist = ["t" + str(num) for num in range(50 + 1)]

ids = idslist[1:]

In [59]:
# Keep only the first 50 normalized-feature predictions for each k, as arrays.
k1, k5, k11, k21, k41, k61, k81, k101, k201, k401 = (
    np.array(preds[:50])
    for preds in (norm_K1, norm_K5, norm_K11, norm_K21, norm_K41,
                  norm_K61, norm_K81, norm_K101, norm_K201, norm_K401)
)

In [60]:
# Translate the first 50 normalized-feature predictions for each k into text
# labels: 1 -> 'spam', anything else -> 'no'.
# Each label array is rebuilt from the numeric predictions (norm_K*) rather
# than from k* itself, so re-running this cell gives the same result; mapping
# k* in place would compare the already-converted strings to 1 on a second
# run and turn every entry into 'no'.
k1 = np.where(np.array(norm_K1[:50]) == 1, 'spam', 'no')
k5 = np.where(np.array(norm_K5[:50]) == 1, 'spam', 'no')
k11 = np.where(np.array(norm_K11[:50]) == 1, 'spam', 'no')
k21 = np.where(np.array(norm_K21[:50]) == 1, 'spam', 'no')
k41 = np.where(np.array(norm_K41[:50]) == 1, 'spam', 'no')
k61 = np.where(np.array(norm_K61[:50]) == 1, 'spam', 'no')
k81 = np.where(np.array(norm_K81[:50]) == 1, 'spam', 'no')
k101 = np.where(np.array(norm_K101[:50]) == 1, 'spam', 'no')
k201 = np.where(np.array(norm_K201[:50]) == 1, 'spam', 'no')
k401 = np.where(np.array(norm_K401[:50]) == 1, 'spam', 'no')

# Output for 1.c

In [61]:
# One output line per test row: its id followed by the label predicted for
# k = 1, 5, 11, 21, 41, 61, 81, 101, 201 and 401, separated by spaces.
label_columns = (ids, k1, k5, k11, k21, k41, k61, k81, k101, k201, k401)
for i in range(50):
    print(*(column[i] for column in label_columns))

t1 spam spam spam spam spam spam spam spam no no
t2 spam spam spam spam spam spam spam spam no no
t3 spam spam spam spam spam spam spam spam spam spam
t4 spam spam spam spam spam spam spam spam spam spam
t5 spam spam spam spam spam spam spam spam spam spam
t6 spam spam no spam no no no no spam spam
t7 spam no no no no no no no no no
t8 spam spam spam spam spam spam spam spam spam spam
t9 spam spam spam spam spam spam spam spam spam spam
t10 spam spam spam spam spam spam spam spam spam spam
t11 spam spam spam spam spam spam spam spam spam spam
t12 spam spam spam spam spam spam spam spam spam spam
t13 spam spam spam spam spam spam spam no no no
t14 spam spam spam spam spam spam spam spam no no
t15 spam spam spam spam spam spam spam spam spam spam
t16 spam spam spam spam spam spam spam spam spam spam
t17 spam spam spam spam spam spam spam spam spam spam
t18 spam spam spam spam spam spam spam spam spam no
t19 spam spam spam spam spam spam spam spam spam spam
t20 no spam spam spam spam spam