In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from tabulate import tabulate
import matplotlib.pyplot as plt
from scipy.io import arff
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, precision_score, f1_score

# Question (1)(a)

In [2]:
# Read the two-class vertebral column ARFF file into a DataFrame
data = arff.loadarff('../data/vertebral_column_data/column_2C_weka.arff')
df = pd.DataFrame(data[0])
# The class column is decoded from bytes into regular strings
df['class'] = df['class'].str.decode('utf-8')

# Keep only Normal/Abnormal rows by discarding any Hernia/Spondylolisthesis labels
drop_mask = df['class'].isin(['Hernia', 'Spondylolisthesis'])
df = df.loc[~drop_mask]

df.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.2544,Abnormal
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259,Abnormal
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,Abnormal
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523,Abnormal
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,Abnormal


In [3]:
# Print data summary: info() prints the column dtypes and non-null counts,
# and describe() is the cell's displayed value (per-column summary statistics)
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 310 entries, 0 to 309
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   pelvic_incidence          310 non-null    float64
 1   pelvic_tilt               310 non-null    float64
 2   lumbar_lordosis_angle     310 non-null    float64
 3   sacral_slope              310 non-null    float64
 4   pelvic_radius             310 non-null    float64
 5   degree_spondylolisthesis  310 non-null    float64
 6   class                     310 non-null    object 
dtypes: float64(6), object(1)
memory usage: 19.4+ KB


Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
count,310.0,310.0,310.0,310.0,310.0,310.0
mean,60.496653,17.542822,51.93093,42.953831,117.920655,26.296694
std,17.23652,10.00833,18.554064,13.423102,13.317377,37.559027
min,26.147921,-6.554948,14.0,13.366931,70.082575,-11.058179
25%,46.430294,10.667069,37.0,33.347122,110.709196,1.603727
50%,58.691038,16.357689,49.562398,42.404912,118.268178,11.767934
75%,72.877696,22.120395,63.0,52.695888,125.467674,41.287352
max,129.834041,49.431864,125.742385,121.429566,163.071041,418.543082


In [4]:
# Encode class labels as integers on the frame itself (Normal -> 0, Abnormal -> 1).
# Later cells select rows with df['class'] == 0 / == 1, so the codes must be stored
# in df explicitly rather than only in a NumPy array derived from it.
CLASS_CODES = {'Normal': 0, 'Abnormal': 1}
# Labels that are already 0/1 pass through unchanged, so re-running this cell is safe.
encoded_class = df['class'].map(lambda label: CLASS_CODES.get(label, label)).astype(int)
# 'class' is a Python keyword, hence the dict unpacking; assign() builds a new frame.
df = df.assign(**{'class': encoded_class})

# Split data into features (first six columns) and integer class labels
X = df.iloc[:, :6].to_numpy()
y = encoded_class.to_numpy(copy=True)

# Question (1)(b)(i)

In [None]:
# Scatterplot matrix of the features, coloured by class
sns.pairplot(data=df, hue='class')

# Question (1)(b)(ii)

In [None]:
# Distribution of pelvic_incidence within each class
sns.boxplot(data=df, x='class', y='pelvic_incidence', hue='class')

In [None]:
# Distribution of pelvic_tilt within each class
sns.boxplot(data=df, x='class', y='pelvic_tilt', hue='class')

In [None]:
# Distribution of lumbar_lordosis_angle within each class
sns.boxplot(data=df, x='class', y='lumbar_lordosis_angle', hue='class')

In [None]:
# Distribution of sacral_slope within each class
sns.boxplot(data=df, x='class', y='sacral_slope', hue='class')

In [None]:
# Distribution of pelvic_radius within each class
sns.boxplot(data=df, x='class', y='pelvic_radius', hue='class')

In [None]:
# Distribution of degree_spondylolisthesis within each class
sns.boxplot(data=df, x='class', y='degree_spondylolisthesis', hue='class')

# Question (1)(b)(iii)

In [None]:
# Partition the rows by encoded class label (0 and 1)
zeros = df.loc[df['class'] == 0]
ones = df.loc[df['class'] == 1]

# Training set: first 70 class-0 rows plus first 140 class-1 rows; the rest is the test set
zeros_train, zeros_test = zeros.iloc[:70], zeros.iloc[70:]
ones_train, ones_test = ones.iloc[:140], ones.iloc[140:]

# Features are the first six columns; labels are the 'class' column cast to int
X_train = pd.concat([zeros_train.iloc[:, :6], ones_train.iloc[:, :6]]).to_numpy()
X_test = pd.concat([zeros_test.iloc[:, :6], ones_test.iloc[:, :6]]).to_numpy()
y_train = pd.concat([zeros_train['class'], ones_train['class']]).to_numpy().astype(int)
y_test = pd.concat([zeros_test['class'], ones_test['class']]).to_numpy().astype(int)

# Question (1)(c)(i)(ii)

In [None]:
# Sweep k over 208, 205, ..., 1 and record train/test accuracy and error rate for each k
from sklearn.metrics import precision_recall_fscore_support
train_accuracies, test_accuracies = [], []
train_error_rate, test_error_rate = [], []

for k in range(208, 0, -3):
    classifier = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # Each error-rate entry is [k, 1 - accuracy] rounded to three decimals
    y_train_pred = classifier.predict(X_train)
    train_acc = accuracy_score(y_train, y_train_pred)
    train_accuracies.append(train_acc)
    train_error_rate.append([k, round(1 - train_acc, 3)])

    y_test_pred = classifier.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)
    test_accuracies.append(test_acc)
    test_error_rate.append([k, round(1 - test_acc, 3)])

In [None]:
# Report the [k, error rate] pair at the position of the highest accuracy for each split
# (np.argmax returns the first maximum, i.e. the earliest k visited in the sweep)
best_train_idx = np.argmax(train_accuracies)
best_test_idx = np.argmax(test_accuracies)
print('[K*, Error Rate]')
print('Train:', train_error_rate[best_train_idx])
print('Test: ', test_error_rate[best_test_idx])

In [None]:
# Train vs. test error rate as a function of K, drawn on the same axes.
train_error_rate_df = pd.DataFrame(train_error_rate, columns=['K', 'Error Rate'])
test_error_rate_df = pd.DataFrame(test_error_rate, columns=['K', 'Error Rate'])

# Label each curve so the legend identifies it, instead of relying on the
# reader knowing the default colour order.
ax = sns.lineplot(data=train_error_rate_df, x='K', y='Error Rate', label='Train')
sns.lineplot(data=test_error_rate_df, x='K', y='Error Rate', label='Test', ax=ax)
ax.set_title('KNN error rate vs. K')
ax.legend()
ax

In [None]:
# Evaluate the k=4 classifier on the test set.
classifier = KNeighborsClassifier(n_neighbors=4)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
# labels=[0, 1] pins the matrix to 2x2 so the unpacking below is always valid;
# scikit-learn lays it out as [[TN, FP], [FN, TP]] (rows = true, columns = predicted).
CM = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = CM.ravel()
# Rates are fractions of the actual positives / negatives, not the raw counts.
TPR = TP / (TP + FN)
TNR = TN / (TN + FP)
print('Confusion Matrix:')
print(CM[0])
print(CM[1])
print('True Positive Rate:', TPR)
print('True Negative Rate:', TNR)
print('Precision Score:', precision_score(y_test, y_pred))
# average=None reports one F1 value per class rather than a single score
print('F1 Score:', f1_score(y_test, y_pred, average=None))

# Question (1)(c)(iii)

In [None]:
# Learning curve: for each training size N, record the lowest test error rate over k
plot = []
for N in range(10, 220, 10):
    accuracies = []
    error_rate = []
    # NOTE(review): round() picks the nearest integer (e.g. N=20 -> 7, not 6);
    # confirm whether floor (N // 3) was intended
    zero_train_size = round(N/3)
    one_train_size = N - zero_train_size
    zeros_train, zeros_test = zeros.iloc[:zero_train_size], zeros.iloc[zero_train_size:]
    ones_train, ones_test = ones.iloc[:one_train_size], ones.iloc[one_train_size:]
    X_train = pd.concat([zeros_train.iloc[:, :6], ones_train.iloc[:, :6]]).to_numpy()
    X_test = pd.concat([zeros_test.iloc[:, :6], ones_test.iloc[:, :6]]).to_numpy()
    y_train = pd.concat([zeros_train['class'], ones_train['class']]).to_numpy().astype(int)
    y_test = pd.concat([zeros_test['class'], ones_test['class']]).to_numpy().astype(int)

    for k in range(1, N, 5):
        classifier = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

        y_test_pred = classifier.predict(X_test)
        test_acc = accuracy_score(y_test, y_test_pred)
        accuracies.append(test_acc)
        error_rate.append([k, round(1 - test_acc, 3)])

    # Keep the error rate of the first k that reaches the highest accuracy
    best_idx = np.argmax(accuracies)
    plot.append([N, error_rate[best_idx][1]])

best_test_error_rate = pd.DataFrame(plot, columns=['N', 'Error Rate'])
sns.lineplot(data=best_test_error_rate, x='N', y='Error Rate')

# Question (1)(d)(i)(A)

In [None]:
# Manhattan distance (Minkowski with p=1): sweep k and report the best test error
N = 200
accuracies = []
error_rate = []
# NOTE(review): round() picks the nearest integer (N=200 -> 67, not 66);
# confirm whether floor (N // 3) was intended
zero_train_size = round(N/3)
one_train_size = N - zero_train_size
zeros_train, zeros_test = zeros.iloc[:zero_train_size], zeros.iloc[zero_train_size:]
ones_train, ones_test = ones.iloc[:one_train_size], ones.iloc[one_train_size:]
X_train = pd.concat([zeros_train.iloc[:, :6], ones_train.iloc[:, :6]]).to_numpy()
X_test = pd.concat([zeros_test.iloc[:, :6], ones_test.iloc[:, :6]]).to_numpy()
y_train = pd.concat([zeros_train['class'], ones_train['class']]).to_numpy().astype(int)
y_test = pd.concat([zeros_test['class'], ones_test['class']]).to_numpy().astype(int)

for k in range(1, N, 5):
    classifier = KNeighborsClassifier(n_neighbors=k, p=1).fit(X_train, y_train)

    y_test_pred = classifier.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)
    accuracies.append(test_acc)
    error_rate.append([k, round(1 - test_acc, 3)])

best_idx = np.argmax(accuracies)
print('[K*, Test Error Rate]')
print(error_rate[best_idx])
# use k=6 since it has same error rate as k=1

# Question (1)(d)(i)(B)

In [None]:
# Minkowski distance with k fixed at 6: sweep log10(p) over 0.1, 0.2, ..., 1.0
k = 6
N = 200
accuracies = []
error_rate = []
# NOTE(review): round() picks the nearest integer (N=200 -> 67, not 66);
# confirm whether floor (N // 3) was intended
zero_train_size = round(N/3)
one_train_size = N - zero_train_size
zeros_train, zeros_test = zeros.iloc[:zero_train_size], zeros.iloc[zero_train_size:]
ones_train, ones_test = ones.iloc[:one_train_size], ones.iloc[one_train_size:]
X_train = pd.concat([zeros_train.iloc[:, :6], ones_train.iloc[:, :6]]).to_numpy()
X_test = pd.concat([zeros_test.iloc[:, :6], ones_test.iloc[:, :6]]).to_numpy()
y_train = pd.concat([zeros_train['class'], ones_train['class']]).to_numpy().astype(int)
y_test = pd.concat([zeros_test['class'], ones_test['class']]).to_numpy().astype(int)

for tenths in range(1, 11):
    # p holds log10 of the Minkowski exponent; the classifier receives 10**p
    p = tenths/10
    classifier = KNeighborsClassifier(n_neighbors=k, p=10**p).fit(X_train, y_train)

    y_test_pred = classifier.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)
    accuracies.append(test_acc)
    error_rate.append([p, round(1 - test_acc, 3)])

best_idx = np.argmax(accuracies)
print('[log10(p)*, Test Error Rate]')
print(error_rate[best_idx])

# Question (1)(d)(i)(C)

In [None]:
# Chebyshev distance: sweep k and report the best test error
N = 200
accuracies = []
error_rate = []
# NOTE(review): round() picks the nearest integer (N=200 -> 67, not 66);
# confirm whether floor (N // 3) was intended
zero_train_size = round(N/3)
one_train_size = N - zero_train_size
zeros_train, zeros_test = zeros.iloc[:zero_train_size], zeros.iloc[zero_train_size:]
ones_train, ones_test = ones.iloc[:one_train_size], ones.iloc[one_train_size:]
X_train = pd.concat([zeros_train.iloc[:, :6], ones_train.iloc[:, :6]]).to_numpy()
X_test = pd.concat([zeros_test.iloc[:, :6], ones_test.iloc[:, :6]]).to_numpy()
y_train = pd.concat([zeros_train['class'], ones_train['class']]).to_numpy().astype(int)
y_test = pd.concat([zeros_test['class'], ones_test['class']]).to_numpy().astype(int)

for k in range(1, N, 5):
    classifier = KNeighborsClassifier(n_neighbors=k, metric='chebyshev').fit(X_train, y_train)

    y_test_pred = classifier.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)
    accuracies.append(test_acc)
    error_rate.append([k, round(1 - test_acc, 3)])

best_idx = np.argmax(accuracies)
print('[K*, Test Error Rate]')
print(error_rate[best_idx])

# Question (1)(d)(ii)

In [None]:
# Mahalanobis distance: sweep k and report the best test error
N = 200
accuracies = []
error_rate = []
# NOTE(review): round() picks the nearest integer (N=200 -> 67, not 66);
# confirm whether floor (N // 3) was intended
zero_train_size = round(N/3)
one_train_size = N - zero_train_size
zeros_train, zeros_test = zeros.iloc[:zero_train_size], zeros.iloc[zero_train_size:]
ones_train, ones_test = ones.iloc[:one_train_size], ones.iloc[one_train_size:]
X_train = pd.concat([zeros_train.iloc[:, :6], ones_train.iloc[:, :6]]).to_numpy()
X_test = pd.concat([zeros_test.iloc[:, :6], ones_test.iloc[:, :6]]).to_numpy()
y_train = pd.concat([zeros_train['class'], ones_train['class']]).to_numpy().astype(int)
y_test = pd.concat([zeros_test['class'], ones_test['class']]).to_numpy().astype(int)

# Pseudo-inverse of the training-feature covariance; it does not depend on k
inv_cov = np.linalg.pinv(np.cov(X_train, rowvar=False))

for k in range(1, N, 5):
    classifier = KNeighborsClassifier(n_neighbors=k, metric='mahalanobis', metric_params={'VI': inv_cov})
    classifier.fit(X_train, y_train)

    y_test_pred = classifier.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)
    accuracies.append(test_acc)
    error_rate.append([k, round(1 - test_acc, 3)])

best_idx = np.argmax(accuracies)
print('[K*, Test Error Rate]')
print(error_rate[best_idx])

In [None]:
# Summary table of recorded results (values are entered by hand, not recomputed here)
# Test error rates for log10(p) = 0.1, 0.2, ..., 1.0 with k fixed at 6
minkowski_errors = [0.091, 0.091, 0.082, 0.082, 0.073, 0.055, 0.064, 0.073, 0.082, 0.082]

table = [['Metric', 'K*', 'Test Error Rate'],
         ['Minkowski (p=1)', 6, 0.1]]
table += [[f'Minkowski (log10(p)={tenths / 10})', 6, err]
          for tenths, err in enumerate(minkowski_errors, start=1)]
table += [['Chebyshev (p=inf)', 16, 0.082],
          ['Mahalanobis', 6, 0.136]]

print(tabulate(table, headers='firstrow'))

# Question (1)(e)

In [None]:
# Distance-weighted voting with the classifier's default metric: sweep k
N = 200
accuracies = []
error_rate = []
# NOTE(review): round() picks the nearest integer (N=200 -> 67, not 66);
# confirm whether floor (N // 3) was intended
zero_train_size = round(N/3)
one_train_size = N - zero_train_size
zeros_train, zeros_test = zeros.iloc[:zero_train_size], zeros.iloc[zero_train_size:]
ones_train, ones_test = ones.iloc[:one_train_size], ones.iloc[one_train_size:]
X_train = pd.concat([zeros_train.iloc[:, :6], ones_train.iloc[:, :6]]).to_numpy()
X_test = pd.concat([zeros_test.iloc[:, :6], ones_test.iloc[:, :6]]).to_numpy()
y_train = pd.concat([zeros_train['class'], ones_train['class']]).to_numpy().astype(int)
y_test = pd.concat([zeros_test['class'], ones_test['class']]).to_numpy().astype(int)

for k in range(1, N, 5):
    classifier = KNeighborsClassifier(n_neighbors=k, weights='distance').fit(X_train, y_train)

    y_test_pred = classifier.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)
    accuracies.append(test_acc)
    error_rate.append([k, round(1 - test_acc, 3)])

best_idx = np.argmax(accuracies)
print('Euclidean')
print('[K*, Test Error Rate]')
print(error_rate[best_idx])

In [None]:
# Distance-weighted voting with Manhattan distance (p=1): sweep k
N = 200
accuracies = []
error_rate = []
# NOTE(review): round() picks the nearest integer (N=200 -> 67, not 66);
# confirm whether floor (N // 3) was intended
zero_train_size = round(N/3)
one_train_size = N - zero_train_size
zeros_train, zeros_test = zeros.iloc[:zero_train_size], zeros.iloc[zero_train_size:]
ones_train, ones_test = ones.iloc[:one_train_size], ones.iloc[one_train_size:]
X_train = pd.concat([zeros_train.iloc[:, :6], ones_train.iloc[:, :6]]).to_numpy()
X_test = pd.concat([zeros_test.iloc[:, :6], ones_test.iloc[:, :6]]).to_numpy()
y_train = pd.concat([zeros_train['class'], ones_train['class']]).to_numpy().astype(int)
y_test = pd.concat([zeros_test['class'], ones_test['class']]).to_numpy().astype(int)

for k in range(1, N, 5):
    classifier = KNeighborsClassifier(n_neighbors=k, p=1, weights='distance').fit(X_train, y_train)

    y_test_pred = classifier.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)
    accuracies.append(test_acc)
    error_rate.append([k, round(1 - test_acc, 3)])

best_idx = np.argmax(accuracies)
print('Manhattan')
print('[K*, Test Error Rate]')
print(error_rate[best_idx])

In [None]:
# Distance-weighted voting with Chebyshev distance: sweep k
N = 200
accuracies = []
error_rate = []
# NOTE(review): round() picks the nearest integer (N=200 -> 67, not 66);
# confirm whether floor (N // 3) was intended
zero_train_size = round(N/3)
one_train_size = N - zero_train_size
zeros_train, zeros_test = zeros.iloc[:zero_train_size], zeros.iloc[zero_train_size:]
ones_train, ones_test = ones.iloc[:one_train_size], ones.iloc[one_train_size:]
X_train = pd.concat([zeros_train.iloc[:, :6], ones_train.iloc[:, :6]]).to_numpy()
X_test = pd.concat([zeros_test.iloc[:, :6], ones_test.iloc[:, :6]]).to_numpy()
y_train = pd.concat([zeros_train['class'], ones_train['class']]).to_numpy().astype(int)
y_test = pd.concat([zeros_test['class'], ones_test['class']]).to_numpy().astype(int)

for k in range(1, N, 5):
    classifier = KNeighborsClassifier(n_neighbors=k, metric='chebyshev', weights='distance')
    classifier.fit(X_train, y_train)

    y_test_pred = classifier.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)
    accuracies.append(test_acc)
    error_rate.append([k, round(1 - test_acc, 3)])

best_idx = np.argmax(accuracies)
print('Chebyshev')
print('[K*, Test Error Rate]')
print(error_rate[best_idx])

# Question (1)(f)

The lowest training error rate I achieved in this homework was 0 in part (1)(c)(ii) with k=1.