**Name:** Luong Nguyen  
**Student ID:** 1504210
<h3>Cognitive Systems Mathematics and Methods  
Assignment 5: Vertebral abnormality</h3>
<hr>



In [0]:
# import required libraries
import pandas as pd
import pylab as pl
import numpy as np
import scipy.optimize as opt
from sklearn import preprocessing
%matplotlib inline 
import matplotlib.pyplot as plt

## Data preparation

### Import data

In [2]:
# download data
!wget -O data.zip http://archive.ics.uci.edu/ml/machine-learning-databases/00212/vertebral_column_data.zip

--2019-02-24 22:22:26--  http://archive.ics.uci.edu/ml/machine-learning-databases/00212/vertebral_column_data.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.249
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33251 (32K) [application/zip]
Saving to: ‘data.zip’


2019-02-24 22:22:27 (512 KB/s) - ‘data.zip’ saved [33251/33251]



In [3]:
# extract zip file
!unzip data.zip

Archive:  data.zip
  inflating: column_2C.dat           
  inflating: column_2C_weka.arff     
  inflating: column_3C.dat           
  inflating: column_3C_weka.arff     


In [4]:
# load data from arff file
from scipy.io import arff

df = arff.loadarff('column_2C_weka.arff')
df = pd.DataFrame(df[0])

df.head().transpose()

Unnamed: 0,0,1,2,3,4
pelvic_incidence,63.0278,39.057,68.832,69.297,49.7129
pelvic_tilt,22.5526,10.061,22.2185,24.6529,9.65207
lumbar_lordosis_angle,39.6091,25.0154,50.0922,44.3112,28.3174
sacral_slope,40.4752,28.996,46.6135,44.6441,40.0608
pelvic_radius,98.6729,114.405,105.985,101.868,108.169
degree_spondylolisthesis,-0.2544,4.56426,-3.53032,11.2115,7.9185
class,b'Abnormal',b'Abnormal',b'Abnormal',b'Abnormal',b'Abnormal'


### Preprocess data

In [5]:
# re-encode class column
df['class'].replace([b'Normal', b'Abnormal'], [0,1], inplace=True)
df['class'] = df['class'].astype('int')

df.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.2544,1
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259,1
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,1
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523,1
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,1


In [6]:
# define X, y
X, y = df.drop(columns=['class']), df['class']
print('X: feature matrix')
X.head()


X: feature matrix


Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
0,63.027817,22.552586,39.609117,40.475232,98.672917,-0.2544
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501


In [7]:
print('\ny: target vector')
y.head()


y: target vector


0    1
1    1
2    1
3    1
4    1
Name: class, dtype: int64

In [8]:
# normalize data
X = preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]

array([[ 0.14708636,  0.50136873, -0.6651769 , -0.18495031, -1.4476468 ,
        -0.70805942],
       [-1.24586434, -0.74876898, -1.45300075, -1.0415207 , -0.26438488,
        -0.57955637],
       [ 0.4843695 ,  0.46793218, -0.09926175,  0.2730833 , -0.89768556,
        -0.79542095],
       [ 0.51138997,  0.71156241, -0.41133905,  0.12612823, -1.2073033 ,
        -0.40228841],
       [-0.62664788, -0.78969266, -1.27474511, -0.21587588, -0.73345503,
        -0.4901061 ]])

In [9]:
# train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print('Train set:', X_train.shape,  y_train.shape)
print('Test set:', X_test.shape,  y_test.shape)

Train set: (248, 6) (248,)
Test set: (62, 6) (62,)


## Modelling

In [10]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(solver='liblinear').fit(X_train,y_train)
LR

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
yhat = LR.predict(X_test)
yhat[:10]

array([1, 0, 1, 0, 1, 0, 1, 1, 1, 1])

## Evaluation

### jaccard index

In [12]:
# size of the intersection divided by the size of the union of two label sets
from sklearn.metrics import jaccard_similarity_score
jaccard_similarity_score(y_test, yhat)

0.8548387096774194

### confusion matrix

In [13]:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, yhat, labels=[1,0]))
print(classification_report(y_test, yhat))

[[37  4]
 [ 5 16]]
              precision    recall  f1-score   support

           0       0.80      0.76      0.78        21
           1       0.88      0.90      0.89        41

   micro avg       0.85      0.85      0.85        62
   macro avg       0.84      0.83      0.84        62
weighted avg       0.85      0.85      0.85        62



`the average accuracy for this classifier (the average of the f1-score for both labels) is 0.85`