In [49]:
from scipy.io import arff
import numpy as np
import pandas as pd
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [42]:
#Import data
data = pd.read_csv('column_2C_weka_np.txt', 
                   delimiter=",", 
                   names=('pelvic_incidence','pelvic_tilt','lumbar_lordosis_angle','sacral_slope','pelvic_radius','degree_spondylolisthesis','class'))

# re-encode class column
data['class'].replace(['Abnormal','Normal'], [0,1], inplace=True)

data.head(10)

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027818,22.552586,39.609117,40.475232,98.672917,-0.2544,0
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259,0
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,0
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523,0
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,0
5,40.2502,13.921907,25.12495,26.328293,130.327871,2.230652,0
6,53.432928,15.864336,37.165934,37.568592,120.567523,5.988551,0
7,45.366754,10.755611,29.038349,34.611142,117.270068,-10.675871,0
8,43.79019,13.533753,42.690814,30.256437,125.002893,13.289018,0
9,36.686353,5.010884,41.948751,31.675469,84.241415,0.664437,0


In [43]:
data.describe()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
count,310.0,310.0,310.0,310.0,310.0,310.0,310.0
mean,60.496653,17.542822,51.93093,42.953831,117.920655,26.296694,0.322581
std,17.23652,10.00833,18.554064,13.423102,13.317377,37.559027,0.46822
min,26.147921,-6.554948,14.0,13.366931,70.082575,-11.058179,0.0
25%,46.430294,10.667069,37.0,33.347122,110.709196,1.603727,0.0
50%,58.691038,16.357689,49.562398,42.404912,118.268178,11.767934,0.0
75%,72.877696,22.120395,63.0,52.695888,125.467674,41.287352,1.0
max,129.834041,49.431864,125.742385,121.429566,163.071041,418.543082,1.0


In [46]:
X = data.iloc[:,:6]
Y = data.iloc[:,6]
Y.head()

0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64

In [47]:
# build and fit model
reg = LogisticRegression()
reg.fit(X,Y)

print("Coefficients: ",reg.coef_)
print("Intercept: ", reg.intercept_)

# compute predicted values from training set
Y_pred = reg.predict(X)

cm = confusion_matrix(Y, Y_pred)
print("Confusion matrix:\n",cm)

accuracy = (cm[0][0]+cm[1][1])/(cm[0][0]+cm[1][1]+cm[0][1]+cm[1][0])
print("Accuracy calculated from the training set = %.3f" % (accuracy))

print(classification_report(Y, Y_pred, target_names=['no', 'yes']))

Coefficients:  [[-0.02704089 -0.085594    0.02888813  0.0585531   0.01780271 -0.15320162]]
Intercept:  [-1.96490609]
Confusion matrix:
 [[181  29]
 [ 23  77]]
Accuracy calculated from the training set = 0.832
              precision    recall  f1-score   support

          no       0.89      0.86      0.87       210
         yes       0.73      0.77      0.75       100

   micro avg       0.83      0.83      0.83       310
   macro avg       0.81      0.82      0.81       310
weighted avg       0.84      0.83      0.83       310





In [50]:
# cross-validate
# number of folds
k = 10
scores = cross_val_score(estimator=reg,
                        X=X,
                        y=Y,
                        scoring="accuracy",
                        cv=k)
print("Accuracies from %d individual folds:" % k)
print(scores)
print("Accuracy calculated using %d-fold cross validation = %.3f" % (k, scores.mean()))

Accuracies from 10 individual folds:
[0.48387097 0.58064516 0.67741935 0.77419355 0.93548387 0.90322581
 0.93548387 1.         0.77419355 0.90322581]
Accuracy calculated using 10-fold cross validation = 0.797




In [66]:
# retrieve estimated probabilities (from training set)
reg.predict_proba(X)

#X arvoilla ensimäisellä henkilöllä on 57% mahdollisuus olla "abnormal",eli hänellä on jokin tauti jne

array([[5.71946568e-01, 4.28053432e-01],
       [5.31029103e-01, 4.68970897e-01],
       [2.93975107e-01, 7.06024893e-01],
       [8.76410078e-01, 1.23589922e-01],
       [5.64404426e-01, 4.35595574e-01],
       [4.99768404e-01, 5.00231596e-01],
       [5.65935675e-01, 4.34064325e-01],
       [7.75250434e-02, 9.22474957e-01],
       [7.52703478e-01, 2.47296522e-01],
       [2.53769007e-01, 7.46230993e-01],
       [1.46787545e-01, 8.53212455e-01],
       [7.36115238e-01, 2.63884762e-01],
       [7.76251304e-01, 2.23748696e-01],
       [7.97972183e-01, 2.02027817e-01],
       [7.48814048e-01, 2.51185952e-01],
       [4.88233226e-01, 5.11766774e-01],
       [3.13849915e-01, 6.86150085e-01],
       [2.22741836e-01, 7.77258164e-01],
       [4.53714030e-01, 5.46285970e-01],
       [3.27375643e-01, 6.72624357e-01],
       [5.31516493e-01, 4.68483507e-01],
       [5.48550390e-01, 4.51449610e-01],
       [9.21244059e-01, 7.87559414e-02],
       [2.09835637e-01, 7.90164363e-01],
       [6.716755

In [67]:
reg.predict_proba(X)[309]

array([0.15899827, 0.84100173])