#### Download the dataset from https://archive.ics.uci.edu/ml/datasets/Parkinsons+Telemonitoring

This dataset is composed of a range of biomedical voice measurements from 42 people with early-stage Parkinson's disease recruited to a six-month trial of a telemonitoring device for remote symptom progression monitoring. The recordings were automatically captured in the patient's homes.

Columns in the table contain subject number, subject age, subject gender, time interval from baseline recruitment date, motor UPDRS, total UPDRS, and 16 biomedical voice measures. Each row corresponds to one of 5,875 voice recordings from these individuals. The main aim of the data is to predict the motor and total UPDRS scores ('motor_UPDRS' and 'total_UPDRS') from the 16 voice measures.

The data is in ASCII CSV format. Each row of the CSV file contains an instance corresponding to one voice recording. There are around 200 recordings per patient; the subject number of the patient is identified in the first column. For further information or to pass on comments, please contact Athanasios Tsanas (tsanasthanasis '@' gmail.com) or Max Little (littlem '@' physics.ox.ac.uk).

## Load data into a Pandas dataframe

In [1]:
import pandas as pd

In [2]:
# Read the Parkinson's Telemonitoring CSV into a DataFrame; the path is relative to the working directory.
df = pd.read_csv('../data/parkinsons_updrs.data')

In [3]:
# Display the loaded frame (the rendered output truncates the middle rows and columns).
df

Unnamed: 0,subject#,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,...,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,1,72,0,5.6431,28.199,34.398,0.00662,0.000034,0.00401,0.00317,...,0.230,0.01438,0.01309,0.01662,0.04314,0.014290,21.640,0.41888,0.54842,0.16006
1,1,72,0,12.6660,28.447,34.894,0.00300,0.000017,0.00132,0.00150,...,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.10810
2,1,72,0,19.6810,28.695,35.389,0.00481,0.000025,0.00205,0.00208,...,0.181,0.00734,0.00844,0.01458,0.02202,0.020220,23.047,0.46222,0.54405,0.21014
3,1,72,0,25.6470,28.905,35.810,0.00528,0.000027,0.00191,0.00264,...,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.48730,0.57794,0.33277
4,1,72,0,33.6420,29.187,36.375,0.00335,0.000020,0.00093,0.00130,...,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,42,61,0,142.7900,22.485,33.485,0.00406,0.000031,0.00167,0.00168,...,0.160,0.00973,0.01133,0.01549,0.02920,0.025137,22.369,0.64215,0.55314,0.21367
5871,42,61,0,149.8400,21.988,32.988,0.00297,0.000025,0.00119,0.00147,...,0.215,0.01052,0.01277,0.01904,0.03157,0.011927,22.886,0.52598,0.56518,0.12621
5872,42,61,0,156.8200,21.495,32.495,0.00349,0.000025,0.00152,0.00187,...,0.244,0.01371,0.01456,0.01877,0.04112,0.017701,25.065,0.47792,0.57888,0.14157
5873,42,61,0,163.7300,21.007,32.007,0.00281,0.000020,0.00128,0.00151,...,0.131,0.00693,0.00870,0.01307,0.02078,0.007984,24.422,0.56865,0.56327,0.14204


### Get only dependent variable data and set to Y vector

In [5]:
# Preview the motor_UPDRS column, which is used as the regression target below.
df['motor_UPDRS']

0       28.199
1       28.447
2       28.695
3       28.905
4       29.187
         ...  
5870    22.485
5871    21.988
5872    21.495
5873    21.007
5874    20.513
Name: motor_UPDRS, Length: 5875, dtype: float64

In [7]:
# Target vector: the motor_UPDRS scores as a 1-D NumPy array, one value per recording.
Y = df['motor_UPDRS'].to_numpy()

### Get independent variables and set to X matrix

In [4]:
# Preview the feature columns: everything except both UPDRS scores and the subject identifier.
df.loc[:, ~df.columns.isin(['motor_UPDRS', 'total_UPDRS', 'subject#'])]

Unnamed: 0,age,sex,test_time,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,72,0,5.6431,0.00662,0.000034,0.00401,0.00317,0.01204,0.02565,0.230,0.01438,0.01309,0.01662,0.04314,0.014290,21.640,0.41888,0.54842,0.16006
1,72,0,12.6660,0.00300,0.000017,0.00132,0.00150,0.00395,0.02024,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.10810
2,72,0,19.6810,0.00481,0.000025,0.00205,0.00208,0.00616,0.01675,0.181,0.00734,0.00844,0.01458,0.02202,0.020220,23.047,0.46222,0.54405,0.21014
3,72,0,25.6470,0.00528,0.000027,0.00191,0.00264,0.00573,0.02309,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.48730,0.57794,0.33277
4,72,0,33.6420,0.00335,0.000020,0.00093,0.00130,0.00278,0.01703,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,61,0,142.7900,0.00406,0.000031,0.00167,0.00168,0.00500,0.01896,0.160,0.00973,0.01133,0.01549,0.02920,0.025137,22.369,0.64215,0.55314,0.21367
5871,61,0,149.8400,0.00297,0.000025,0.00119,0.00147,0.00358,0.02315,0.215,0.01052,0.01277,0.01904,0.03157,0.011927,22.886,0.52598,0.56518,0.12621
5872,61,0,156.8200,0.00349,0.000025,0.00152,0.00187,0.00456,0.02499,0.244,0.01371,0.01456,0.01877,0.04112,0.017701,25.065,0.47792,0.57888,0.14157
5873,61,0,163.7300,0.00281,0.000020,0.00128,0.00151,0.00383,0.01484,0.131,0.00693,0.00870,0.01307,0.02078,0.007984,24.422,0.56865,0.56327,0.14204


In [5]:
# Feature matrix: every column except both UPDRS scores and the subject identifier, as a 2-D NumPy array.
X = df.loc[:, ~df.columns.isin(['motor_UPDRS', 'total_UPDRS', 'subject#'])].to_numpy()

#### Some debugging to check variables

In [8]:
# Inspect the target vector.
Y

array([28.199, 28.447, 28.695, ..., 21.495, 21.007, 20.513])

In [9]:
# Number of target values; should equal the number of rows in X.
Y.shape

(5875,)

In [10]:
# Inspect the feature matrix.
X

array([[7.2000e+01, 0.0000e+00, 5.6431e+00, ..., 4.1888e-01, 5.4842e-01,
        1.6006e-01],
       [7.2000e+01, 0.0000e+00, 1.2666e+01, ..., 4.3493e-01, 5.6477e-01,
        1.0810e-01],
       [7.2000e+01, 0.0000e+00, 1.9681e+01, ..., 4.6222e-01, 5.4405e-01,
        2.1014e-01],
       ...,
       [6.1000e+01, 0.0000e+00, 1.5682e+02, ..., 4.7792e-01, 5.7888e-01,
        1.4157e-01],
       [6.1000e+01, 0.0000e+00, 1.6373e+02, ..., 5.6865e-01, 5.6327e-01,
        1.4204e-01],
       [6.1000e+01, 0.0000e+00, 1.7073e+02, ..., 5.8608e-01, 5.7077e-01,
        1.5336e-01]])

In [11]:
# (number of recordings, number of feature columns)
X.shape

(5875, 19)

### Now the party starts! Let's train/fit a linear regression model

In [12]:
from sklearn import linear_model

In [13]:
# Linear regression estimator created with scikit-learn's default settings.
reg = linear_model.LinearRegression()

In [14]:
# Fit on every recording; no data is held out at this stage.
reg.fit(X, Y)

In [15]:
# Fitted coefficients, one per column of X.
reg.coef_

array([ 1.91823019e-01, -1.15665782e+00,  1.14285480e-02,  2.87212031e+02,
       -6.49458124e+04, -3.76183093e+04, -3.23242577e+02,  1.26597349e+04,
        1.45756423e+02, -6.71790672e+00, -8.77343441e+02, -1.23752943e+02,
        7.30292917e+01,  2.37579450e+02, -1.03473252e+01, -4.25630318e-01,
        6.31157107e-01, -2.35430543e+01,  1.75010694e+01])

### ... let's evaluate the prediction

In [16]:
# Predict for the first recording only; the row is reshaped to 2-D because predict() takes a matrix of samples.
reg.predict(X[0].reshape(1, -1))

array([24.78231469])

In [17]:
# Actual motor_UPDRS of the first recording, for comparison with the prediction.
Y[0]

28.199

In [18]:
# Signed error for the first recording: predicted value minus actual value.
error = reg.predict(X[0].reshape(1, -1)) - Y[0]
print(error)

[-3.41668531]


In [19]:
# Predict for every row of X, i.e. the same rows passed to reg.fit(X, Y).
predictions = reg.predict( X )

In [20]:
# Inspect the predicted values.
predictions

array([24.78231469, 21.23014935, 25.62778149, ..., 20.88323982,
       22.46509197, 22.95789264])

### Let's compute some metrics

In [21]:
from sklearn.metrics import mean_squared_error

In [22]:
# Mean squared error between the true targets and the predictions computed on all of X.
mean_squared_error( Y, predictions)

55.712137806071

In [23]:
from sklearn.metrics import r2_score

In [24]:
# R^2 between the true targets and the predictions computed on all of X.
r2_score( Y, predictions)

0.15682169262949452

### To detect overfitting, let's split the dataset into train and test sets

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Hold out 33% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): the split is made per recording, so recordings from one subject can end up in both
# the training and the test set — confirm whether a subject-level split is wanted here.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [27]:
# Shape of the full feature matrix, for comparison with the two splits.
X.shape

(5875, 19)

In [28]:
# Shape of the training split.
X_train.shape

(3936, 19)

In [29]:
# Shape of the test split.
X_test.shape

(1939, 19)

In [30]:
# Refit the same estimator object, this time on the training split only.
reg.fit(X_train, y_train)

In [31]:
# Predict on the held-out test split; this rebinds `predictions`.
predictions = reg.predict( X_test )

In [32]:
# R^2 on the held-out test split.
r2_score( y_test, predictions)

0.15065448482784616

### But we should consider per-subject data

In [66]:
# Collapse the recordings to one row per subject by averaging every column within each subject.
df_subjects = df.groupby('subject#').mean()

In [67]:
# Per-subject target: the mean motor_UPDRS of each subject.
# Rebinds Y, so the per-recording target is no longer available under this name.
Y = df_subjects['motor_UPDRS'].to_numpy()

In [70]:
# Per-subject feature matrix: every averaged column except the two UPDRS scores.
# 'subject#' is the index of df_subjects after the groupby, so it is not among the columns.
X = df_subjects.loc[:, ~df_subjects.columns.isin(['motor_UPDRS', 'total_UPDRS'])].to_numpy()

In [75]:
# An integer test_size is an absolute count: 5 subjects are held out and the rest are used for training.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=5, random_state=42)

In [76]:
# Refit the same estimator on the per-subject training rows.
reg.fit(X_train, y_train)

In [77]:
# Predict the mean motor_UPDRS of the held-out subjects.
predictions = reg.predict( X_test )

In [79]:
# Predicted values for the held-out subjects.
predictions

array([16.92697364, 10.27480666, 12.79005998, 17.30303102, 19.03645592])

In [80]:
# Actual mean motor_UPDRS of the held-out subjects, for comparison with the predictions.
y_test

array([25.04024615, 13.01445   , 18.31236184, 10.79183566, 31.63260256])

In [78]:
# R^2 on the held-out subjects.
r2_score( y_test, predictions)

-0.027013827762129017

In [81]:
# Mean squared error on the held-out subjects.
mean_squared_error( y_test, predictions )

60.97704590707152