Use the Parkinson's dataset to detect Parkinson's disease.

In [None]:
import datetime, os
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE

# Location of the comma-separated Parkinson's data file in the UCI ML repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data"

# Print wide frames on one long line per row instead of wrapping the columns.
pd.set_option("display.expand_frame_repr", False)

# Download and parse the file, then preview the first rows.
data = pd.read_csv(url, sep=",", encoding="utf-8")
print(data.head())

             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  MDVP:Shimmer(dB)  Shimmer:APQ3  Shimmer:APQ5  MDVP:APQ  Shimmer:DDA      NHR     HNR  status      RPDE       DFA   spread1   spread2        D2       PPE
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784           0.00007   0.00370   0.00554     0.01109       0.04374             0.426       0.02182       0.03130   0.02971      0.06545  0.02211  21.033       1  0.414783  0.815285 -4.813031  0.266482  2.301442  0.284654
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968           0.00008   0.00465   0.00696     0.01394       0.06134             0.626       0.03134       0.04518   0.04368      0.09403  0.01929  19.085       1  0.458359  0.819521 -4.075192  0.335590  2.486855  0.368674
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050           0.00009   0.00544   0.00781     0.01

In [None]:
# Sanity check: list the columns that contain nulls and how many each one has.
# Both printed results are empty when the frame has no missing values.
null_mask = data.isna()

cols_with_missing = data.columns[null_mask.any()].tolist()
print(cols_with_missing)

missing_val_count_by_column = null_mask.sum()
has_missing = missing_val_count_by_column > 0
print(missing_val_count_by_column[has_missing])

[]
Series([], dtype: int64)


In [None]:
# Split off the target ("status") and remove it, together with the string-valued
# "name" column, from the feature frame.
label = data["status"]
data.drop(columns=["status", "name"], inplace=True)

# Report how many rows belong to each class and the class-0 : class-1 ratio.
target_count = label.value_counts()
n_class0, n_class1 = target_count[0], target_count[1]
print('Class 0:', n_class0)
print('Class 1:', n_class1)
print('Proportion:', round(n_class0 / n_class1, 2), ': 1')

# Shuffle features and labels together (fixed seed) so rows stay aligned.
data, label = shuffle(data, label, random_state=1)

Class 0: 48
Class 1: 147
Proportion: 0.33 : 1


In [None]:
# Balance the classes by oversampling with SMOTE.
# The sampler is seeded like every other stochastic step in this notebook
# (both shuffles and the classifier); without a seed the synthetic rows, and
# therefore the correlation table and the CV accuracy below, change per run.
#
# NOTE(review): the resampling is applied to the whole dataset before
# cross_val_score is called further down, so synthetic rows generated from a
# sample can end up in a different fold than that sample. The reported
# accuracy is therefore likely optimistic. Consider putting SMOTE and the
# classifier into an imblearn.pipeline.Pipeline so resampling only happens
# inside each training fold.
smote = SMOTE(random_state=0)

data, label = smote.fit_resample(data, label)

# Report the class balance after resampling.
target_count = label.value_counts()
print('Class 0:', target_count[0])
print('Class 1:', target_count[1])
print('Proportion:', round(target_count[0] / target_count[1], 2), ': 1')

# Re-shuffle (fixed seed) so the newly added rows are mixed in with the rest.
data, label = shuffle(data, label, random_state=2)

Class 0: 147
Class 1: 147
Proportion: 1.0 : 1


In [None]:
# Pairwise correlations between all feature columns.
correlations = data.corr()

# For every feature, the sum of its absolute correlations with all features
# (this includes the feature's own self-correlation of 1.0).
correlations_abs_sum = correlations.abs().sum()

# Previously this header was printed with nothing underneath it; show the
# per-feature totals it announces.
print('Absolute overall correlations')
print('-' * 30)
print(correlations_abs_sum)

print('Weakest correlations')
print('-' * 30)
print(correlations_abs_sum.nsmallest(5))

# Remove the feature with the lowest total in the recorded run. The column is
# hard-coded, so re-check the table above if the data or resampling changes.
# Reassigning instead of inplace=True leaves the previous frame object intact.
data = data.drop(columns=["MDVP:Fhi(Hz)"])

Absolute overall correlations
------------------------------
Weakest correlations
------------------------------
MDVP:Fhi(Hz)    3.544941
DFA             5.546412
MDVP:Flo(Hz)    6.745387
MDVP:Fo(Hz)     7.113839
RPDE            9.283083
dtype: float64


In [None]:
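# Feature scaling is currently disabled; uncomment both lines to rescale every feature to the range [-1, 1].
#scaler=MinMaxScaler((-1,1))
#data=scaler.fit_transform(data)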

In [None]:
# Gradient-boosted classifier with a fixed seed and 2000 boosting rounds.
model = XGBClassifier(n_estimators=2000, random_state=3)

# 10-fold cross-validated accuracy on the prepared features and labels.
# NOTE(review): `data` was oversampled before this point, so the folds are not
# independent of the resampling step — treat the score as an upper bound.
scores = cross_val_score(model, data, label, scoring="accuracy", cv=10)

mean_accuracy = scores.mean()
print("Accuracy: %0.3f" % (mean_accuracy))

Accuracy: 0.966
