Importing the necessary libraries.

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import csv


Loading the dataset.

In [2]:
# Relative path of the CSV file that holds the dataset.
DATASET_PATH = 'dataset/weighted-narrowed_data.csv'

# Parse the CSV into a DataFrame; the bare name on the last line makes the
# notebook render the frame as this cell's output.
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)
df

Unnamed: 0,Disease,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abnormally hard consistency.1,...,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum,yellow sputum.1
0,cellulitis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,hypercholesterolemia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,exanthema,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,edema pulmonary,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,neoplasm metastasis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5399,myocardial infarction,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5400,neoplasm,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5401,hyperlipidemia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5402,neoplasm,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Divide the data into train and test sets.

In [10]:
# Fraction of rows placed in the training set, and the seed that makes the
# shuffle repeatable.
# NOTE(review): 0.25 trains on a quarter of the rows and tests on the rest;
# confirm that `test_size=0.25` was not what was intended.
TRAIN_FRACTION = 0.25
SPLIT_SEED = 1121218

# The label the classifier has to predict.
target = df['Disease']

# Every remaining column is a feature, except 'Unnamed: 0': that is the name
# pandas assigns to a CSV column with an empty header, and in the frame shown
# above it holds a running row number rather than a symptom (presumably a saved
# index -- verify against the CSV). Feeding it to the model would let the trees
# split on row position, so it is dropped. errors='ignore' keeps the cell
# working if the column is absent.
features = df.drop(columns=['Disease', 'Unnamed: 0'], errors='ignore')

# Stratify on the label so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    stratify=target,
    train_size=TRAIN_FRACTION,
    random_state=SPLIT_SEED,
)


Define and train the model.

In [11]:
# Create the classifier; no arguments are passed, so the library defaults apply.
xgb_cl = xgb.XGBClassifier()
# Train on the training split. Left as the final expression so the notebook
# echoes the fitted estimator.
xgb_cl.fit(X=X_train, y=y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

Checking the accuracy of the model.

In [32]:
# Number of individual predictions to echo. Printing one line per test row
# floods the cell output and buries the accuracy line at the bottom.
PREVIEW_ROWS = 10

# Predict: one label per test row, plus the per-class probability rows.
preds = xgb_cl.predict(X_test)
preds_proba = xgb_cl.predict_proba(X_test)

print(preds)
print(preds_proba)

# Largest probability in each row, computed in one vectorized call instead of
# calling np.max once per row inside the loop.
confidences = preds_proba.max(axis=1)
for label, confidence in zip(preds[:PREVIEW_ROWS], confidences[:PREVIEW_ROWS]):
  print("Prediction: {} | Probability: {}".format(label, confidence))
if len(preds) > PREVIEW_ROWS:
  print("... ({} more predictions not shown)".format(len(preds) - PREVIEW_ROWS))

# Score: share of test rows whose predicted label equals the true label.
model_accuracy = accuracy_score(y_test, preds)
print('Model Accuracy: {}%'.format(model_accuracy*100))


['obesity' 'paroxysmal  dyspnea' 'myocardial  infarction' ... 'neoplasm'
 'hernia' 'neoplasm']
[[0.00046931 0.0004309  0.00044947 ... 0.00038568 0.00039424 0.0003869 ]
 [0.00060367 0.00055427 0.00057815 ... 0.0004961  0.00058202 0.00049767]
 [0.0001504  0.00013809 0.00014404 ... 0.0001236  0.00012634 0.00012399]
 ...
 [0.00035493 0.00032588 0.00033993 ... 0.00038551 0.00029815 0.0002926 ]
 [0.00060357 0.00055418 0.00057806 ... 0.00049603 0.00050703 0.00056253]
 [0.00035493 0.00032588 0.00033993 ... 0.00038551 0.00029815 0.0002926 ]]
Prediction: obesity | Probability: 0.9919725656509399
Prediction: paroxysmal  dyspnea | Probability: 0.9894815683364868
Prediction: myocardial  infarction | Probability: 0.9974368810653687
Prediction: cellulitis | Probability: 0.9944232106208801
Prediction: cellulitis | Probability: 0.9944232106208801
Prediction: paroxysmal  dyspnea | Probability: 0.9894815683364868
Prediction: myocardial  infarction | Probability: 0.9974368810653687
Prediction: sickle  cel

['obesity',
 'paroxysmal  dyspnea',
 'myocardial  infarction',
 'cellulitis',
 'cellulitis',
 'paroxysmal  dyspnea',
 'myocardial  infarction',
 'sickle  cell anemia',
 'hypercholesterolemia',
 'cardiomyopathy',
 'neoplasm',
 'ulcer  peptic',
 'hypercholesterolemia',
 'anxiety  state',
 'anxiety  state',
 'stenosis  aortic valve',
 'embolism  pulmonary',
 'exanthema',
 'edema  pulmonary',
 'cardiomyopathy',
 'kidney  disease',
 'ischemia',
 'hypercholesterolemia',
 'hypercholesterolemia',
 'anxiety  state',
 'hypercholesterolemia',
 'hyperlipidemia',
 'cellulitis',
 'neoplasm',
 'embolism  pulmonary',
 'hypercholesterolemia',
 'hernia',
 'neoplasm',
 'hernia',
 'cellulitis',
 'neoplasm',
 'myocardial  infarction',
 'ischemia',
 'hypercholesterolemia',
 'myocardial  infarction',
 'stenosis  aortic valve',
 'ischemia',
 'anxiety  state',
 'myocardial  infarction',
 'neoplasm',
 'edema  pulmonary',
 'ischemia',
 'ischemia',
 'embolism  pulmonary',
 'ischemia',
 'hernia',
 'myocardial  inf