In [135]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [136]:
# Read the training split from disk and print a structural summary
# (row count, column count, dtypes, memory footprint).
train = pd.read_csv(filepath_or_buffer="train_data.csv")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3936 entries, 0 to 3935
Columns: 410 entries, index to Symptom_17_0
dtypes: int64(409), object(1)
memory usage: 12.3+ MB


In [137]:
# Read the held-out test split from disk and print the same structural
# summary so it can be compared against the training split.
test = pd.read_csv(filepath_or_buffer="test_data.csv")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Columns: 410 entries, index to Symptom_17_0
dtypes: int64(409), object(1)
memory usage: 3.1+ MB


In [138]:
# Read the disease -> precautions/description table and print its
# structural summary.
data_d = pd.read_csv(filepath_or_buffer="data_disease_precautions.csv")
data_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Disease       41 non-null     object
 1   Precaution_1  41 non-null     object
 2   Precaution_2  41 non-null     object
 3   Precaution_3  41 non-null     object
 4   Precaution_4  41 non-null     object
 5   Description   41 non-null     object
dtypes: object(6)
memory usage: 2.0+ KB


In [139]:
# Shuffle the rows of both splits. A fixed seed keeps the row order (and
# everything computed from it further down) reproducible after a kernel
# restart; without it every run produced a different ordering.
SHUFFLE_SEED = 42
train = train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
test = test.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Separate features from the target. "Disease" is the label column; "index"
# is dropped with it (presumably a row identifier, not a feature - confirm).
train_x = train.drop(columns=["index", "Disease"])
train_y = train["Disease"].values

test_x = test.drop(columns=["index", "Disease"])
test_y = test["Disease"].values

In [140]:
# Sanity check: features and labels of each split must have the same length.
# Printed one value per line: train_x, train_y, test_x, test_y.
print(len(train_x), len(train_y), len(test_x), len(test_y), sep="\n")

3936
3936
984
984


In [141]:
# Label encoding (not one-hot): each disease name is mapped to an integer
# class id. The encoder is fitted on the training labels only.
label_encoder = LabelEncoder().fit(train_y)
train_y_endcoded = label_encoder.transform(train_y)

# Show the encoded labels and the class names known to the encoder.
print(train_y_endcoded)
print(label_encoder.classes_)

# Round-trip a small slice back to disease names as a spot check.
sample_ids = train_y_endcoded[100:120]
print(label_encoder.inverse_transform(sample_ids))

[ 6 32 14 ... 14  3 39]
['(vertigo) Paroymsal  Positional Vertigo' 'AIDS' 'Acne'
 'Alcoholic hepatitis' 'Allergy' 'Arthritis' 'Bronchial Asthma'
 'Cervical spondylosis' 'Chicken pox' 'Chronic cholestasis' 'Common Cold'
 'Dengue' 'Diabetes ' 'Dimorphic hemmorhoids(piles)' 'Drug Reaction'
 'Fungal infection' 'GERD' 'Gastroenteritis' 'Heart attack' 'Hepatitis B'
 'Hepatitis C' 'Hepatitis D' 'Hepatitis E' 'Hypertension '
 'Hyperthyroidism' 'Hypoglycemia' 'Hypothyroidism' 'Impetigo' 'Jaundice'
 'Malaria' 'Migraine' 'Osteoarthristis' 'Paralysis (brain hemorrhage)'
 'Peptic ulcer diseae' 'Pneumonia' 'Psoriasis' 'Tuberculosis' 'Typhoid'
 'Urinary tract infection' 'Varicose veins' 'hepatitis A']
['Hepatitis D' 'Peptic ulcer diseae' 'Heart attack' 'Drug Reaction' 'GERD'
 'Gastroenteritis' 'Drug Reaction' 'Arthritis' 'Gastroenteritis'
 'Hyperthyroidism' 'Varicose veins' 'Gastroenteritis' 'Tuberculosis'
 'Drug Reaction' 'Osteoarthristis' 'Peptic ulcer diseae' 'Osteoarthristis'
 'Peptic ulcer disea

In [145]:
# Booster configuration for a multi-class model ('multi:softprob'), using
# depth-2 trees, a learning rate (eta) of 1 and 4 threads.
# Both 'auc' and 'merror' are reported for every evaluation set.
param = {
    'max_depth': 2,
    'eta': 1,
    'objective': 'multi:softprob',
    'nthread': 4,
    'eval_metric': ['auc', "merror"],
    # Derived from the fitted encoder instead of the hard-coded 41, so the
    # class count always matches the labels actually seen during fitting.
    'num_class': len(label_encoder.classes_),
}

# Wrap features and integer-encoded labels in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(train_x, label=label_encoder.transform(train_y))
dtest = xgb.DMatrix(test_x, label=label_encoder.transform(test_y))

# Datasets whose metrics are logged after each boosting round, with the
# name used as the prefix in the log ('eval-...' / 'train-...').
evallist = [(dtest, 'eval'), (dtrain, 'train')]

In [146]:
# Number of boosting rounds to run.
num_round = 10

# Train the booster and log the metrics of every dataset in `evallist` after
# each round. `evals` is passed by keyword because newer XGBoost releases
# treat it as keyword-only and warn on positional use.
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=evallist)

[0]	eval-auc:0.99897	eval-merror:0.03455	train-auc:0.99863	train-merror:0.03862
[1]	eval-auc:0.99976	eval-merror:0.01219	train-auc:0.99971	train-merror:0.00915
[2]	eval-auc:0.99985	eval-merror:0.00102	train-auc:0.99982	train-merror:0.00127
[3]	eval-auc:0.99990	eval-merror:0.00102	train-auc:0.99988	train-merror:0.00127
[4]	eval-auc:0.99997	eval-merror:0.00102	train-auc:0.99997	train-merror:0.00127
[5]	eval-auc:0.99999	eval-merror:0.00102	train-auc:0.99999	train-merror:0.00127
[6]	eval-auc:1.00000	eval-merror:0.00102	train-auc:0.99999	train-merror:0.00127
[7]	eval-auc:1.00000	eval-merror:0.00102	train-auc:1.00000	train-merror:0.00127
[8]	eval-auc:1.00000	eval-merror:0.00000	train-auc:1.00000	train-merror:0.00000
[9]	eval-auc:1.00000	eval-merror:0.00000	train-auc:1.00000	train-merror:0.00000


In [147]:
def predict(ddata, model=None, encoder=None):
    """Return the predicted class label for every row of ``ddata``.

    Parameters
    ----------
    ddata
        Passed unchanged to ``model.predict``; in this notebook an
        ``xgb.DMatrix``.
    model : optional
        Object whose ``predict(ddata)`` returns a 2-D array-like with one row
        of per-class scores per sample. Defaults to the notebook-level ``bst``.
    encoder : optional
        Object whose ``inverse_transform`` maps class indices back to labels.
        Defaults to the notebook-level ``label_encoder``.

    Returns
    -------
    The result of ``encoder.inverse_transform`` applied to the index of the
    highest score in each row (the first index wins on ties).
    """
    # Fall back to the notebook globals so existing `predict(dtest)` calls
    # keep working unchanged.
    model = bst if model is None else model
    encoder = label_encoder if encoder is None else encoder

    scores = np.asarray(model.predict(ddata))
    # One vectorised argmax over the class axis replaces the per-row loop.
    predicted_index = scores.argmax(axis=1)
    return encoder.inverse_transform(predicted_index)

In [148]:
# Predicted disease name for every row of the test DMatrix.
predict_arr = predict(ddata=dtest)

In [149]:
# Fraction of test rows whose predicted disease equals the true label.
# (accuracy_score / confusion_matrix are imported in the top import cell.)
accuracy_score(test_y, predict_arr)

1.0

In [151]:
# Reference for the XGBoost parameters used above: https://xgboost.readthedocs.io/en/latest/parameter.html

In [152]:
# Save Model
# Persist the trained booster to disk so it can be restored later.
bst.save_model(fname='xgbboost_1.model')

In [156]:
# Load Model
# Create an empty booster configured for 4 threads, then restore the saved
# model from disk into it. This rebinds `bst` to the reloaded booster.
bst = xgb.Booster(params={'nthread': 4})
bst.load_model(fname='xgbboost_1.model')