# Disease Prediction System

A machine learning model that predicts diseases based on the provided symptoms.

## Importing the libraries

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the dataset

In [4]:
dataset = pd.read_csv('datasets/Training.csv')
print(dataset.head())

   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  \
0       0           0             0        0                 0  ...   
1       0           0             0        0                 0  ...   
2       0           0             0        0                 0  ...   
3       0           0             0        0                 0  ...   
4       0           0             0        0                 0  ...   

   blackheads  scurring  skin_peeling  silver_like_dusting  \
0           0         0             

In [7]:
dataset.shape

(4920, 133)

In [74]:
dataset['prognosis'].unique()

array(['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis',
       'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ',
       'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine',
       'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice',
       'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia',
       'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins',
       'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia',
       'Osteoarthristis', 'Arthritis',
       '(vertigo) Paroymsal  Positional Vertigo', 'Acne',
       'Urinary tract infection', 'Psoriasis', 'Impetigo'], dtype=object)

In [8]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Encoding Prognosis

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [14]:
print(y)

[15 15 15 ... 38 35 27]


# Training, Testing and Checking Classifier Models Accuracy

## Support Vector Machine (SVM)

In [24]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

svc = SVC(kernel='linear')
svc.fit(X_train, y_train)

In [25]:
svc_pred = svc.predict(X_test)

In [26]:
svc_confusion_matrix = confusion_matrix(y_test, svc_pred)
svc_accuracy = accuracy_score(y_test, svc_pred)

In [27]:
print(f"Confusion matrix of SVC: \n{svc_confusion_matrix}")
print(f"Accuracy of SVC: {svc_accuracy}")

Confusion matrix of SVC: 
[[35  0  0 ...  0  0  0]
 [ 0 32  0 ...  0  0  0]
 [ 0  0 37 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 44]]
Accuracy of SVC: 1.0


## Random Forest Classification

In [31]:
from sklearn.ensemble import RandomForestClassifier

rforest = RandomForestClassifier(n_estimators=100)
rforest.fit(X_train, y_train)

In [32]:
rforest_pred = rforest.predict(X_test)

In [33]:
rforest_confusion_matrix = confusion_matrix(y_test, rforest_pred)
rforest_accuracy = accuracy_score(y_test, rforest_pred)

In [34]:
print(f"Confusion matrix of Random Forest: \n{rforest_confusion_matrix}")
print(f"Accuracy of Random Forest: {rforest_accuracy}")

Confusion matrix of Random Forest: 
[[35  0  0 ...  0  0  0]
 [ 0 32  0 ...  0  0  0]
 [ 0  0 37 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 44]]
Accuracy of Random Forest: 1.0


## Gradient Boosting Classifier

In [35]:
from sklearn.ensemble import GradientBoostingClassifier

gboost = GradientBoostingClassifier(n_estimators=100)
gboost.fit(X_train, y_train)

In [37]:
gboost_pred = gboost.predict(X_test)

In [38]:
gboost_confusion_matrix = confusion_matrix(y_test, gboost_pred)
gboost_accuracy = accuracy_score(y_test, gboost_pred)

In [39]:
print(f"Confusion matrix of Gradient Boost: \n{gboost_confusion_matrix}")
print(f"Accuracy of Gradient Boost: {gboost_accuracy}")

Confusion matrix of Gradient Boost: 
[[35  0  0 ...  0  0  0]
 [ 0 32  0 ...  0  0  0]
 [ 0  0 37 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 44]]
Accuracy of Gradient Boost: 1.0


## K Nearest Neighbors

In [40]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [41]:
knn_pred = knn.predict(X_test)

In [42]:
knn_confusion_matrix = confusion_matrix(y_test, knn_pred)
knn_accuracy = accuracy_score(y_test, knn_pred)

In [43]:
print(f"Confusion matrix of KNN: \n{knn_confusion_matrix}")
print(f"Accuracy of KNN: {knn_accuracy}")

Confusion matrix of KNN: 
[[35  0  0 ...  0  0  0]
 [ 0 32  0 ...  0  0  0]
 [ 0  0 37 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 44]]
Accuracy of KNN: 1.0


## Naive Bayes

In [44]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train, y_train)

In [45]:
nb_pred = nb.predict(X_test)

In [46]:
nb_confusion_matrix = confusion_matrix(y_test, nb_pred)
nb_accuracy = accuracy_score(y_test, nb_pred)

In [47]:
print(f"Confusion matrix of Naive Bayes: \n{nb_confusion_matrix}")
print(f"Accuracy of Naive Bayes: {nb_accuracy}")

Confusion matrix of Naive Bayes: 
[[35  0  0 ...  0  0  0]
 [ 0 32  0 ...  0  0  0]
 [ 0  0 37 ...  0  0  0]
 ...
 [ 0  0  0 ... 43  0  0]
 [ 0  0  0 ...  0 37  0]
 [ 0  0  0 ...  0  0 44]]
Accuracy of Naive Bayes: 1.0


# Single Prediction

In [64]:
print(svc.predict(X_test[37].reshape(1, -1))[0])
print(y_test[37])

Hypoglycemia
Hypoglycemia


In [65]:
print(rforest.predict(X_test[120].reshape(1, -1))[0])
print(y_test[120])

Hypothyroidism
Hypothyroidism


In [66]:
print(gboost.predict(X_test[199].reshape(1, -1))[0])
print(y_test[199])

Migraine
Migraine


In [67]:
print(knn.predict(X_test[287].reshape(1, -1))[0])
print(y_test[287])

Osteoarthristis
Osteoarthristis


In [68]:
print(nb.predict(X_test[997].reshape(1, -1))[0])
print(y_test[997])

Hyperthyroidism
Hyperthyroidism


# Exporting Models as pkl

In [70]:
import pickle

In [71]:
pickle.dump(svc, open('models/svc.pkl', 'wb'))
pickle.dump(rforest, open('models/rforest.pkl', 'wb'))
pickle.dump(gboost, open('models/gboost.pkl', 'wb'))
pickle.dump(knn, open('models/knn.pkl', 'wb'))
pickle.dump(nb, open('models/nb.pkl', 'wb'))

# Predicting Diseases with Symptoms

In [77]:
diseases = dataset['prognosis'].unique()
print(diseases)

['Fungal infection' 'Allergy' 'GERD' 'Chronic cholestasis' 'Drug Reaction'
 'Peptic ulcer diseae' 'AIDS' 'Diabetes ' 'Gastroenteritis'
 'Bronchial Asthma' 'Hypertension ' 'Migraine' 'Cervical spondylosis'
 'Paralysis (brain hemorrhage)' 'Jaundice' 'Malaria' 'Chicken pox'
 'Dengue' 'Typhoid' 'hepatitis A' 'Hepatitis B' 'Hepatitis C'
 'Hepatitis D' 'Hepatitis E' 'Alcoholic hepatitis' 'Tuberculosis'
 'Common Cold' 'Pneumonia' 'Dimorphic hemmorhoids(piles)' 'Heart attack'
 'Varicose veins' 'Hypothyroidism' 'Hyperthyroidism' 'Hypoglycemia'
 'Osteoarthristis' 'Arthritis' '(vertigo) Paroymsal  Positional Vertigo'
 'Acne' 'Urinary tract infection' 'Psoriasis' 'Impetigo']


In [94]:
major_symptoms = dataset.columns[:-1]
print(major_symptoms)
print(len(major_symptoms))

Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'pus_filled_pimples', 'blackheads', 'scurring', 'skin_peeling',
       'silver_like_dusting', 'small_dents_in_nails', 'inflammatory_nails',
       'blister', 'red_sore_around_nose', 'yellow_crust_ooze'],
      dtype='object', length=132)
132


## Load database and recommendation logic

In [120]:
symtom_des = pd.read_csv("datasets/symtoms_df.csv")
precautions = pd.read_csv("datasets/precautions_df.csv")
workout = pd.read_csv("datasets/workout_df.csv")
description = pd.read_csv("datasets/description.csv")
medications = pd.read_csv("datasets/medications.csv")
diets = pd.read_csv("datasets/diets.csv")

In [230]:
def helper(disease):
    desc = description[description['Disease'] == disease]['Description']
    desc = " ".join([w for w in desc])

    pre = precautions[precautions['Disease'] == disease][['Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4']]
    pre = [col for col in pre.values][0]

    med = medications[medications['Disease'] == disease]['Medication']
    med = [med for med in med.values][0].strip("[]").replace("'", "").split(", ")

    die = diets[diets['Disease'] == disease]['Diet']
    die = [die for die in die.values][0].strip("[]").replace("'", "").split(", ")

    workouts = workout[workout['disease'] == disease]['workout']
    workouts = [workout for workout in workouts.values]

    return desc, pre, med, die, workouts

In [231]:
def predict_disease(symptoms):
    symptoms_list = np.zeros(len(major_symptoms))

    for symp in symptoms:
        indx = np.where(major_symptoms == symp.lower().replace(' ', '_'))[0]
        symptoms_list[indx] = 1
    disease = svc.predict(symptoms_list.reshape(1, -1))[0]

    disease_info = helper(disease)
    result = {
        "Disease": disease,
        "Description": disease_info[0],
        "Precautions": disease_info[1],
        "Medications": disease_info[2],
        "Diet": disease_info[3],
        "Workouts": disease_info[4],
    }
    return result

In [232]:
print(predict_disease(['itching', 'red sore around nose', 'blister']))

{'Disease': 'Impetigo', 'Description': 'Impetigo is a highly contagious skin infection causing red sores that can break open.', 'Precautions': array(['soak affected area in warm water', 'use antibiotics',
       'remove scabs with wet compressed cloth', 'consult doctor'],
      dtype=object), 'Medications': ['Topical antibiotics', 'Oral antibiotics', 'Antiseptics', 'Ointments', 'Warm compresses'], 'Diet': ['Impetigo Diet', 'Antibiotic treatment', 'Fruits and vegetables', 'Hydration', 'Protein-rich foods'], 'Workouts': ['Maintain good hygiene', 'Stay hydrated', 'Consume nutrient-rich foods', 'Limit sugary foods and beverages', 'Include foods rich in vitamin C', 'Consult a healthcare professional', 'Follow medical recommendations', 'Avoid scratching', 'Take prescribed antibiotics', 'Practice wound care']}


In [233]:
print(predict_disease(['inflammatory nails', 'pus filled pimples']))

{'Disease': 'Acne', 'Description': 'Acne is a skin condition that occurs when hair follicles become clogged with oil and dead skin cells.', 'Precautions': array(['bath twice', 'avoid fatty spicy food', 'drink plenty of water',
       'avoid too many products'], dtype=object), 'Medications': ['Antibiotics', 'Pain relievers', 'Antihistamines', 'Corticosteroids', 'Topical treatments'], 'Diet': ['Acne Diet', 'Low-Glycemic Diet', 'Hydration', 'Fruits and vegetables', 'Probiotics'], 'Workouts': ['Consume a balanced diet', 'Limit dairy and high-glycemic foods', 'Include antioxidants', 'Stay hydrated', 'Limit processed foods', 'Include zinc-rich foods', 'Consult a skincare professional', 'Practice good skincare hygiene', 'Limit sugary foods and beverages', 'Follow medical recommendations']}


In [234]:
print(helper('Acne')[3])

['Acne Diet', 'Low-Glycemic Diet', 'Hydration', 'Fruits and vegetables', 'Probiotics']
