In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
# Load all datasets
df_main = pd.read_csv("dataset.csv")
df_severity = pd.read_csv("Symptom-severity.csv")
df_description = pd.read_csv("symptom_Description.csv")
df_precaution = pd.read_csv("symptom_precaution.csv")


In [3]:
# Show shape and basic info
print("Main Dataset (Symptoms & Diseases):")
print(df_main.shape)
print(df_main.head(), "\n")


Main Dataset (Symptoms & Diseases):
(4920, 18)
            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches       NaN       NaN       NaN       NaN       NaN   
1                   NaN       NaN       NaN       NaN       NaN       NaN   
2                   NaN       NaN       NaN       NaN       NaN       NaN   
3                   NaN       NaN       NaN       NaN       NaN       NaN   
4                   NaN       NaN       NaN       NaN       NaN       NaN   

  S

In [4]:
print("Symptom Severity Dataset:")
print(df_severity.shape)
print(df_severity.head(), "\n")


Symptom Severity Dataset:
(133, 2)
                Symptom  weight
0               itching       1
1             skin_rash       3
2  nodal_skin_eruptions       4
3   continuous_sneezing       4
4             shivering       5 



In [5]:
print("Symptom Description Dataset:")
print(df_description.shape)
print(df_description.head(), "\n")

Symptom Description Dataset:
(41, 2)
          Disease                                        Description
0   Drug Reaction  An adverse drug reaction (ADR) is an injury ca...
1         Malaria  An infectious disease caused by protozoan para...
2         Allergy  An allergy is an immune system response to a f...
3  Hypothyroidism  Hypothyroidism, also called underactive thyroi...
4       Psoriasis  Psoriasis is a common skin disorder that forms... 



In [6]:
print("Disease Precaution Dataset:")
print(df_precaution.shape)
print(df_precaution.head())

Disease Precaution Dataset:
(41, 5)
          Disease                      Precaution_1  \
0   Drug Reaction                   stop irritation   
1         Malaria          Consult nearest hospital   
2         Allergy                    apply calamine   
3  Hypothyroidism                     reduce stress   
4       Psoriasis  wash hands with warm soapy water   

                   Precaution_2        Precaution_3  \
0      consult nearest hospital    stop taking drug   
1               avoid oily food  avoid non veg food   
2       cover area with bandage                 NaN   
3                      exercise         eat healthy   
4  stop bleeding using pressure      consult doctor   

                  Precaution_4  
0                    follow up  
1           keep mosquitos out  
2  use ice to compress itching  
3             get proper sleep  
4                   salt baths  


In [7]:
print('There are {} Diseases'.format(df_main['Disease'].nunique()) )
df_main['Disease'].value_counts()

There are 41 Diseases


Disease
Fungal infection                           120
Hepatitis C                                120
Hepatitis E                                120
Alcoholic hepatitis                        120
Tuberculosis                               120
Common Cold                                120
Pneumonia                                  120
Dimorphic hemmorhoids(piles)               120
Heart attack                               120
Varicose veins                             120
Hypothyroidism                             120
Hyperthyroidism                            120
Hypoglycemia                               120
Osteoarthristis                            120
Arthritis                                  120
(vertigo) Paroymsal  Positional Vertigo    120
Acne                                       120
Urinary tract infection                    120
Psoriasis                                  120
Hepatitis D                                120
Hepatitis B                                120
Aller

In [8]:
df_main.isnull().sum()

Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64

In [10]:
df_main=df_main.fillna(0)

In [11]:
symptoms={}
for i in df_main.columns[1:]:
    sym=set(df_main[i])
    for j in sym:
        if j not in symptoms:
            symptoms[j]=1
        else:
            symptoms[j]+=1
    
symptoms

{' headache': 7,
 ' indigestion': 2,
 ' yellowish_skin': 6,
 ' cramps': 2,
 'itching': 1,
 ' chest_pain': 10,
 ' breathlessness': 7,
 ' chills': 3,
 ' high_fever': 6,
 ' pain_during_bowel_movements': 2,
 ' weight_loss': 4,
 ' constipation': 3,
 ' burning_micturition': 3,
 ' sunken_eyes': 2,
 ' muscle_weakness': 3,
 ' continuous_sneezing': 1,
 ' stiff_neck': 4,
 ' muscle_wasting': 1,
 ' weakness_in_limbs': 2,
 ' bladder_discomfort': 2,
 ' joint_pain': 3,
 ' vomiting': 4,
 ' back_pain': 3,
 ' mood_swings': 4,
 ' neck_pain': 3,
 ' skin_rash': 2,
 ' cough': 5,
 ' patches_in_throat': 2,
 ' shivering': 2,
 ' fatigue': 5,
 ' stomach_pain': 3,
 ' acidity': 2,
 ' pus_filled_pimples': 2,
 ' weight_gain': 2,
 ' dehydration': 2,
 ' sweating': 7,
 ' foul_smell_of urine': 2,
 ' blister': 2,
 ' skin_peeling': 2,
 ' abdominal_pain': 8,
 ' bruising': 2,
 ' cold_hands_and_feets': 2,
 ' dizziness': 5,
 ' restlessness': 3,
 ' ulcers_on_tongue': 2,
 ' blackheads': 2,
 ' weakness_of_one_body_side': 2,
 ' le

In [12]:
len(symptoms)

132

In [13]:
symp=list(symptoms.keys())
symp.remove(0)
symp

[' headache',
 ' indigestion',
 ' yellowish_skin',
 ' cramps',
 'itching',
 ' chest_pain',
 ' breathlessness',
 ' chills',
 ' high_fever',
 ' pain_during_bowel_movements',
 ' weight_loss',
 ' constipation',
 ' burning_micturition',
 ' sunken_eyes',
 ' muscle_weakness',
 ' continuous_sneezing',
 ' stiff_neck',
 ' muscle_wasting',
 ' weakness_in_limbs',
 ' bladder_discomfort',
 ' joint_pain',
 ' vomiting',
 ' back_pain',
 ' mood_swings',
 ' neck_pain',
 ' skin_rash',
 ' cough',
 ' patches_in_throat',
 ' shivering',
 ' fatigue',
 ' stomach_pain',
 ' acidity',
 ' pus_filled_pimples',
 ' weight_gain',
 ' dehydration',
 ' sweating',
 ' foul_smell_of urine',
 ' blister',
 ' skin_peeling',
 ' abdominal_pain',
 ' bruising',
 ' cold_hands_and_feets',
 ' dizziness',
 ' restlessness',
 ' ulcers_on_tongue',
 ' blackheads',
 ' weakness_of_one_body_side',
 ' lethargy',
 ' pain_in_anal_region',
 ' nodal_skin_eruptions',
 ' knee_pain',
 ' loss_of_appetite',
 ' swelling_joints',
 ' nausea',
 ' anxiety',

In [14]:
sort_symp=dict(sorted(symptoms.items(),key =lambda x: x[1],reverse=True))
sort_symp

{0: 14,
 ' chest_pain': 10,
 ' abdominal_pain': 8,
 ' loss_of_appetite': 8,
 ' yellowing_of_eyes': 8,
 ' malaise': 8,
 ' muscle_pain': 8,
 ' headache': 7,
 ' breathlessness': 7,
 ' sweating': 7,
 ' nausea': 7,
 ' diarrhoea': 7,
 ' yellowish_skin': 6,
 ' high_fever': 6,
 ' swelled_lymph_nodes': 6,
 ' irritability': 6,
 ' cough': 5,
 ' fatigue': 5,
 ' dizziness': 5,
 ' blurred_and_distorted_vision': 5,
 ' dark_urine': 5,
 ' phlegm': 5,
 ' weight_loss': 4,
 ' stiff_neck': 4,
 ' vomiting': 4,
 ' mood_swings': 4,
 ' lethargy': 4,
 ' swelling_joints': 4,
 ' obesity': 4,
 ' excessive_hunger': 4,
 ' fast_heart_rate': 4,
 ' depression': 4,
 ' mild_fever': 4,
 ' abnormal_menstruation': 4,
 ' red_spots_over_body': 4,
 ' chills': 3,
 ' constipation': 3,
 ' burning_micturition': 3,
 ' muscle_weakness': 3,
 ' joint_pain': 3,
 ' back_pain': 3,
 ' neck_pain': 3,
 ' stomach_pain': 3,
 ' restlessness': 3,
 ' loss_of_balance': 3,
 ' family_history': 3,
 ' painful_walking': 3,
 ' indigestion': 2,
 ' cramp

In [15]:
set(df_severity['Symptom'])-(set(symp))

{'abdominal_pain',
 'abnormal_menstruation',
 'acidity',
 'acute_liver_failure',
 'altered_sensorium',
 'anxiety',
 'back_pain',
 'belly_pain',
 'blackheads',
 'bladder_discomfort',
 'blister',
 'blood_in_sputum',
 'bloody_stool',
 'blurred_and_distorted_vision',
 'breathlessness',
 'brittle_nails',
 'bruising',
 'burning_micturition',
 'chest_pain',
 'chills',
 'cold_hands_and_feets',
 'coma',
 'congestion',
 'constipation',
 'continuous_feel_of_urine',
 'continuous_sneezing',
 'cough',
 'cramps',
 'dark_urine',
 'dehydration',
 'depression',
 'diarrhoea',
 'dischromic_patches',
 'distention_of_abdomen',
 'dizziness',
 'drying_and_tingling_lips',
 'enlarged_thyroid',
 'excessive_hunger',
 'extra_marital_contacts',
 'family_history',
 'fast_heart_rate',
 'fatigue',
 'fluid_overload',
 'foul_smell_ofurine',
 'headache',
 'high_fever',
 'hip_joint_pain',
 'history_of_alcohol_consumption',
 'increased_appetite',
 'indigestion',
 'inflammatory_nails',
 'internal_itching',
 'irregular_sugar

In [17]:
df1=pd.DataFrame()
df1['Disease']=df_main['Disease']
df1

Unnamed: 0,Disease
0,Fungal infection
1,Fungal infection
2,Fungal infection
3,Fungal infection
4,Fungal infection
...,...
4915,(vertigo) Paroymsal Positional Vertigo
4916,Acne
4917,Urinary tract infection
4918,Psoriasis


In [18]:
df1[symp]=0
df1

  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0
  df1[symp]=0


Unnamed: 0,Disease,headache,indigestion,yellowish_skin,cramps,itching,chest_pain,breathlessness,chills,high_fever,...,red_spots_over_body,receiving_unsterile_injections,sinus_pressure,coma,palpitations,stomach_bleeding,runny_nose,congestion,blood_in_sputum,loss_of_smell
0,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4916,Acne,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4917,Urinary tract infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4918,Psoriasis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
df1.columns=df1.columns.str.strip()
df1.fillna(0)
for i,row in df_main.iterrows():
    for col in df_main.columns[1:]:
        val=row[col]
        if val!=0 or val!='':
            df1.loc[i,val]=1

  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,val]=1
  df1.loc[i,

In [21]:
df1

Unnamed: 0,Disease,headache,indigestion,yellowish_skin,cramps,itching,chest_pain,breathlessness,chills,high_fever,...,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
0,Fungal infection,0,0,0,0,1,0,0,0,0,...,,,,,,,,,,
1,Fungal infection,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2,Fungal infection,0,0,0,0,1,0,0,0,0,...,,,,,,,,,,
3,Fungal infection,0,0,0,0,1,0,0,0,0,...,,,,,,,,,,
4,Fungal infection,0,0,0,0,1,0,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
4916,Acne,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
4917,Urinary tract infection,0,0,0,0,0,0,0,0,0,...,1.0,1.0,1.0,,,,,,,
4918,Psoriasis,0,0,0,0,0,0,0,0,0,...,,,,1.0,1.0,1.0,1.0,,,


In [22]:
df1.columns

Index([                  'Disease',                  'headache',
                     'indigestion',            'yellowish_skin',
                          'cramps',                   'itching',
                      'chest_pain',            'breathlessness',
                          'chills',                'high_fever',
       ...
             ' bladder_discomfort',      ' foul_smell_of urine',
       ' continuous_feel_of_urine',             ' skin_peeling',
            ' silver_like_dusting',     ' small_dents_in_nails',
             ' inflammatory_nails',                  ' blister',
           ' red_sore_around_nose',        ' yellow_crust_ooze'],
      dtype='object', length=263)

In [23]:
df1=df1.fillna(0)
y=df1['Disease'].unique()
X=df1.iloc[:,1:].values

In [24]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,KFold
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier

In [25]:
xtr,xts,ytr,yts=train_test_split(X,df1['Disease'],test_size=.15,random_state=42)
rf=RandomForestClassifier().fit(xtr,ytr)

In [26]:
rf.predict(xts)

array(['Acne', 'Acne', 'Hyperthyroidism', 'AIDS', 'Chronic cholestasis',
       'Hypertension ', 'Hypoglycemia', 'Arthritis', 'Hepatitis B',
       'Migraine', 'Urinary tract infection', 'Diabetes ', 'Hepatitis D',
       'Psoriasis', 'Alcoholic hepatitis', 'Alcoholic hepatitis',
       'Dimorphic hemmorhoids(piles)', 'Hepatitis E', 'Diabetes ',
       'Cervical spondylosis', 'Bronchial Asthma', 'hepatitis A',
       'Hepatitis B', 'Bronchial Asthma', 'Allergy', 'Hepatitis C',
       'Pneumonia', 'Migraine', 'Hypothyroidism', 'Migraine',
       'Chronic cholestasis', 'Hepatitis B', 'Gastroenteritis',
       'Hepatitis E', 'Varicose veins', 'Migraine', 'Jaundice',
       'Drug Reaction', 'Pneumonia', 'Urinary tract infection',
       'Hepatitis C', '(vertigo) Paroymsal  Positional Vertigo',
       'Hypertension ', 'Diabetes ', 'Jaundice', 'Gastroenteritis',
       'Hypoglycemia', 'Hepatitis B', 'Psoriasis', 'Psoriasis',
       'Heart attack', 'Tuberculosis', 'Varicose veins', 'Allergy',

In [27]:
rf.score(xts,yts)

1.0

In [28]:
import pickle
pickle.dump(rf,open('disease_model.pkl','wb'))