## This module performs web scraping to obtain a list of diseases and their respective symptoms

In [1]:
# Filter & ignore warnings for clear output visualization
# NOTE: this silences every warning category for the whole kernel session.
# NOTE(review): the scraping functions in this notebook call requests.get(..., verify=False);
# the TLS warnings those calls would normally raise are presumably hidden by this filter too.

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Install google module for python

!pip install google



In [3]:
# Import all necessary packages

import re
import time
import pickle
import string
import requests
from bs4 import BeautifulSoup
from googlesearch import search

### Functions to Obtain list of diseases from the website: www.nhp.gov.in

In [4]:
# Defines a function to obtain diseases list from nhp

def getDiseasesFromNhp():
    """Scrape the alphabetical disease index of www.nhp.gov.in.

    One page per lower-case letter (``<base_url>/a`` ... ``<base_url>/z``) is
    downloaded and the text of every ``<li>`` inside the ``<div class="all-disease">``
    element is collected.

    Returns:
        set: the stripped disease names found across all letter pages.

    Raises:
        requests.RequestException: if a page cannot be fetched (including a timeout).
    """
    base_url = "https://www.nhp.gov.in/disease-a-z/"

    # A set is filled directly so duplicates are dropped as they are read
    diseases = set()

    # Diseases are ordered alphabetically on the website, one page per letter
    for letter in string.ascii_lowercase:

        # Track progress
        print(" --> Obtaining diseases that start with " + letter)

        # Form the target URL we are interested in
        target_url = base_url + letter

        # Add some sleep time to avoid affecting the performance of the web server
        time.sleep(1)

        # Obtain target page without verifying certificate.
        # NOTE(review): verify=False disables TLS certificate checks; kept because the
        # original relies on it, but it should be removed if the site's certificate validates.
        # The timeout stops a stalled connection from hanging the notebook forever.
        target_page = requests.get(target_url, verify=False, timeout=30)

        # Pull data from the HTML page by web scraping using BeautifulSoup
        page_contents = BeautifulSoup(target_page.content, "html5lib")

        # Within the <div>(all-disease) there is a <ul> whose <li> items hold the disease names
        all_diseases = page_contents.find("div", class_="all-disease")

        # Some pages may not contain the container at all (error page, letter without entries);
        # skip them instead of failing with AttributeError on None
        if all_diseases is None:
            print("     (no disease list found for " + letter + ", skipping)")
            continue

        for tag in all_diseases.find_all("li"):
            diseases.add(tag.get_text().strip())

    return diseases


In [5]:
# Defines a function to obtain diseases from pickle file

def getDiseasesFromPickle():
    """Load the additional disease names stored in ``other_diseases_pkl.txt``.

    The file is read from the current working directory and un-pickled; whatever
    iterable it contains is converted to a set before being returned.

    Returns:
        set: the distinct items held in the pickle file.
    """
    # Pickle handles the de-serialization of the stored collection
    with open("other_diseases_pkl.txt", "rb") as pickle_file:
        loaded_names = pickle.load(pickle_file)

    # Collapse to a set so repeated names appear only once
    return set(loaded_names)


In [6]:
# Defines a function to obtain the final list of diseases

def getFinalDiseases(diseases1, diseases2):
    """Merge two collections of disease names into one sorted, capitalized list.

    Args:
        diseases1 (set): first set of disease names.
        diseases2 (iterable): second collection of disease names.

    Returns:
        list: every distinct name from both inputs, passed through ``str.capitalize()``
        and sorted in ascending order.
    """
    # Every name is capitalized so the sort order is not affected by letter case.
    # The capitalized names are collected in a set because two spellings that differ
    # only in case (e.g. "malaria" / "Malaria") become identical after capitalize()
    # and would otherwise appear twice in the final list.
    capitalized_diseases = {disease.capitalize() for disease in diseases1.union(diseases2)}

    # Sort all the diseases for convenience
    capitalized_diseases_list = sorted(capitalized_diseases)

    # Print analysis results
    print("Total diseases: ", len(capitalized_diseases_list))

    return capitalized_diseases_list


### Functions to obtain the list of symptoms associated with diseases from Wikipedia

In [7]:
# Defines a function to get symptoms for respective disease

def _cleanSymptomsCell(cell_html):
    """Convert the raw HTML of an infobox "Symptoms" cell into comma-separated text."""
    symptom = cell_html.replace(".", "")
    symptom = symptom.replace(";", ",")

    # Remove bold text
    symptom = re.sub(r'<b.*?/b>:', ',', symptom)

    # Remove hyperlink
    symptom = re.sub(r'<a.*?>', '', symptom)
    symptom = re.sub(r'</a>', '', symptom)

    # Remove all the tags
    symptom = re.sub(r'<[^<]+?>', ', ', symptom)

    # Remove citation text. Each bracketed marker is matched on its own: a greedy
    # `\[.*\]` would also delete every symptom lying between the first and the last
    # citation on the same line.
    symptom = re.sub(r'\[[^\]]*\]', '', symptom)

    # Drop the stand-alone commas left behind by the removed tags
    return ' '.join([x for x in symptom.split() if x != ','])


def _findSymptomsInInfobox(info_table):
    """Return the cleaned text of the infobox row headed "Symptoms", or None if absent."""
    for row in info_table.find_all("tr"):
        row_header = row.find("th", {"scope": "row"})
        if row_header is None:
            continue

        if row_header.get_text().strip() == "Symptoms":
            symptoms_cell = row.find("td")
            # A header without a data cell carries no symptoms; str(None) must not be stored
            if symptoms_cell is None:
                continue
            return _cleanSymptomsCell(str(symptoms_cell))

    return None


def getSymptomsFromWiki(all_diseases):
    """Look up the symptoms of each disease in the infobox of its Wikipedia page.

    For every disease a Google search for "<disease> wikipedia" is issued. The search
    results whose URL contains "wikipedia" are fetched in order until one of them has
    an infobox table with a "Symptoms" row; the cleaned text of that row is stored.

    Args:
        all_diseases (iterable): disease names to look up.

    Returns:
        dict: maps each disease for which symptoms were found to the cleaned symptoms
        text. Diseases without a matching page or row are omitted.
    """
    # Final dictionary of symptoms
    disease_symptoms = {}

    for count, disease in enumerate(all_diseases, start=1):

        # Track progress
        if count % 10 == 0:
            print("Processed ", count, " diseases")
        if count % 100 == 0:
            print("-------------------------------------")

        # Build search query: "disease wikipedia" (the space keeps the two words separate)
        search_query = disease + " wikipedia"

        # Search "disease wikipedia" on Google
        for search_result in search(search_query, tld="co.in", stop=10, pause=0.5):

            # Only wikipedia links are of interest
            if not re.search(r"wikipedia", search_result):
                continue

            # NOTE(review): verify=False disables TLS certificate checks; kept from the
            # original. The timeout stops a stalled connection from hanging the run.
            try:
                wiki_page = requests.get(search_result, verify=False, timeout=30)
            except requests.RequestException as error:
                # One unreachable page should not abort a run that takes many minutes
                print("Skipping", search_result, "-", error)
                continue

            wiki_contents = BeautifulSoup(wiki_page.content, "html5lib")

            # Fetch HTML code for "infobox"
            info_table = wiki_contents.find("table", {"class": "infobox"})
            if info_table is None:
                continue

            symptoms = _findSymptomsInInfobox(info_table)
            if symptoms is not None:
                # Update symptoms and stop looking at further search results
                disease_symptoms[disease] = symptoms
                break

    return disease_symptoms


In [12]:
# Defines a function that removes diseases with duplicate symptoms and saves the result to a pickle file

def saveSymptomsToPickle(disease_symptoms):
    """De-duplicate a disease -> symptoms mapping by value and pickle the result.

    Entries are visited in the mapping's iteration order; an entry is kept only if
    no earlier entry had an equal symptoms value. The surviving entries are written
    to ``final_disease_symptoms.pickle`` in the current working directory.

    Args:
        disease_symptoms (dict): maps a disease name to its symptoms value.

    Returns:
        None
    """
    # Symptoms values already kept; a list is used so any value comparable with == works
    seen_symptoms = []
    unique_entries = {}

    for disease, symptoms in disease_symptoms.items():
        # Skip a disease whose symptoms equal those of one kept earlier
        if symptoms in seen_symptoms:
            continue
        seen_symptoms.append(symptoms)
        unique_entries[disease] = symptoms

    print("Total diseases considered after pre-processing: ", len(unique_entries))

    # Save the dictionary in a pickle file
    with open("final_disease_symptoms.pickle", "wb") as pickle_file:
        pickle.dump(unique_entries, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
            

### Execution steps to Obtain diseases & the symptoms associated with them

In [9]:
# Obtains consolidated list of diseases (This step takes 8-10 mins to execute)
# diseases1    : names scraped from www.nhp.gov.in (one HTTP request per letter)
# diseases2    : names loaded from the local pickle file other_diseases_pkl.txt
# all_diseases : merged result of both; passed to getSymptomsFromWiki() in a later cell

print("\n--------- OBTAINING LIST OF DISEASES ---------\n")
diseases1 = getDiseasesFromNhp()
diseases2 = getDiseasesFromPickle()
all_diseases = getFinalDiseases(diseases1, diseases2)
# Uncomment to inspect the full list
#print(all_diseases)



--------- OBTAINING LIST OF DISEASES ---------

 --> Obtaining diseases that start with a
 --> Obtaining diseases that start with b
 --> Obtaining diseases that start with c
 --> Obtaining diseases that start with d
 --> Obtaining diseases that start with e
 --> Obtaining diseases that start with f
 --> Obtaining diseases that start with g
 --> Obtaining diseases that start with h
 --> Obtaining diseases that start with i
 --> Obtaining diseases that start with j
 --> Obtaining diseases that start with k
 --> Obtaining diseases that start with l
 --> Obtaining diseases that start with m
 --> Obtaining diseases that start with n
 --> Obtaining diseases that start with o
 --> Obtaining diseases that start with p
 --> Obtaining diseases that start with q
 --> Obtaining diseases that start with r
 --> Obtaining diseases that start with s
 --> Obtaining diseases that start with t
 --> Obtaining diseases that start with u
 --> Obtaining diseases that start with v
 --> Obtaining diseases tha

In [13]:
# Obtains preprocessed list of symptoms associated with diseases (This step takes 25-30 mins to execute)
# Requires `all_diseases` from the disease-list cell. For each disease a Google search is issued
# and the matching Wikipedia pages are fetched, so network access is needed.
# disease_symptoms : dict mapping a disease name to its cleaned symptoms text

print("\n--------- OBTAINING LIST OF SYMPTOMS ASSOCIATED WITH DISEASES ---------\n")
disease_symptoms = getSymptomsFromWiki(all_diseases)
print("Finished successfully !")



--------- OBTAINING LIST OF SYMPTOMS ASSOCIATED WITH DISEASES ---------

Processed  10  diseases
Processed  20  diseases
Processed  30  diseases
Processed  40  diseases
Processed  50  diseases
Processed  60  diseases
Processed  70  diseases
Processed  80  diseases
Processed  90  diseases
Processed  100  diseases
-------------------------------------
Processed  110  diseases
Processed  120  diseases
Processed  130  diseases
Processed  140  diseases
Processed  150  diseases
Processed  160  diseases
Processed  170  diseases
Processed  180  diseases
Processed  190  diseases
Processed  200  diseases
-------------------------------------
Processed  210  diseases
Processed  220  diseases
Processed  230  diseases
Processed  240  diseases
Processed  250  diseases
Processed  260  diseases
Processed  270  diseases
Processed  280  diseases
Processed  290  diseases
Processed  300  diseases
-------------------------------------
Processed  310  diseases
Processed  320  diseases
Processed  330  disea

In [14]:
# Saves the obtained result to the pickle file
# Requires `disease_symptoms` from the symptom-scraping cell. Diseases whose symptoms text
# duplicates an earlier entry are dropped before final_disease_symptoms.pickle is written.
# Uncomment to inspect the mapping before saving
#print(disease_symptoms)

saveSymptomsToPickle(disease_symptoms)
print("Diseases & Symptoms saved successfully !")


Total diseases considered after pre-processing:  292
Diseases & Symptoms saved successfully !
