## This module performs Data preprocessing & improvises it with NLP & Web scraping

In [1]:
# Establish connection to use Google drive

from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Filter & ignore warnings for clear output visualization

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Import necessary packages

import re
import csv
import math
import pickle
import operator
import requests
import numpy as np
import pandas as pd
from time import time
from statistics import mean
from bs4 import BeautifulSoup
from collections import Counter
from itertools import combinations
from collections import OrderedDict

In [4]:
# Download necessary NLP libraries

import nltk
nltk.download("wordnet")
nltk.download("stopwords")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Import necessary NLP packages

from nltk.corpus import wordnet 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

In [6]:
# Global initializations

stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()
splitter = RegexpTokenizer(r'\w+')

In [7]:
# Do this only if you use Google colab

!pip3 install pickle5

Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[K     |████████████████████████████████| 256 kB 5.2 MB/s eta 0:00:01
[?25hInstalling collected packages: pickle5
Successfully installed pickle5-0.0.12


In [8]:
# Load dataset by setting its base path
# Defines a function to read the pickle file generated in Data collection step to obtain disease symptoms

import pickle5 as pickle

def readDiseaseSymptoms():
    
    disease_symptoms = None
    file_path = "drive/MyDrive/Dataset/final_disease_symptoms.pickle"

    with open(file_path, "rb") as handle:
        disease_symptoms = pickle.load(handle)
      
    #with open(file_path, 'rb') as handle:
        #disease_symptoms = pickle.load(handle)

    return disease_symptoms
    

In [9]:
# Defines a function that iterates over all diseases and preprocess respective symptoms string 
# by and breaking it into individual symptoms per disease

def getUniqueAndCleanedDiseaseSymptoms(disease_symptoms):
    
    # Set to store all unique symptoms
    unique_symptoms = set() 
    
    # Ordered disctionary of the form <Disease, [List of symptoms]>
    cleaned_diseases_symptoms = OrderedDict() 
    
    for key in sorted(disease_symptoms.keys()):
        value = disease_symptoms[key]
        
        # Normalize the case & split by comma
        symptoms_list = re.sub(r"\[\S+\]", "", value).lower().split(',')
        temp_list1 = symptoms_list
        symptoms_list = []
        
        # Adds into symptoms list only if it has atleast one symptom
        for symptom in temp_list1:
            if len(symptom.strip()) > 0:
                symptoms_list.append(symptom.strip())
                
        # Remove "none" from symptoms
        if "none" in symptoms_list: 
            symptoms_list.remove("none")
            
        # Initially, we do not have anything in the list & hence, we do not need further steps
        if len(symptoms_list) == 0:
            continue
            
        # Performs basic NLP processing (lemmatization) on Symptoms
        temp_list2 = []
        for symptom in symptoms_list:
            symptom = symptom.replace('-', ' ')
            symptom = symptom.replace("'", '')
            symptom = symptom.replace('(', '')
            symptom = symptom.replace(')', '')
            symptom = ' '.join([lemmatizer.lemmatize(word) for word in splitter.tokenize(symptom) 
                            if word not in stop_words and not word[0].isdigit()])
            unique_symptoms.add(symptom)
            temp_list2.append(symptom)
            
        # Adds the preprocessed list of symptoms associated with the respective disease into a dictionary
        cleaned_diseases_symptoms[key] = temp_list2

    # Sorts the total unique symptoms in ascending order
    unique_symptoms = list(unique_symptoms)
    unique_symptoms.sort()
    #print(len(unique_symptoms))
    
    return unique_symptoms, cleaned_diseases_symptoms


In [10]:
# Defines a function to return the list of synonyms of the input word from thesauras.com and wordnet
# This is a helper method for getSymptomSynonyms()

def getSynonymsFromThesaurusAndWithNLP(term):
    synonyms = []
    response = requests.get('https://www.thesaurus.com/browse/{}'.format(term))
    thesaurusContents = BeautifulSoup(response.content,  "html.parser")
    
    # Exception handling
    try:
        container = thesaurusContents.find('section', {'class': 'MainContentContainer'}) 
        row = container.find('div',{'class':'css-191l5o0-ClassicContentCard'})
        row = row.find_all('li')
        for synonym in row:
            synonyms.append(synonym.get_text())
    except:
        None
        
    for synonym in wordnet.synsets(term):
        synonyms += synonym.lemma_names()
        
    return set(synonyms) 

In [11]:
# Defines a method to store each symptom's synonyms as a list of words
# We consider several combinations of symptoms possible => All possible subsets of symptoms a disease can have
# This step takes 20-30 minutes

def getSymptomSynonyms(unique_symptoms):
    
    symptom_synonyms = dict()
    count = 0
    
    for symptom in unique_symptoms:
        count = count + 1
        if count % 50 == 0:
            print("Processed ", count, " unique symptoms")
            
        symp = symptom.split()
        all_symptom_combinations = set()
        
        for combination in range(1, len(symp)+1):
            for subset in combinations(symp, combination):
                subset = ' '.join(subset)
                subset = getSynonymsFromThesaurusAndWithNLP(subset) 
                all_symptom_combinations.update(subset)
         
        # Update & process stored_symptoms
        all_symptom_combinations.add(symptom)
        all_symptom_combinations = ' '.join(all_symptom_combinations).replace('_',' ').lower()
        all_symptom_combinations = list(set(all_symptom_combinations.split()))
        all_symptom_combinations.sort()
        
        symptom_synonyms[symptom] = all_symptom_combinations
        #print(symptom, ":", all_symptom_combinations)
        
    return symptom_synonyms
        

In [12]:
# Defines a function to find similarity between each symptom pair & eliminate the redundant ones
# For this, we find the Jaccard score
# If Jaccard > threshold, then those synonyms are similar and one of them can be used in place of another

def getUniqueSymptomsWithUniqueMeaning(unique_symptoms, symptom_synonyms):
    
    unique_symptoms = sorted(unique_symptoms, key=len, reverse=True) 
    matched_symptoms = dict()
    final_symptoms = set()
    
    for i, symptom_i in enumerate(unique_symptoms):
        for j in range(i+1, len(unique_symptoms)):
            
            symptom_j = unique_symptoms[j]
            synonyms_of_symptom_i = set(symptom_synonyms[symptom_i])
            synonyms_of_symptom_j = set(symptom_synonyms[symptom_j])
            
            # Finds Jaccard similarity: Lengths of Intersection over Union computation
            jaccard = len(synonyms_of_symptom_i.intersection(synonyms_of_symptom_j)) / len(synonyms_of_symptom_i.union(synonyms_of_symptom_j))
            
            if jaccard > 0.75:
                #print(symptom_i, "->", symptom_j)
                # Store similar symptoms in dictionary by replacing as matched_symptoms[symptom_j] = symptom_i
                if symptom_i in matched_symptoms.keys():
                    matched_symptoms[symptom_j] =  matched_symptoms[symptom_i]
                else:
                    matched_symptoms[symptom_j] = symptom_i
                    
    # Gets rid of common symptoms
    final_symptoms = set(unique_symptoms).difference(set(matched_symptoms.keys()))
    
    #print(len(new_symptoms))
    #print(matched_symptoms)
    
    # Sort the final list of symptoms obtained
    final_unique_symptoms = list(final_symptoms)
    final_unique_symptoms.sort()
    final_unique_symptoms = ['Disease_Name'] + final_unique_symptoms
    
    return final_unique_symptoms, matched_symptoms
    

In [24]:
# Defines a function that performs statistical analysis on the dataset we have
# After trying multiple alternatives and running for about a 100 times, I decided to leave few diseases 
# As they are outliers because 99% of the data falls below the length of 12 Symptoms

def analyzeDataAndClean(cleaned_diseases_symptoms):

    # Analyze the max, min & individual counts of symptoms associated with a disease
    maxLength = 0
    minLength = 0
    counts = dict()

    for key,values in cleaned_diseases_symptoms.items():
        count_val = 1
        count_key = "Length" + str(len(values))
        if count_key in counts:
            count_val = counts.get(count_key) + 1
        counts[count_key] = count_val

        if len(values) > maxLength:
            maxLength = len(values)
        if len(values) < minLength:
            minLength = len(values)

    print("Minimum length of symptoms associated with any disease is: ", minLength)
    print("Maximum length of symptoms associated with any disease is: ", maxLength, "\n")
    print(counts)  
    

In [26]:
# Defines a function to create CSV files with the data for Normal dataset
# This step takes 5-10 mins

def saveDataToNormalCSVfile(final_unique_symptoms, matched_symptoms, cleaned_diseases_symptoms):

    # Initialize a dataframe for normal dataset
    normal_df = pd.DataFrame(columns=final_unique_symptoms)
    count = 0
    
    # Read each disease and corresponding symptom list, convert it into dictionary and add to dataframe
    for key, values in cleaned_diseases_symptoms.items():

        # We remove length 31 symptoms because it is the only thing which falls in the outliers category
        if len(values) > 15 :
            continue

        count = count + 1
        if count % 50 == 0:
            print("Processed ", count, " symptoms")
        
        key = str.encode(key).decode('utf-8')
        unique_values = []

        if '' in values:
            values.remove('')
        
        # For similar symptoms, replace with the value in dictionary
        for symptom in values:
            if symptom in matched_symptoms.keys():
                unique_values.append(matched_symptoms[symptom])
            else:
                unique_values.append(symptom)

        values = list(set(unique_values))
        cleaned_diseases_symptoms[key] = values
        #print(key, "---> ", values, "\n")
        
        # Populate a row in normal dataset
        normal_row = dict({x:0 for x in final_unique_symptoms})
        
        for symptom in values:
            normal_row[symptom] = 1  
        normal_row['Disease_Name'] = key
        
        # Append row to the normal dataset's dataframe
        normal_df = normal_df.append(pd.Series(normal_row), ignore_index=True)

    normal_df.to_csv("drive/MyDrive/Dataset/Disease_Symptom_Dataset_For_Respective_Symptoms.csv", index=None)


In [31]:
# Defines a function to create CSV files with the data for Combination dataset
# This step takes 25-30 mins

def saveDataToCombinationCSVfile(final_unique_symptoms, matched_symptoms, cleaned_diseases_symptoms):
    
    # Initialize a dataframe for combination dataset
    combination_df = pd.DataFrame(columns=final_unique_symptoms)
    count = 0

    # Read each disease and corresponding symptom list, convert it into dictionary and add to dataframe
    for key, values in cleaned_diseases_symptoms.items():

       # We remove length 31 symptoms because it is the only thing which falls in the outliers category
        if len(values) > 15 :
            continue

        count = count + 1
        if count % 50 == 0:
            print("\n\nProcessed ", count, " diseases\n\n")

        key = str.encode(key).decode('utf-8')
        unique_values = []

        if '' in values:
            values.remove('')
        
        # For similar symptoms, replace with the value in dictionary
        for symptom in values:
            if symptom in matched_symptoms.keys():
                unique_values.append(matched_symptoms[symptom])
            else:
                unique_values.append(symptom)

        values = list(set(unique_values))
        cleaned_diseases_symptoms[key] = values
        print(key, "---> ", values)

        # So, based on this analysis on max & min of symptoms associated with a disease, we crunch the no. of subsets by increasing min subset length
        # Here, we consider the upper cap and take floor of that number as the minimum length of the subset
        # We use this length while making the subsets
        # This helps us to form a sensible dataset
        minSubsetLength = 0
        symptomsLength = len(values)

        # Consider subsets starting from length 1
        if symptomsLength <= 5:
            minSubsetLength = 1

        # Consider subsets starting from length 40% of symptoms length
        elif symptomsLength > 5 and symptomsLength <= 10:
            minSubsetLength = math.floor(symptomsLength * 0.4)

        # Consider subsets starting from length 50% of symptoms length
        elif symptomsLength >10 and symptomsLength <= 15:
            minSubsetLength = math.floor(symptomsLength * 0.5)

        # Populate all possible subset rows for combination dataset
        for combination in range(minSubsetLength, len(values) + 1):
            for subset in combinations(values, combination):
                combination_row = dict({x:0 for x in final_unique_symptoms})
                for symptom in list(subset):
                    combination_row[symptom] = 1
                    
                #print(list(subset))
                combination_row['Disease_Name'] = key
                
                # Append row to the combination dataset's dataframe for each combination
                combination_df = combination_df.append(pd.Series(combination_row), ignore_index=True)
                
                #try:
                    #matches = combination_df[(combination_df == comb_row).all(axis=1)]
                    #pandas.concat([df1,df2]).drop_duplicates().reset_index(drop=True)
                    #pd.concat([df1,df2], ignore_index=True)..drop_duplicates('Disease_Name')
                    #combination_df = combination_df.append(pd.Series(combination_row), ignore_index=True)
                #except Exception as e:
                    #print(str(e))
                             
    # Exports the dataset into CSV files      
    combination_df.to_csv("drive/MyDrive/Dataset/Disease_Symptom_Dataset_For_All_Symptom_Subsets.csv", index=None)


In [15]:
# Performs data preprocessing including NLP improvization
print("\n---------- Started Data preprocessing !! ----------")

# Obtains disease & respective symptoms data from the pickle file generated in Data Collection step
disease_symptoms = readDiseaseSymptoms()
print("\nFinished reading data from the pickle file !")



---------- Started Data preprocessing !! ----------

Finished reading data from the pickle file !


In [16]:
# Obtains all the Unique symptoms & Cleaned disease symptoms

unique_symptoms, cleaned_diseases_symptoms = getUniqueAndCleanedDiseaseSymptoms(disease_symptoms)
print("Obtained unique symptoms & cleaned disease-symptoms data !")
print("Total unique symptoms: ", len(unique_symptoms), "\n")


Obtained unique symptoms & cleaned disease-symptoms data !
Total unique symptoms:  623 



In [17]:
# Obtains all the synonyms associated with each symptom in all possible combinations

symptom_synonyms = getSymptomSynonyms(unique_symptoms)
print("\nObtained all synonyms associated with each symptom in all possible combinations !")


Processed  50  unique symptoms
Processed  100  unique symptoms
Processed  150  unique symptoms
Processed  200  unique symptoms
Processed  250  unique symptoms
Processed  300  unique symptoms
Processed  350  unique symptoms
Processed  400  unique symptoms
Processed  450  unique symptoms
Processed  500  unique symptoms
Processed  550  unique symptoms
Processed  600  unique symptoms

Obtained all synonyms associated with each symptom in all possible combinations !


In [28]:
# Obtains unique symptoms with unique meaning by removing similar sense symptoms

final_unique_symptoms, matched_symptoms = getUniqueSymptomsWithUniqueMeaning(unique_symptoms, symptom_synonyms)
if '' in final_unique_symptoms:
    final_unique_symptoms.remove('')
print("Obtained unique symptoms with unique word sense !")
print("Total unique symptoms with unique word sense: ", len(final_unique_symptoms), "\n")


Obtained unique symptoms with unique word sense !
Total unique symptoms with unique word sense:  565 



In [25]:
# Perform data analysis & clean dataset

analyzeDataAndClean(cleaned_diseases_symptoms)
print("Data analysis & clean up done !")


Minimum length of symptoms associated with any disease is:  0
Maximum length of symptoms associated with any disease is:  31 

{'Length4': 53, 'Length3': 65, 'Length1': 52, 'Length5': 42, 'Length2': 37, 'Length9': 2, 'Length10': 1, 'Length6': 17, 'Length7': 10, 'Length8': 6, 'Length31': 1, 'Length11': 3, 'Length12': 1}
Data analysis & clean up done !


In [29]:
# Saves data into CSV files - Part 1

saveDataToNormalCSVfile(final_unique_symptoms, matched_symptoms, cleaned_diseases_symptoms)
print("Saved data into Normal Dataset's CSV file !")


Processed  50  symptoms
Processed  100  symptoms
Processed  150  symptoms
Processed  200  symptoms
Processed  250  symptoms
Saved data into Normal Dataset's CSV file !


In [32]:
# Saves data into CSV files - Part 2

saveDataToCombinationCSVfile(final_unique_symptoms, matched_symptoms, cleaned_diseases_symptoms)
print("Saved data into Combination Dataset's CSV file !")
print("\n---------- Finished Data preprocessing successfully !! ----------")


Abnormal uterine bleeding --->  ['abnormally frequent', 'prolonged cough', 'excessive amount uterine bleeding', 'irregular']
Abscess --->  ['redness', 'ear pain', 'leg swelling']
Acquired capillary haemangioma of eyelid --->  ['raised red blue lesion']
Acquired immuno deficiency syndrome --->  ['flu like illness']
Acute encephalitis syndrome --->  ['confusion', 'stiff neck', 'headache', 'nausea vomiting', 'fever']
Adult inclusion conjunctivitis --->  ['scratchiness', 'reddish eye']
Alcohol abuse and alcoholism --->  ['drinking large amount alcohol long period', 'difficulty cutting', 'acquiring drinking alcohol taking lot time', 'usage resulting problem', 'withdrawal occurring stopping']
Alopecia (hair loss) --->  ['loss hair part head body']
Alzheimer --->  ['problem language', 'disorientation', 'mood swing', 'memory loss']
Amaurosis fugax --->  ['temporary fleeting vision one eye']
Amblyopia --->  ['decreased vision']
Amoebiasis --->  ['testicular pain', 'bloody diarrhea']
Anaemia ---