In [1]:
import pandas as pd
import numpy as np
import pickle

### Load dataset

In [2]:
symp_dict = pickle.load(open("Disease-symptoms.pkl",'rb'))

In [3]:
symp_dict

{'Alcohol Abuse and Alcoholism': 'Drinking large amounts of alcohol over a long period, difficulty cutting down, acquiring and drinking alcohol taking up a lot of time, usage resulting in problems, withdrawal occurring when stopping,',
 'Alopecia (hair loss)': 'Loss of hair from part of the head or body,',
 'Anxiety': 'Social isolation, hypervigilance, feeling of inferiority, low self-esteem, difficulty socializing with others,',
 'Appendicitis': 'Right lower abdominal pain, vomiting, decreased appetite,',
 'Autism': 'Trouble with social interaction, verbal and nonverbal communication, and presence of restricted interests and repetitive behavior,',
 'Acquired Immuno Deficiency Syndrome': 'Flu-like illness,',
 "Alzheimer's Disease": 'Difficulty in remembering recent events, problems with language, disorientation, mood swings,',
 'Anaemia': 'Feeling tired, pale skin, weakness, shortness of breath, feeling like passing out,',
 'Arthritis': 'Joint pain, stiffness, redness, swelling, decrea

# Data Preprocessing

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from time import time

In [18]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
stop_words = stopwords.words('english')

In [3]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [6]:
lemmatizer  = WordNetLemmatizer()

In [7]:
from nltk.tokenize import RegexpTokenizer

In [8]:
splitter = RegexpTokenizer(r'\w+')

In [9]:
total_symp = set()

In [10]:
cleaned_symp_dict = {}

In [11]:
for key in symp_dict.keys():
    value = symp_dict[key]
    symp_list = value.lower().split(',')
    temp_list = []
    for sym in symp_list:
        if len(sym.strip())>0:
            temp_list.append(sym.strip())
    if "none" in temp_list:
        temp_list.remove("none")
    if len(temp_list)==0:
        continue
    temp =[]
    for sym in temp_list:
        sym=sym.replace('-',' ')
        sym=sym.replace("'",'')
        sym=sym.replace('(','')
        sym=sym.replace(')','')
        sym = ' '.join([lemmatizer.lemmatize(word) for word in splitter.tokenize(sym) if word not in stop_words and not word[0].isdigit()])
        total_symp.add(sym)
        temp.append(sym)
    cleaned_symp_dict[key] = temp
    
    
    

In [12]:
cleaned_symp_dict

{'Alcohol Abuse and Alcoholism': ['drinking large amount alcohol long period',
  'difficulty cutting',
  'acquiring drinking alcohol taking lot time',
  'usage resulting problem',
  'withdrawal occurring stopping'],
 'Alopecia (hair loss)': ['loss hair part head body'],
 'Anxiety': ['social isolation',
  'hypervigilance',
  'feeling inferiority',
  'low self esteem',
  'difficulty socializing others'],
 'Appendicitis': ['right lower abdominal pain',
  'vomiting',
  'decreased appetite'],
 'Autism': ['trouble social interaction',
  'verbal nonverbal communication',
  'presence restricted interest repetitive behavior'],
 'Acquired Immuno Deficiency Syndrome': ['flu like illness'],
 "Alzheimer's Disease": ['difficulty remembering recent event',
  'problem language',
  'disorientation',
  'mood swing'],
 'Anaemia': ['feeling tired',
  'pale skin',
  'weakness',
  'shortness breath',
  'feeling like passing'],
 'Arthritis': ['joint pain',
  'stiffness',
  'redness',
  'swelling',
  'decreas

In [15]:
total_symptoms = list(total_symp)
total_symptoms.sort()
total_symptoms=['disease_name']+total_symptoms

In [16]:
total_symptoms

['disease_name',
 'abdominal pain',
 'abnormal bleeding',
 'abnormal sensation',
 'abnormally frequent',
 'abnormally small head brain',
 'abscess',
 'acne',
 'acquiring drinking alcohol taking lot time',
 'affected part turning white',
 'anemia',
 'anxiety',
 'arm',
 'asthenopia',
 'asymptomatic',
 'back',
 'bad breath',
 'bad smelling vaginal discharge',
 'barking cough',
 'better sitting worse lying',
 'birth baby younger week gestational age',
 'black area skin',
 'bleeding',
 'bleeding gum',
 'bleeding skin',
 'blindness',
 'blister break open form small ulcer',
 'bloating',
 'blood stool',
 'bloody diarrhea',
 'blue',
 'blurred vision',
 'blurry vision',
 'breast',
 'breathing problem',
 'bruising',
 'burning',
 'burning sensation',
 'burning stabbing pain',
 'burning urination',
 'case asymptomatic',
 'certain thought repeatedly',
 'change bowel movement',
 'change breast shape',
 'change hair',
 'change reflex',
 'change skin color red black',
 'change sleeping eating pattern',

In [17]:
df = pd.DataFrame(columns = total_symptoms)

In [20]:
from itertools import combinations
from tqdm import tqdm

In [21]:
for key,value in tqdm(cleaned_symp_dict.items()):
    key = str.encode(key).decode('utf-8')
    for comb in range(1,len(value)+1):
        for subset in combinations(value,comb):
            row_comb = dict({x:0 for x in total_symptoms})
            for sym in list(subset):
                row_comb[sym]=1
            row_comb['disease_name']=key
            df = df.append(pd.Series(row_comb),ignore_index=True)
            

100%|██████████| 178/178 [21:00<00:00,  7.08s/it]  


In [23]:
df.shape

(14343, 416)

In [27]:
df.to_csv("disease_symptoms_dataset_comb.csv",index=None)