Importing the necessary libraries.

In [1]:
import csv
import numpy as np
import pandas as pd
from collections import defaultdict

Load the scraped dataset CSV file into a data frame.

In [2]:
# Read the raw scrape and, in one chain, replace each double space found in the
# cell text with a single space.
CSV_FILEPATH = 'dataset/raw_data.csv'
data = pd.read_csv(CSV_FILEPATH).replace({"  ": " "}, regex=True)
data

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall
...,...,...,...
1898,,,UMLS:C0027497_nausea
1899,,,UMLS:C0042963_vomiting
1900,,,UMLS:C0015672_fatigue
1901,,,UMLS:C0085593_chill


Filling in the null values using forward fill.

In [3]:
# Forward fill: every missing cell takes the value of the closest non-missing
# cell above it in the same column, so each symptom row ends up carrying its
# disease and occurrence count.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in pandas 2.1+.
data = data.ffill()
data

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall
...,...,...,...
1898,UMLS:C0041327_pulmonary tuberculosis,42.0,UMLS:C0027497_nausea
1899,UMLS:C0041327_pulmonary tuberculosis,42.0,UMLS:C0042963_vomiting
1900,UMLS:C0041327_pulmonary tuberculosis,42.0,UMLS:C0015672_fatigue
1901,UMLS:C0041327_pulmonary tuberculosis,42.0,UMLS:C0085593_chill


Preview the column names.

In [4]:
# Column labels of the frame as a plain Python list.
columns = data.columns.tolist()
columns

['Disease', 'Count of Disease  Occurrence', 'Symptom']

Store each column name in its own variable.

In [5]:
# Positional assignment: first label is the disease column, second the
# occurrence count, third the symptom column.
disease_col_name, count_col_name, symptom_col_name = columns[0], columns[1], columns[2]


Function to process the names of the entries.

In [6]:
def process_name(data):
  """Return the names embedded in a raw ``code_name`` entry.

  Every ``^`` in ``data`` is first turned into ``_`` and the result is split
  on ``_``. The pieces in even positions when counting from one (the 2nd,
  4th, ... piece) are returned in their original order, e.g.
  ``'UMLS:C0008031_pain chest'`` gives ``['pain chest']``.

  Args:
    data: the raw entry string.

  Returns:
    A new list of strings; empty when ``data`` contains no ``_`` or ``^``.
  """
  pieces = data.replace('^', '_').split('_')
  # Zero-based indexes 1, 3, 5, ... are the 2nd, 4th, 6th, ... pieces.
  return pieces[1::2]

Clean the names and split the data frame into a list of disease names and two dictionaries (symptoms per disease, occurrence count per disease).

In [7]:
# Build, for every cleaned disease name, the list of its cleaned symptom names
# (disease_symptom_dict) and its occurrence count (disease_symptom_count).
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

# Cell values treated as "nothing here". '\xa0' is the non-breaking space as a
# Python 3 string; '\xc2\xa0' (its UTF-8 bytes written as two characters) is
# kept so that anything the previous guard skipped is still skipped.
blank_cells = ("", "\xa0", "\xc2\xa0")

# Walk the three columns in lockstep instead of building a Series per row with
# iterrows().
rows = zip(data[disease_col_name], data[count_col_name], data[symptom_col_name])
for disease_cell, count_cell, symptom_cell in rows:
    # Extract the Disease Names: a non-blank disease cell switches the current
    # disease name(s) and count used for the rows that follow.
    if disease_cell not in blank_cells:
        disease = disease_cell
        disease_list = process_name(data=disease)
        count = count_cell

    # Extract the Symptoms: each symptom name of a non-blank cell is attached
    # to every current disease name.
    if symptom_cell not in blank_cells:
        symptom = symptom_cell
        symptom_list = process_name(data=symptom)
        for d in disease_list:
            for s in symptom_list:
                disease_symptom_dict[d].append(s)
            disease_symptom_count[d] = count

Write the cleaned data into a CSV file.

In [8]:
# Write one "disease, symptom, count" row per recorded symptom; no header row
# is written here.
CSV_FILEPATH = 'dataset/dataset_clean.csv'

# newline='' is what the csv module asks for on files it writes to; without it,
# platforms that translate '\n' to '\r\n' (Windows) get a blank line after
# every row.
# NOTE(review): the encoding is left at the platform default, while the next
# cell reads this file as ISO-8859-1 - confirm the two agree if any name can be
# non-ASCII.
with open(CSV_FILEPATH, 'w', newline='') as csvfile:
  writer = csv.writer(csvfile)
  for key, value in disease_symptom_dict.items():
    for v in value:
      # key is already a str, so no encode/decode round trip is needed.
      writer.writerow([key, v, disease_symptom_count[key]])

Restructure the data frame.

In [9]:
# Labels supplied to read_csv through `names=` for the three columns of the file.
# NOTE(review): 'Occurence' is misspelled, but it is the label stored in the
# frame; correcting it would change the column name that later saves write out.
columns = ['Disease', 'Symptom', 'Occurence']
data = pd.read_csv(CSV_FILEPATH, names=columns, encoding='ISO-8859-1')
data

Unnamed: 0,Disease,Symptom,Occurence
0,hypertensive disease,pain chest,3363.0
1,hypertensive disease,shortness of breath,3363.0
2,hypertensive disease,dizziness,3363.0
3,hypertensive disease,asthenia,3363.0
4,hypertensive disease,fall,3363.0
...,...,...,...
2164,pulmonary tuberculosis,nausea,42.0
2165,pulmonary tuberculosis,vomiting,42.0
2166,pulmonary tuberculosis,fatigue,42.0
2167,pulmonary tuberculosis,chill,42.0


Re-save the data frame to the same file as a proper CSV, this time with a header row.

In [10]:
# Write the frame to CSV_FILEPATH; index=False keeps the row index out of the file.
data.to_csv(CSV_FILEPATH, index=False)

Acquire the unique diseases from the dataset and display the count.

In [11]:
# Distinct values of the Disease column, followed by how many there are.
unique_diseases = data['Disease'].unique()
print('Diseases Count: ', len(unique_diseases))

Diseases Count:  152


Display the list of diseases.

In [12]:
# Print one disease name per line.
for disease in unique_diseases:
  print(disease)

hypertensive disease
diabetes
depression mental
depressive disorder
coronary arteriosclerosis
coronary heart disease
pneumonia
failure heart congestive
accident cerebrovascular
asthma
myocardial infarction
hypercholesterolemia
infection
infection urinary tract
anemia
chronic obstructive airway disease
dementia
insufficiency renal
confusion
degenerative polyarthritis
hypothyroidism
anxiety state
malignant neoplasms
primary malignant neoplasm
acquired immuno-deficiency syndrome
HIV
hiv infections
cellulitis
gastroesophageal reflux disease
septicemia
systemic infection
sepsis (invertebrate)
deep vein thrombosis
dehydration
neoplasm
embolism pulmonary
epilepsy
cardiomyopathy
chronic kidney failure
carcinoma
hepatitis C
peripheral vascular disease
psychotic disorder
hyperlipidemia
bipolar disorder
obesity
ischemia
cirrhosis
exanthema
benign prostatic hypertrophy
kidney failure acute
mitral valve insufficiency
arthritis
bronchitis
hemiparesis
osteoporosis
transient ischemic attack
adenocarci

Acquire the unique symptoms from the dataset and display the count.

In [13]:
# Distinct values of the Symptom column, followed by how many there are.
unique_symptoms = data['Symptom'].unique()
print('No. of symptoms: ', len(unique_symptoms))

No. of symptoms:  405


Display the list of symptoms.

In [14]:
# Print one symptom name per line.
for symptom in unique_symptoms:
  print(symptom)

pain chest
shortness of breath
dizziness
asthenia
fall
syncope
vertigo
sweat
sweating increased
palpitation
nausea
angina pectoris
pressure chest
polyuria
polydypsia
orthopnea
rale
unresponsiveness
mental status changes
vomiting
labored breathing
feeling suicidal
suicidal
hallucinations auditory
feeling hopeless
weepiness
sleeplessness
motor retardation
irritable mood
blackout
mood depressed
hallucinations visual
worry
agitation
tremor
intoxication
verbal auditory hallucinations
energy increased
difficulty
nightmare
unable to concentrate
homelessness
hypokinesia
dyspnea on exertion
chest tightness
cough
fever
decreased translucency
productive cough
pleuritic pain
yellow sputum
breath sounds decreased
chill
rhonchus
green sputum
non-productive cough
wheezing
haemoptysis
distress respiratory
tachypnea
malaise
night sweat
jugular venous distention
dyspnea
dysarthria
speech slurred
facial paresis
hemiplegia
seizure
numbness
symptom aggravating factors
st segment elevation
st segment depres

Transform the list of symptoms into columns.

In [15]:
# One indicator column per distinct symptom; each row has a 1 in the column of
# its own symptom and 0 elsewhere.
# dtype=int pins the indicators to integers: pandas 2.0 changed the default
# dummy dtype to bool, which would turn this 0/1 matrix into True/False.
df_symptoms = pd.get_dummies(data["Symptom"], dtype=int)
df_symptoms

Unnamed: 0,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,abscess bacterial,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2164,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2165,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2166,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2167,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Extract the disease column as a Series, with one entry per row.

In [16]:
# Selecting a single column yields a Series named 'Disease' (not a DataFrame),
# despite the df_ prefix.
df_diseases = data['Disease']
df_diseases

0         hypertensive disease
1         hypertensive disease
2         hypertensive disease
3         hypertensive disease
4         hypertensive disease
                 ...          
2164    pulmonary tuberculosis
2165    pulmonary tuberculosis
2166    pulmonary tuberculosis
2167    pulmonary tuberculosis
2168    pulmonary tuberculosis
Name: Disease, Length: 2169, dtype: object

Concatenate the two dataframes into one.

In [17]:
# Build the disease x symptom matrix in one chain:
#   1. put the Disease column next to the symptom indicator columns,
#   2. drop rows that are exact repeats (first occurrence is kept),
#   3. sum the indicators per disease, keeping diseases in first-seen order,
#   4. turn the Disease group key back into an ordinary column.
df = (
    pd.concat([df_diseases, df_symptoms], axis=1)
    .drop_duplicates(keep='first')
    .groupby('Disease', sort=False)
    .sum()
    .reset_index()
)
df

Unnamed: 0,Disease,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,diabetes,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,depression mental,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,depressive disorder,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,coronary arteriosclerosis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,decubitus ulcer,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
148,polyneuropathy,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
149,refractive error,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
150,acute nasopharyngitis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Confirm the number of disease entries.

In [18]:
# Number of rows in df.
len(df)

152

Save the processed data.

In [19]:
# Write df to its own file; index=False keeps the row index out of the CSV.
CSV_FILEPATH = 'dataset/processed_data.csv'
df.to_csv(CSV_FILEPATH, index=False)