Importing the necessary libraries.

In [8]:
import csv
import numpy as np
import pandas as pd
from collections import defaultdict

Load the scraped dataset CSV file into a data frame.

In [2]:
# Path of the scraped disease/symptom table.
CSV_FILEPATH = 'dataset/raw_data.csv'

# Read the table with pandas' default parsing options and display it.
data = pd.read_csv(filepath_or_buffer=CSV_FILEPATH)
data

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall
...,...,...,...
1861,,,UMLS:C0425251_bedridden^UMLS:C0741453_bedridden
1862,,,UMLS:C0242453_prostatism
1863,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0232257_systolic murmur
1864,,,UMLS:C0871754_frail


Filling in the null values using forward fill.

In [3]:
# Only the first row of each disease carries the Disease / Count values; the
# rows below it are NaN after loading. Carry the last seen value forward.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` keyword of fillna is deprecated in recent pandas releases.
data = data.ffill()
data

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall
...,...,...,...
1861,UMLS:C0233472_affect labile,45.0,UMLS:C0425251_bedridden^UMLS:C0741453_bedridden
1862,UMLS:C0233472_affect labile,45.0,UMLS:C0242453_prostatism
1863,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0232257_systolic murmur
1864,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0871754_frail


Preview the column names.

In [13]:
# Column labels of the data frame, in order, as a plain Python list.
columns = data.columns.tolist()
columns

['Disease', 'Count of Disease  Occurrence', 'Symptom']

Save each column name to its own variable.

In [14]:
# Positional lookup: the columns are ordered disease, occurrence count, symptom.
disease_col_name, count_col_name, symptom_col_name = (
    columns[0],
    columns[1],
    columns[2],
)


Function to process the names of the entries.

In [6]:
def process_name(data):
  """Return the readable names embedded in a coded entry string.

  An entry looks like ``UMLS:C0008031_pain chest``, and several entries may be
  joined with ``^``. After treating ``^`` like ``_`` and splitting on ``_``,
  the pieces alternate code, name, code, name, ... so every second piece
  (positions 1, 3, 5, ...) is kept.

  Args:
    data: The raw entry string.

  Returns:
    A list with the name parts, in their original order. The list is empty
    when the string contains no ``_`` or ``^`` separator.
  """
  pieces = data.replace('^', '_').split('_')
  return pieces[1::2]

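Clean the names and split the data frame into a list of disease names, a dictionary mapping each disease to its symptoms, and a dictionary mapping each disease to its occurrence count.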

In [15]:
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

for idx, record in data.iterrows():
    # Refresh the current disease names and occurrence count, unless the
    # disease cell is the empty string or the "\xc2\xa0" placeholder.
    disease_cell = record[disease_col_name]
    if not (disease_cell == "\xc2\xa0" or disease_cell == ""):
        disease_list = process_name(data=disease_cell)
        count = record[count_col_name]

    # Rows whose symptom cell is empty / placeholder contribute nothing.
    symptom_cell = record[symptom_col_name]
    if symptom_cell == "\xc2\xa0" or symptom_cell == "":
        continue

    # Attach every symptom name on this row to every current disease name,
    # and record the occurrence count for each of those diseases.
    symptom_list = process_name(data=symptom_cell)
    for d in disease_list:
        for s in symptom_list:
            disease_symptom_dict[d].append(s)
        disease_symptom_count[d] = count

Write the cleaned data into a CSV file.

In [16]:
CSV_FILEPATH = 'dataset/dataset_clean.csv'

# newline='' lets the csv module control line endings itself. Without it,
# platforms that translate '\n' on write (e.g. Windows) produce '\r\r\n',
# which shows up as a blank row after every record.
with open(CSV_FILEPATH, 'w', newline='') as csvfile:
  writer = csv.writer(csvfile)
  for key, value in disease_symptom_dict.items():
    # One row per (disease, symptom) pair, followed by the disease's count.
    writer.writerows([key, v, disease_symptom_count[key]] for v in value)

Restructure the data frame.

In [18]:
# The cleaned file has no header row, so the column labels are supplied here.
columns = ['Disease', 'Symptom', 'Occurence']
data = pd.read_csv(
    CSV_FILEPATH,
    header=None,
    names=columns,
    encoding='ISO-8859-1',
)
data

Unnamed: 0,Disease,Symptom,Occurence
0,hypertensive disease,pain chest,3363.0
1,hypertensive disease,shortness of breath,3363.0
2,hypertensive disease,dizziness,3363.0
3,hypertensive disease,asthenia,3363.0
4,hypertensive disease,fall,3363.0
...,...,...,...
2125,affect labile,bedridden,45.0
2126,affect labile,prostatism,45.0
2127,decubitus ulcer,systolic murmur,42.0
2128,decubitus ulcer,frail,42.0


Re-save the data frame to the same file so that the CSV now includes a header row and no index column.

In [19]:
# Overwrite the cleaned file; index=False keeps the row index out of the CSV.
data.to_csv(path_or_buf=CSV_FILEPATH, index=False)

Acquire the unique diseases from the dataset and display the count.

In [22]:
# Distinct disease names, in order of first appearance.
unique_diseases = data.loc[:, 'Disease'].unique()
print('Diseases Count: ', len(unique_diseases))

Diseases Count:  148


Display the list of diseases.

In [23]:
# Print each distinct disease name on its own line.
for disease_name in unique_diseases:
  print(disease_name)

hypertensive  disease
diabetes
depression  mental
depressive disorder
coronary  arteriosclerosis
coronary heart disease
pneumonia
failure  heart congestive
accident  cerebrovascular
asthma
myocardial  infarction
hypercholesterolemia
infection
infection  urinary tract
anemia
chronic  obstructive airway disease
dementia
insufficiency  renal
confusion
degenerative  polyarthritis
hypothyroidism
anxiety  state
malignant  neoplasms
primary malignant neoplasm
acquired  immuno-deficiency  syndrome
HIV
hiv infections
cellulitis
gastroesophageal  reflux disease
septicemia
systemic  infection
sepsis (invertebrate)
deep  vein thrombosis
dehydration
neoplasm
embolism  pulmonary
epilepsy
cardiomyopathy
chronic  kidney failure
carcinoma
hepatitis  C
peripheral  vascular disease
psychotic  disorder
hyperlipidemia
bipolar  disorder
obesity
ischemia
cirrhosis
exanthema
benign  prostatic hypertrophy
kidney  failure acute
mitral  valve insufficiency
arthritis
bronchitis
hemiparesis
osteoporosis
transient 

Acquire the unique symptoms from the dataset and display the count.

In [24]:
# Distinct symptom names, in order of first appearance.
unique_symptoms = data.loc[:, 'Symptom'].unique()
print('No. of symptoms: ', len(unique_symptoms))

No. of symptoms:  446


Display the list of symptoms.

In [25]:
# Print each distinct symptom name on its own line.
for symptom_name in unique_symptoms:
  print(symptom_name)

pain  chest
shortness  of breath
dizziness
asthenia
fall
syncope
vertigo
sweat
sweating  increased
palpitation
nausea
angina  pectoris
pressure  chest
polyuria
polydypsia
orthopnea
rale
unresponsiveness
mental  status changes
vomiting
labored breathing
feeling  suicidal
suicidal
hallucinations  auditory
feeling  hopeless
weepiness
sleeplessness
motor  retardation
irritable  mood
blackout
mood  depressed
hallucinations  visual
worry
agitation
tremor
intoxication
verbal  auditory hallucinations
energy  increased
difficulty
nightmare
unable  to concentrate
homelessness
hypokinesia
dyspnea  on exertion
chest  tightness
cough
fever
decreased  translucency
productive  cough
pleuritic  pain
yellow  sputum
breath  sounds decreased
chill
rhonchus
green  sputum
non-productive  cough
wheezing
haemoptysis
distress  respiratory
tachypnea
malaise
night  sweat
jugular  venous distention
dyspnea
dysarthria
speech  slurred
facial  paresis
hemiplegia
seizure
numbness
symptom  aggravating factors
pain ch

One-hot encode the symptoms so that each unique symptom becomes its own indicator column.

In [28]:
# One indicator column per distinct symptom value, one row per input row.
df_symptoms = pd.get_dummies(data.loc[:, "Symptom"])
df_symptoms

Unnamed: 0,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abnormally hard consistency.1,abortion,...,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum,yellow sputum.1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2125,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2126,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2127,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2128,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Extract the disease column as a Series, with one entry per row of the data frame.

In [30]:
# The Disease column on its own; it shares the row index with df_symptoms.
df_diseases = data.loc[:, 'Disease']
df_diseases

0       hypertensive  disease
1       hypertensive  disease
2       hypertensive  disease
3       hypertensive  disease
4       hypertensive  disease
                ...          
2125           affect  labile
2126           affect  labile
2127         decubitus  ulcer
2128         decubitus  ulcer
2129         decubitus  ulcer
Name: Disease, Length: 2130, dtype: object

Concatenate the two dataframes into one.

In [31]:
# Build the disease-by-symptom table in a single chain:
#   1. put the disease name next to its symptom indicator columns,
#   2. drop repeated (disease, symptom) rows so each pair is counted once,
#   3. sum the indicators per disease, keeping first-appearance order,
#   4. turn the Disease group key back into a regular column.
df = (
    pd.concat([df_diseases, df_symptoms], axis=1)
    .drop_duplicates(keep='first')
    .groupby('Disease', sort=False)
    .sum()
    .reset_index()
)
df

Unnamed: 0,Disease,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abnormally hard consistency.1,...,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum,yellow sputum.1
0,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,diabetes,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,depression mental,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,depressive disorder,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,coronary arteriosclerosis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,ileus,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
144,adhesion,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
145,delusion,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
146,affect labile,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Confirm the number of disease entries.

In [33]:
# Number of rows, i.e. one per disease after grouping.
df.shape[0]

148

Save the processed data.

In [36]:
# Destination of the final disease-by-symptom table.
CSV_FILEPATH = 'dataset/processed_data.csv'

# index=False keeps the row index out of the saved file.
df.to_csv(path_or_buf=CSV_FILEPATH, index=False)