Importing the necessary libraries.

In [1]:
import csv
import numpy as np
import pandas as pd
from collections import defaultdict

Load the scraped dataset CSV file into a data frame.

In [2]:
# Path of the scraped dataset, relative to the notebook's working directory.
CSV_FILEPATH = 'dataset/raw_data.csv'
data = pd.read_csv(CSV_FILEPATH)
# Collapse every run of two or more spaces into a single space. A plain
# two-space pattern replaces each pair only once, so a run of three or more
# spaces would still leave a double space behind.
data = data.replace({r" {2,}": " "}, regex=True)
data

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall
...,...,...,...
1861,,,UMLS:C0425251_bedridden^UMLS:C0741453_bedridden
1862,,,UMLS:C0242453_prostatism
1863,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0232257_systolic murmur
1864,,,UMLS:C0871754_frail


Narrow the dataset to the `Disease` column.

In [3]:
# Keep every row but only the 'Disease' column; the list selector keeps the
# result a one-column DataFrame rather than a Series.
data = data.loc[:, ['Disease']]
data

Unnamed: 0,Disease
0,UMLS:C0020538_hypertensive disease
1,
2,
3,
4,
...,...
1861,
1862,
1863,UMLS:C0011127_decubitus ulcer
1864,


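Drop the rows with null values. Only the first row of each disease group carries the disease name, so this leaves one row per disease entry.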

In [4]:
# Remove every row that contains a missing value (the defaults, spelled out).
data = data.dropna(axis=0, how='any')
data

Unnamed: 0,Disease
0,UMLS:C0020538_hypertensive disease
12,UMLS:C0011847_diabetes
26,UMLS:C0011570_depression mental^UMLS:C0011581_...
47,UMLS:C0010054_coronary arteriosclerosis^UMLS:C...
56,UMLS:C0032285_pneumonia
...,...
1806,UMLS:C1258215_ileus
1821,UMLS:C0001511_adhesion
1834,UMLS:C0011253_delusion
1855,UMLS:C0233472_affect labile


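Define a helper that splits a raw entry such as `UMLS:C0011570_depression mental^UMLS:C0011581_depressive disorder` into a flat list of alternating UMLS codes and names.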

In [5]:
def process_name(data):
    """Split a raw dataset entry into a flat list of UMLS codes and names.

    An entry looks like ``UMLS:C0011570_depression mental`` and may hold
    several such items joined by ``^``. Each item is split at its first
    underscore only, so a name that itself contains an underscore stays in
    one piece and the code/name alternation of the result is preserved.

    Args:
        data: Raw entry string.

    Returns:
        list of str: The pieces in their original order, e.g.
        ``['UMLS:C0011570', 'depression mental',
        'UMLS:C0011581', 'depressive disorder']``. An item without an
        underscore contributes a single piece.
    """
    data_list = []
    for item in data.split('^'):
        # maxsplit=1: only the first underscore separates code from name.
        data_list.extend(item.split('_', 1))
    return data_list

In [6]:
# Build one {'umls': ..., 'disease': ...} record per disease named in the column.
disease_umls_code_pair = []
for raw_entry in data['Disease']:
    # Skip placeholder cells. Under Python 3 a non-breaking space is the single
    # character "\xa0", which strip() removes; the original "\xc2\xa0" literal
    # (its UTF-8 bytes written as str escapes) is still accepted as well.
    if raw_entry == "\xc2\xa0" or not raw_entry.strip():
        continue

    # process_name returns the pieces flattened as [code, name, code, name, ...].
    pieces = process_name(data=raw_entry)
    disease_codes = pieces[::2]
    disease_names = pieces[1::2]

    for name, code in zip(disease_names, disease_codes):
        # Codes look like "UMLS:C0020538"; keep only the part after the colon.
        entry = {'umls': code.split(':')[1], 'disease': name}
        disease_umls_code_pair.append(entry)

disease_umls_code_pair

[{'umls': 'C0020538', 'disease': 'hypertensive disease'},
 {'umls': 'C0011847', 'disease': 'diabetes'},
 {'umls': 'C0011570', 'disease': 'depression mental'},
 {'umls': 'C0011581', 'disease': 'depressive disorder'},
 {'umls': 'C0010054', 'disease': 'coronary arteriosclerosis'},
 {'umls': 'C0010068', 'disease': 'coronary heart disease'},
 {'umls': 'C0032285', 'disease': 'pneumonia'},
 {'umls': 'C0018802', 'disease': 'failure heart congestive'},
 {'umls': 'C0038454', 'disease': 'accident cerebrovascular'},
 {'umls': 'C0004096', 'disease': 'asthma'},
 {'umls': 'C0027051', 'disease': 'myocardial infarction'},
 {'umls': 'C0020443', 'disease': 'hypercholesterolemia'},
 {'umls': 'C0021311', 'disease': 'infection'},
 {'umls': 'C0042029', 'disease': 'infection urinary tract'},
 {'umls': 'C0002871', 'disease': 'anemia'},
 {'umls': 'C0024117', 'disease': 'chronic obstructive airway disease'},
 {'umls': 'C0497327', 'disease': 'dementia'},
 {'umls': 'C1565489', 'disease': 'insufficiency renal'},
 {

Create a data frame of UMLS code and disease name pairs.

In [7]:
# Name the columns explicitly so the frame always has 'umls' and 'disease',
# in that order, even when no pairs were extracted.
disease_code_df = pd.DataFrame(disease_umls_code_pair, columns=['umls', 'disease'])
disease_code_df

Unnamed: 0,umls,disease
0,C0020538,hypertensive disease
1,C0011847,diabetes
2,C0011570,depression mental
3,C0011581,depressive disorder
4,C0010054,coronary arteriosclerosis
...,...,...
144,C1258215,ileus
145,C0001511,adhesion
146,C0011253,delusion
147,C0233472,affect labile


Save the processed data to a CSV file.

In [8]:
# Destination for the processed pairs; the row index is left out of the file.
CSV_FILEPATH = 'dataset/disease-umls-code_pairs.csv'
disease_code_df.to_csv(path_or_buf=CSV_FILEPATH, index=False)