Importing the necessary libraries.

In [1]:
import csv
import numpy as np
import pandas as pd
from collections import defaultdict

Load the scraped dataset CSV file into a data frame.

In [2]:
# Location of the scraped dataset, relative to the notebook's working directory.
CSV_FILEPATH = 'dataset/raw_data.csv'
data = pd.read_csv(CSV_FILEPATH)
# Collapse every run of two or more spaces into a single space.
# A literal two-space pattern only halves a run (three or four consecutive
# spaces would still leave a double space behind), so match the whole run.
data = data.replace({r" {2,}": " "}, regex=True)
data

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall
...,...,...,...
1861,,,UMLS:C0425251_bedridden^UMLS:C0741453_bedridden
1862,,,UMLS:C0242453_prostatism
1863,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0232257_systolic murmur
1864,,,UMLS:C0871754_frail


Narrow the dataset to the `Symptom` column.

In [3]:
# Keep only the Symptom column; the list selector keeps the result a DataFrame.
data = data.loc[:, ['Symptom']]
data

Unnamed: 0,Symptom
0,UMLS:C0008031_pain chest
1,UMLS:C0392680_shortness of breath
2,UMLS:C0012833_dizziness
3,UMLS:C0004093_asthenia
4,UMLS:C0085639_fall
...,...
1861,UMLS:C0425251_bedridden^UMLS:C0741453_bedridden
1862,UMLS:C0242453_prostatism
1863,UMLS:C0232257_systolic murmur
1864,UMLS:C0871754_frail


Drop the rows that contain null values.

In [4]:
# Discard every row that has at least one missing value.
data = data.dropna(axis='index', how='any')
data

Unnamed: 0,Symptom
0,UMLS:C0008031_pain chest
1,UMLS:C0392680_shortness of breath
2,UMLS:C0012833_dizziness
3,UMLS:C0004093_asthenia
4,UMLS:C0085639_fall
...,...
1861,UMLS:C0425251_bedridden^UMLS:C0741453_bedridden
1862,UMLS:C0242453_prostatism
1863,UMLS:C0232257_systolic murmur
1864,UMLS:C0871754_frail


Define a helper that splits a raw entry (for example `UMLS:C0008031_pain chest`) into its UMLS-code and name tokens.

In [5]:
def process_name(data):
    """Split a raw scraped entry into its code / name tokens.

    An entry looks like ``UMLS:C0008031_pain chest``. Several code/name pairs
    may be chained with ``^``, e.g.
    ``UMLS:C0425251_bedridden^UMLS:C0741453_bedridden``. Both ``^`` and ``_``
    are treated as token separators.

    Parameters
    ----------
    data : str
        Raw cell text to split.

    Returns
    -------
    list of str
        All tokens in their original order. For well-formed entries the
        even positions hold the ``UMLS:<code>`` parts and the odd positions
        hold the matching names.
    """
    # Every token is kept: the former per-token guard `n % 1 == 0` was true
    # for every integer, so the counter and loop around it filtered nothing.
    return data.replace('^', '_').split('_')

In [6]:
# Placeholder cell values that carry no symptom information.
# "\xa0" is a non-breaking space as it appears in a Python 3 str.
# "\xc2\xa0" (the Python 2 byte-string spelling of the same character) is kept
# so that anything the earlier check skipped is still skipped.
BLANK_SYMPTOM_VALUES = ("", "\xa0", "\xc2\xa0")

symptom_umls_code_pair = []
for raw_symptom in data['Symptom']:
    if raw_symptom in BLANK_SYMPTOM_VALUES:
        continue

    # Tokens alternate: UMLS code, symptom name, UMLS code, symptom name, ...
    tokens = process_name(data=raw_symptom)
    symptom_codes = tokens[::2]
    symptom_names = tokens[1::2]

    for code, name in zip(symptom_codes, symptom_names):
        # Codes are written as "UMLS:<id>"; keep only the identifier part.
        entry = {'umls': code.split(':')[1], 'symptom': name}
        symptom_umls_code_pair.append(entry)

symptom_umls_code_pair

[{'umls': 'C0008031', 'symptom': 'pain chest'},
 {'umls': 'C0392680', 'symptom': 'shortness of breath'},
 {'umls': 'C0012833', 'symptom': 'dizziness'},
 {'umls': 'C0004093', 'symptom': 'asthenia'},
 {'umls': 'C0085639', 'symptom': 'fall'},
 {'umls': 'C0039070', 'symptom': 'syncope'},
 {'umls': 'C0042571', 'symptom': 'vertigo'},
 {'umls': 'C0038990', 'symptom': 'sweat'},
 {'umls': 'C0700590', 'symptom': 'sweating increased'},
 {'umls': 'C0030252', 'symptom': 'palpitation'},
 {'umls': 'C0027497', 'symptom': 'nausea'},
 {'umls': 'C0002962', 'symptom': 'angina pectoris'},
 {'umls': 'C0438716', 'symptom': 'pressure chest'},
 {'umls': 'C0032617', 'symptom': 'polyuria'},
 {'umls': 'C0085602', 'symptom': 'polydypsia'},
 {'umls': 'C0392680', 'symptom': 'shortness of breath'},
 {'umls': 'C0008031', 'symptom': 'pain chest'},
 {'umls': 'C0004093', 'symptom': 'asthenia'},
 {'umls': 'C0027497', 'symptom': 'nausea'},
 {'umls': 'C0085619', 'symptom': 'orthopnea'},
 {'umls': 'C0034642', 'symptom': 'ral

Create a dataframe for UMLS and symptom name pairs.

In [7]:
# One row per extracted (UMLS code, symptom name) pair.
# Naming the columns explicitly fixes the schema, so an empty pair list
# still produces a frame (and later a CSV) with the expected headers.
symptom_code_df = pd.DataFrame(symptom_umls_code_pair, columns=['umls', 'symptom'])
symptom_code_df

Unnamed: 0,umls,symptom
0,C0008031,pain chest
1,C0392680,shortness of breath
2,C0012833,dizziness
3,C0004093,asthenia
4,C0085639,fall
...,...,...
1902,C0741453,bedridden
1903,C0242453,prostatism
1904,C0232257,systolic murmur
1905,C0871754,frail


Save the processed data to a CSV file.

In [8]:
# Destination of the processed code/symptom table; the row index is not written.
CSV_FILEPATH = 'dataset/symptom-umls-code_pairs.csv'
symptom_code_df.to_csv(path_or_buf=CSV_FILEPATH, index=False)