Importing the necessary libraries.

In [1]:
import csv
import numpy as np
import pandas as pd
from collections import defaultdict
import requests
from bs4 import BeautifulSoup

Load the CSV of disease name and UMLS code pairs.

In [2]:
# Location of the input CSV; its 'umls' and 'disease' columns are read by later cells.
CSV_FILEPATH = 'dataset/disease-umls-code_pairs.csv'

# Parse the file into a data frame and show it as the cell's output.
data = pd.read_csv(filepath_or_buffer=CSV_FILEPATH)
data

Unnamed: 0,umls,disease
0,C0020538,hypertensive disease
1,C0011847,diabetes
2,C0011570,depression mental
3,C0011581,depressive disorder
4,C0010054,coronary arteriosclerosis
...,...,...
144,C1258215,ileus
145,C0001511,adhesion
146,C0011253,delusion
147,C0233472,affect labile


Extract the entries to lists.

In [3]:
# Pull the two columns out as plain Python lists, kept in row order so that
# disease_names[i] and disease_codes[i] describe the same row of `data`.
disease_names, disease_codes = (
    data[column].tolist() for column in ('disease', 'umls')
)

Scrape <a href="https://www.ncbi.nlm.nih.gov/medgen/">NCBI MedGen</a> for a description of each disease, looked up by its UMLS code.

In [4]:
ROOT_URL = 'https://www.ncbi.nlm.nih.gov/medgen/'
# Upper bound (in seconds) on how long a single request may wait for the server.
# Without a timeout, one stalled connection would hang the whole scraping run.
REQUEST_TIMEOUT_SECONDS = 30


def _fetch_disease_entry(session, disease_name, term):
    """Look up one UMLS code on MedGen and return its scraped record.

    Parameters
    ----------
    session : requests.Session
        Session used to issue the GET request.
    disease_name : str
        Disease name, copied into the record unchanged.
    term : str
        UMLS code used as the ``term`` query parameter.

    Returns
    -------
    dict
        Keys ``disease``, ``code``, ``description``, ``source_name``,
        ``source_link`` and ``root_url``. The three scraped fields are ``None``
        when the request fails, the server answers with an error status, or
        the page has no ``div`` with class ``portlet_content ln``.
    """
    scraped_url = "{}?term={}".format(ROOT_URL, term)
    description = source_name = source_link = None

    try:
        response = session.get(scraped_url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as error:
        # Best effort: report the failure and keep going so that one bad
        # request does not discard the results collected so far.
        print("Request failed for {} ({}): {}".format(term, disease_name, error))
    else:
        soup = BeautifulSoup(response.content, 'html5lib')
        disease_info = soup.find("div", {"class": "portlet_content ln"})
        if disease_info is not None:
            # Replace non-breaking spaces so the text contains regular spaces only.
            description = disease_info.text.replace('\xa0', ' ')
            # The first link inside the block is recorded as the source.
            source_anchor = disease_info.find("a")
            if source_anchor is not None:
                source_name = source_anchor.text
                # .get() yields None rather than raising if the anchor has no href.
                source_link = source_anchor.get('href')

    return {
        "disease": disease_name,
        "code": term,
        "description": description,
        "source_name": source_name,
        "source_link": source_link,
        "root_url": scraped_url
    }


disease_data = []

# A single session is shared by all requests so connections can be reused.
with requests.Session() as session:
    for disease_name, term in zip(disease_names, disease_codes):
        # Appending one entry at a time keeps partial results if the run is interrupted.
        disease_data.append(_fetch_disease_entry(session, disease_name, term))


Preview of the list of dictionaries generated by scraping.

In [5]:
# Show only the first few entries; displaying the whole list floods the notebook output.
disease_data[:5]

[{'disease': 'hypertensive disease',
  'code': 'C0020538',
  'description': 'The presence of chronic increased pressure in the systemic arterial system. [from HPO]',
  'source_name': 'HPO',
  'source_link': 'http://www.human-phenotype-ontology.org',
  'root_url': 'https://www.ncbi.nlm.nih.gov/medgen/?term=C0020538'},
 {'disease': 'diabetes',
  'code': 'C0011847',
  'description': None,
  'source_name': None,
  'source_link': None,
  'root_url': 'https://www.ncbi.nlm.nih.gov/medgen/?term=C0011847'},
 {'disease': 'depression mental',
  'code': 'C0011570',
  'description': None,
  'source_name': None,
  'source_link': None,
  'root_url': 'https://www.ncbi.nlm.nih.gov/medgen/?term=C0011570'},
 {'disease': 'depressive disorder',
  'code': 'C0011581',
  'description': 'Frequent feelings of being down, miserable, and/or hopeless; difficulty recovering from such moods; pessimism about the future; pervasive shame; feeling of inferior self-worth; thoughts of suicide and suicidal behavior. [from 

Convert the list of dictionaries into a data frame.

In [6]:
# Each scraped dictionary becomes one row; the dictionary keys become the columns.
disease_code_df = pd.DataFrame(data=disease_data)
disease_code_df

Unnamed: 0,disease,code,description,source_name,source_link,root_url
0,hypertensive disease,C0020538,The presence of chronic increased pressure in ...,HPO,http://www.human-phenotype-ontology.org,https://www.ncbi.nlm.nih.gov/medgen/?term=C002...
1,diabetes,C0011847,,,,https://www.ncbi.nlm.nih.gov/medgen/?term=C001...
2,depression mental,C0011570,,,,https://www.ncbi.nlm.nih.gov/medgen/?term=C001...
3,depressive disorder,C0011581,"Frequent feelings of being down, miserable, an...",HPO,http://www.human-phenotype-ontology.org,https://www.ncbi.nlm.nih.gov/medgen/?term=C001...
4,coronary arteriosclerosis,C0010054,Reduction of the diameter of the coronary arte...,HPO,http://www.human-phenotype-ontology.org,https://www.ncbi.nlm.nih.gov/medgen/?term=C001...
...,...,...,...,...,...,...
144,ileus,C1258215,Acute obstruction of the intestines preventing...,HPO,http://www.human-phenotype-ontology.org,https://www.ncbi.nlm.nih.gov/medgen/?term=C125...
145,adhesion,C0001511,A fibrous band of tissue that connects normall...,NCI,http://ncit.nci.nih.gov,https://www.ncbi.nlm.nih.gov/medgen/?term=C000...
146,delusion,C0011253,A false belief that is held despite evidence t...,HPO,http://www.human-phenotype-ontology.org,https://www.ncbi.nlm.nih.gov/medgen/?term=C001...
147,affect labile,C0233472,Emotional instability characterized by rapid a...,NCI,http://ncit.nci.nih.gov,https://www.ncbi.nlm.nih.gov/medgen/?term=C023...


Save the data frame to a CSV file.

In [7]:
# Destination of the scraped descriptions (this rebinds CSV_FILEPATH to the output path).
CSV_FILEPATH = 'dataset/disease-description.csv'

# index=False leaves the data frame's row index out of the written file.
disease_code_df.to_csv(path_or_buf=CSV_FILEPATH, index=False)