In [1]:
import os

import requests
from bs4 import BeautifulSoup
import re
import json

In [2]:
os.chdir(os.path.join(os.path.expanduser("~"),'hypercardio'))

In [3]:
BASE_PATH = 'https://www.drugs.com'
EFFECT_MAPPING = {'int_3': 'Major','int_2': 'Moderate','int_0':'Unknown','int_1':'Minor' }
ID_REGEX = re.compile('(?<=\w-)\d+-\d+-\d+-\d+(?=\.html)')

In [4]:
with open('data/links_to_drugs.json','r') as fp:
    links_general = json.load(fp)

In [5]:
from functools import wraps

def write_file(filename: str, soup: BeautifulSoup) -> None:
    with open(os.path.join('data/scraped',filename),'w+') as fp:
        fp.write(str(soup))

def read_file(filename: str) -> BeautifulSoup:
    with open(os.path.join('data/scraped',filename),'r+') as fp:
        soup = BeautifulSoup(fp, 'html.parser')
    return soup


def log_soup(f):
    @wraps(f)
    def wrap(*args, **kwargs):
        url = kwargs['url']
        temp_url = url.replace('/','$')
        if temp_url not in os.listdir('data/scraped'):
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            write_file(temp_url,soup)
        else:
            soup = read_file(temp_url)
        result = f(*args, **kwargs,soup=soup)
        return result
    return wrap

In [6]:
def extract_value(line: str):
    values = re.findall(r'(?<=>)([\w\s]+)(?=<\/)',line)
    if values != []:
        return values
    line = BeautifulSoup(line.replace('(','[')).text
    if '[' in line:
        return re.search(r'^([\w\s]+)(?=(\[))',line).group().strip(' ')
    return line

In [7]:
def get_subtitle_info(soup: BeautifulSoup) -> dict:
    info = {}
    subtitle = (soup.find("p", {"class": "drug-subtitle"}))
    attributes_of_interest = ['Generic name:','Dosage form:','Drug class:']
    for line in str(subtitle).split('\n')[1:]:
        attribute = BeautifulSoup(line).find('b').text
        if attribute in attributes_of_interest:

            attribute_match = re.search(r'(?<=<b>).*(?=:</b>)',line)
            attribute = attribute_match.group()
            value_line = line[attribute_match.span()[1]+6:]
            info[attribute] = extract_value(value_line)
    return info

In [8]:
def get_sidebar_info(soup: BeautifulSoup) -> dict:

    sidebar_soup = (soup.find("div", {"id": "sidebar"}))
    availability = (sidebar_soup
                    .find('div',{'class':'ddc-accordion'})
                    .find('div',{'class':'ddc-accordion-heading'})
                    .find_all('span')[0].text)
    try:
        rating = (sidebar_soup
                        .find('div',{'class':'ddc-rating-summary'})
                  )
        score = rating.find('div',{'class':'ddc-rating-bar'}).find('span')['style']
        score = re.search(r'(?<=width: ).*(?=%;)',score).group()
        reviews = re.search(r'\d+',rating.find('em').text).group()
    except:
        score = None
        reviews = None
    info = {'availability':availability,
            'score':score,
            'reviews':reviews}
    return info

In [9]:
def get_description(soup: BeautifulSoup) -> dict:
    pre_desc_header = (soup.find("h2", {"id": "uses"}))
    description = pre_desc_header.findNext('p').text
    side_effects = soup.find('h2',{'id':'side-effects'}).findNext('ul').find_all('li')
    processed_side_effects = []
    for side_effect in side_effects:
        side_effect = side_effect.text.replace(';','').replace(' or','').strip(' ')
        side_effect = re.sub(r'-.+','',side_effect)
        processed_side_effects.append(side_effect.strip(' ').replace('.',''))
    return {'description':description,
            'side_effects': processed_side_effects}

In [10]:
def scrape_tab_urls(soup: BeautifulSoup) -> dict:
    tab_href = (soup
     .find_all('a',href=True))

    interactions_url = [i for i in tab_href if 'drug-interactions' in i['href']][0]
    return {'interactions_url': interactions_url['href'].replace('.html','-index.html')}


In [11]:
@log_soup
def get_general_info(soup: BeautifulSoup,url=None) -> dict:
    info = {}
    info.update(scrape_tab_urls(soup))
    info.update(get_subtitle_info(soup))
    info.update(get_sidebar_info(soup))
    info.update(get_description(soup))
    return info

In [12]:
@log_soup
def get_drugs_interactions(soup: BeautifulSoup, url=None) -> dict:
    medicine_interactions_soup =soup.find_all('ul',{'class':'interactions ddc-list-column-2'})
    medicine_interactions = []
    for index,text_box in enumerate(medicine_interactions_soup):
        for medicine_soup in text_box:
            if medicine_soup !='\n':
                if index != len(medicine_interactions_soup)-1: # last text_box contains informations about diseases interactions
                    inner_html = medicine_soup.find('a')
                    medicine_interactions.append({
                        'class': EFFECT_MAPPING[medicine_soup['class'][0]],
                        'name': inner_html.text,
                        'parent_id': '-'.join(ID_REGEX.search(inner_html['href']).group().split('-')[2:]),
                        'id': '-'.join(ID_REGEX.search(inner_html['href']).group().split('-')[:2])
                    })
    return medicine_interactions

@log_soup
def get_diseases_interactions(soup: BeautifulSoup, url=None) -> dict:
    diseases_interactions = []
    diseases_interactions_soup = (soup
        .find_all('ul',{'class':'interactions ddc-list-column-2'})[-1]
        .find_all('li'))
    for diseases_soup in diseases_interactions_soup:
        diseases_interactions.append({
            'name':diseases_soup.find('a').text,
            'class': EFFECT_MAPPING[diseases_soup['class'][0]]
        })

    return diseases_interactions


In [13]:
errors = []

In [14]:
from tqdm.auto import tqdm

In [15]:
import time
import numpy as np

In [16]:
general_info_list = []
drug_interactions_list = []
diseases_interactions_list = []
scraped_drugs = []
pbar = tqdm(links_general)
for link in pbar:
    if 'pro' not in links_general[link]:
        sleep_time = np.random.randint(0,10)/10
        try:
            general_info: dict = get_general_info(url=BASE_PATH+ links_general[link])
            drug_interactions: list = get_drugs_interactions(url=BASE_PATH+general_info['interactions_url'])
            diseases_interactions: list = get_diseases_interactions(url=BASE_PATH+general_info['interactions_url'])
            general_info_list.append(general_info)
            drug_interactions_list.append(drug_interactions)
            diseases_interactions_list.append(diseases_interactions)
            scraped_drugs.append(link)
        except:
            errors.append(link)
            pbar.set_postfix({'num_errors': len(errors)})

  0%|          | 0/1187 [00:00<?, ?it/s]

In [17]:
for i in range(len(scraped_drugs)):
    general_info_list[i]['drug name'] = scraped_drugs[i]

In [18]:
with open('data/general_info.json','w') as fp:
    json.dump(general_info_list,fp)

In [19]:
general_info_list

[{'interactions_url': '/drug-interactions/aripiprazole,abilify-index.html',
  'Generic name': ['aripiprazole'],
  'Drug class': ['Atypical antipsychotics'],
  'availability': 'Prescription only',
  'score': '59.8',
  'reviews': '1',
  'description': 'Abilify is an antipsychotic medication. It works by changing the actions of chemicals in the brain.',
  'side_effects': ['\nsevere agitation, distress, restless feeling\n',
   '\ntwitching uncontrollable movements of your eyes, lips, tongue, face, arms, legs\n',
   '\nmask\n',
   '\nseizure (convulsions)\n',
   '\nsevere nervous system reaction \n',
   '\nlow blood cell counts \n',
   '\nhigh blood sugar \n'],
  'drug name': 'Abilify'},
 {'interactions_url': '/drug-interactions/aripiprazole,abilify-maintena-index.html',
  'Generic name': ['aripiprazole'],
  'Dosage form': 'extended-release intramuscular injection',
  'Drug class': ['Atypical antipsychotics'],
  'availability': 'Prescription only',
  'score': '61.5',
  'reviews': '49',
  'd

In [20]:
len(scraped_drugs)

910

In [21]:
diseases_interactions_list_with_key = []
for i in range(len(scraped_drugs)):
    temp = {}
    temp['drug name'] = scraped_drugs[i]
    temp['diseases_interactions'] = diseases_interactions_list[i]
    diseases_interactions_list_with_key.append(temp)

In [22]:
with open('data/diseases_interactions.json','w') as fp:
    json.dump(diseases_interactions_list_with_key,fp)

In [23]:
drug_interactions_list_with_key = []
for i in range(len(scraped_drugs)):
    temp = {}
    temp['drug name'] = scraped_drugs[i]
    temp['drug_interactions'] = drug_interactions_list[i]
    drug_interactions_list_with_key.append(temp)

In [24]:
with open('data/drug_interactions.json','w') as fp:
    json.dump(drug_interactions_list_with_key,fp)

In [25]:
drug_interactions_list_with_key[0]

{'drug name': 'Abilify',
 'drug_interactions': [{'class': 'Moderate',
   'name': 'abametapir topical',
   'parent_id': '233-109',
   'id': '4171-0'},
  {'class': 'Moderate',
   'name': 'abarelix',
   'parent_id': '233-109',
   'id': '5-0'},
  {'class': 'Moderate',
   'name': 'abiraterone',
   'parent_id': '3321-0',
   'id': '233-109'},
  {'class': 'Moderate',
   'name': 'acarbose',
   'parent_id': '9-0',
   'id': '233-109'},
  {'class': 'Moderate',
   'name': 'acebutolol',
   'parent_id': '10-0',
   'id': '233-109'},
  {'class': 'Moderate',
   'name': 'acetohexamide',
   'parent_id': '93-0',
   'id': '233-109'},
  {'class': 'Moderate',
   'name': 'acetylcarbromal',
   'parent_id': '95-0',
   'id': '233-109'},
  {'class': 'Moderate',
   'name': 'acetylcholine ophthalmic',
   'parent_id': '96-0',
   'id': '233-109'},
  {'class': 'Moderate',
   'name': 'aclidinium',
   'parent_id': '3406-0',
   'id': '233-109'},
  {'class': 'Moderate',
   'name': 'acrivastine',
   'parent_id': '2875-0',
 

In [26]:
drugs_interactions_list

NameError: name 'drugs_interactions_list' is not defined

In [None]:
with open('data/drug_interactions.json','w') as fp:
    json.dump(drug_interactions_list_with_key,fp)

In [45]:
drug_interactions_list_with_key[0]

{'drug name': 'Abilify',
 'drug_interactions': [{'class': 'Moderate',
   'name': 'abametapir topical',
   'id': '233-109'},
  {'class': 'Moderate', 'name': 'abarelix', 'id': '233-109'},
  {'class': 'Moderate', 'name': 'abiraterone', 'id': '3321-0'},
  {'class': 'Moderate', 'name': 'acarbose', 'id': '9-0'},
  {'class': 'Moderate', 'name': 'acebutolol', 'id': '10-0'},
  {'class': 'Moderate', 'name': 'acetohexamide', 'id': '93-0'},
  {'class': 'Moderate', 'name': 'acetylcarbromal', 'id': '95-0'},
  {'class': 'Moderate', 'name': 'acetylcholine ophthalmic', 'id': '96-0'},
  {'class': 'Moderate', 'name': 'aclidinium', 'id': '3406-0'},
  {'class': 'Moderate', 'name': 'acrivastine', 'id': '2875-0'},
  {'class': 'Moderate', 'name': 'albiglutide', 'id': '3524-0'},
  {'class': 'Moderate', 'name': 'albuterol', 'id': '109-0'},
  {'class': 'Major', 'name': 'alfentanil', 'id': '118-0'},
  {'class': 'Moderate', 'name': 'alfuzosin', 'id': '119-0'},
  {'class': 'Moderate', 'name': 'alogliptin', 'id': '3

In [43]:
drugs_interactions_list

NameError: name 'drugs_interactions_list' is not defined