<a href="https://www.kaggle.com/code/neeradh/webmd-drug-specific-review-scraper?scriptVersionId=202907778" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import numpy as np
import regex as re

In [2]:
# Browser-like request headers (a desktop Firefox-on-Linux user agent plus the
# usual negotiation and fetch-metadata fields) so the server is less likely to
# reject the scraper's requests. Order is kept as a list of pairs.
headers = dict([
    ('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
    ('Accept-Language', 'en-US,en;q=0.5'),
    ('Accept-Encoding', 'gzip, deflate, br'),
    ('Connection', 'keep-alive'),
    ('Upgrade-Insecure-Requests', '1'),
    ('DNT', '1'),  # Do Not Track
    ('Sec-Fetch-Dest', 'document'),
    ('Sec-Fetch-Mode', 'navigate'),
    ('Sec-Fetch-Site', 'none'),
    ('Sec-Fetch-User', '?1'),
    ('TE', 'Trailers'),
])

In [3]:
# Define the columns for the DataFrame.
# Every entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which would silently merge the last three names
# into a single bogus column.
cols = [
    'drug_name',
    'date',
    'age',
    'gender',
    'time_on_drug',
    'reviewer_type',
    'condition',
    'rating_overall',
    'rating_effectiveness',
    'rating_ease_of_use',
    'rating_satisfaction',
    'text',
    'thumbs_up',
    'thumbs_down',
]

In [4]:
# Start from a frame with no rows and only the declared column labels;
# the scraped lisinopril reviews are accumulated into it later.
reviews_df = pd.DataFrame(data=None, columns=cols)

In [5]:
# Define the WebMD review-listing URL for the specific drug (lisinopril, oral).
# This is the base URL; get_soup appends a `?page=N` query string to it.
lisinopril_url = 'https://reviews.webmd.com/drugs/drugreview-6873-lisinopril-oral'

In [6]:
# Download one page of reviews and hand it back as a parsed soup
def get_soup(review_url, page, headers):
    '''Fetches page number `page` of `review_url` and parses it with lxml.

    The page number is appended to the URL as a `?page=` query string and the
    request is made with the supplied `headers` and a 10 second timeout.

    Returns the BeautifulSoup document, or None (after printing the error)
    when the request fails or the server answers with an HTTP error status.
    '''
    page_url = review_url + f'?page={page}'
    try:
        reply = requests.get(page_url, headers=headers, timeout=10)
        reply.raise_for_status()  # turn 4xx/5xx answers into exceptions
        html = reply.content
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {page}: {e}")
        return None
    else:
        return BeautifulSoup(html, 'lxml')

In [7]:
def regex_date(review):
    '''Parses the date of the review.

    Returns the first `digits/digits/digits` token found in the review's
    `div.date` element, as a string, or NaN when the element or the token is
    missing (instead of raising), matching the other parsers in this notebook.

    NOTE(review): the day/month order is whatever the site renders (the
    original author assumed dd/mm/yyyy) - confirm before converting to dates.
    '''
    date_element = review.find('div', class_='date')
    if date_element is None:
        return np.nan
    date_match = re.findall(r'\d+/\d+/\d+', date_element.text)
    return date_match[0] if len(date_match) > 0 else np.nan

def regex_condition(review):
    '''Parses the condition for which the medication is used.

    Takes everything that follows the "Condition:" label in the review's
    `strong.condition` element, so names containing quotes or punctuation
    (e.g. '"Change of Life" Signs') are kept whole rather than dropped or
    truncated at the first non-word character. Runs of whitespace are
    collapsed to single spaces.

    Returns the condition string, or NaN when the element is absent or
    nothing follows the label.
    '''
    condition_element = review.find('strong', class_='condition')
    if condition_element is None:
        return np.nan
    # Capture from the first to the last non-whitespace character after the label.
    condition_match = re.search(r'Condition:\s*(.*\S)', condition_element.text, flags=re.DOTALL)
    if condition_match is None:
        return np.nan
    return ' '.join(condition_match.group(1).split())

def regex_rating_overall(review):
    '''Parses the overall rating, the average of 3 categories.

    Reads the `<strong>` text inside the review's `div.overall-rating` and
    returns the first number in it as a float (a literal decimal point is
    required between the integer and fractional digits; a bare integer is
    accepted too). Returns NaN when the element or the number is missing,
    so the caller always receives a single scalar for the column.
    '''
    rating_element = review.find('div', class_='overall-rating')
    rating_strong = rating_element.strong if rating_element is not None else None
    if rating_strong is None:
        return np.nan
    rating_match = re.search(r'\d+(?:\.\d+)?', rating_strong.text)
    return float(rating_match.group(0)) if rating_match is not None else np.nan

def regex_rating_category(review, ind_cat):
    '''Reads one per-category rating of a review as an integer.

    `ind_cat` selects the `<section>` inside the review's `div.categories`;
    the caller uses 0, 1, 2 for ['effectiveness', 'ease_of_use', 'satisfaction'].
    The value comes from the `aria-valuenow` attribute of that section's
    rating widget.
    '''
    category_sections = review.find('div', class_='categories').find_all('section')
    chosen_section = category_sections[ind_cat]
    rating_widget = chosen_section.find('div', class_='webmd-rate on-mobile')
    raw_value = rating_widget.get('aria-valuenow')
    return int(raw_value)

def regex_text(review):
    '''Returns the free-text body of a review (`p.description-text`), or NaN if the review has none'''
    description = review.find('p', class_='description-text')
    if description is None:
        return np.nan
    return description.text
def regex_age(details):
    '''Extracts the age range of the medication user from the details line.

    Looks for the first `digits-digits` token that directly follows a pipe
    and whitespace; returns it as a string, or NaN when there is none.
    The variable-width lookbehind relies on the `regex` package that this
    notebook imports under the name `re`.
    '''
    age_ranges = re.findall(r'(?<=\|\s+)\d+-\d+', details)
    if not age_ranges:
        return np.nan
    return age_ranges[0]

def regex_gender(details):
    '''Parses the gender of the medication user.

    Returns 'Male' or 'Female' for the first such word that appears as a
    field start in the pipe-separated details line (i.e. right after a pipe
    and whitespace), or NaN when there is none.

    The alternation is grouped so the "after a pipe" requirement applies to
    both words; ungrouped, 'Female' would match anywhere in the line (for
    example inside a reviewer name) and could win over the real field.
    '''
    gender_match = re.findall(r'\|\s+(Male|Female)\b', details)
    return gender_match[0] if len(gender_match) > 0 else np.nan

def regex_time(details):
    '''Extracts how long the reviewer has been on the drug.

    Returns the run of whitespace-separated words that follows
    "On medication for " in the details line, or NaN if that phrase is absent.
    '''
    durations = re.findall(r'(?<=On\smedication\sfor\s)\w+(?:\s\w+)*', details)
    if durations:
        return durations[0]
    return np.nan

def regex_reviewer(details):
    '''Extracts the reviewer type from the end of the details line.

    Matches a run of whitespace-separated words that follows a pipe and
    whitespace and extends to the end of the string (trailing whitespace is
    ignored); returns the first such match, or NaN when there is none.
    The variable-width lookbehind relies on the `regex` package that this
    notebook imports under the name `re`.
    '''
    trailing_fields = re.findall(r'(?<=\|\s+)\w+(?:\s\w+)*(?=\s*$)', details)
    if not trailing_fields:
        return np.nan
    return trailing_fields[0]

def regex_thumb_up(review):
    '''Returns the thumbs-up count of a review as an int (0 when no count element exists)'''
    # The count is read from the review's span with the 'likes' class.
    likes = review.find('span', class_='likes')
    if likes is None:
        return 0
    count_text = likes.text.strip()  # drop surrounding whitespace before converting
    return int(count_text)

def regex_thumb_down(review):
    '''Returns the thumbs-down count of a review as an int (0 when no count element exists)'''
    # Assumes the count lives in a span with the 'dislikes' class - TODO confirm against the page markup.
    dislikes = review.find('span', class_='dislikes')
    if dislikes is None:
        return 0
    count_text = dislikes.text.strip()  # drop surrounding whitespace before converting
    return int(count_text)



In [8]:
# Parse individual review pages
def parse_reviews_page(soup, reviews_df, drug_name='Lisinopril'):
    '''Parses every review on one page and appends them to `reviews_df`.

    Parameters
    ----------
    soup : parsed page; each `div.review-details` element is one review.
    reviews_df : DataFrame of the reviews collected so far (not modified).
    drug_name : value stored in the 'drug_name' column of every row;
        defaults to 'Lisinopril', the only drug this notebook scrapes.

    Returns
    -------
    A new DataFrame holding the rows of `reviews_df` followed by one row per
    review found on the page, or `reviews_df` itself when the page has none.
    '''
    rating_categories = ['effectiveness', 'ease_of_use', 'satisfaction']
    records = []

    for review in soup.find_all('div', class_='review-details'):
        # A review without a details line still yields a row; the
        # details-based fields simply come out as NaN.
        details_element = review.find('div', class_='details')
        details = details_element.text if details_element is not None else ''

        # Accept either a single value or a list of regex matches here, so
        # the column always receives one scalar per review.
        rating_overall = regex_rating_overall(review)
        if isinstance(rating_overall, list):
            rating_overall = rating_overall[0] if rating_overall else np.nan

        record = {
            'drug_name': drug_name,
            'date': regex_date(review),
            'age': regex_age(details),
            'gender': regex_gender(details),
            'time_on_drug': regex_time(details),
            'reviewer_type': regex_reviewer(details),
            'condition': regex_condition(review),
            'rating_overall': rating_overall,
        }
        for ind_cat, cat in enumerate(rating_categories):
            record[f'rating_{cat}'] = regex_rating_category(review, ind_cat)
        # The free text goes in the 'text' column, as named in the column list.
        record['text'] = regex_text(review)
        record['thumbs_up'] = regex_thumb_up(review)
        record['thumbs_down'] = regex_thumb_down(review)
        records.append(record)

    if not records:
        return reviews_df

    # Build the page's rows in one go and concatenate once, instead of
    # re-copying the accumulated frame for every single review.
    return pd.concat([reviews_df, pd.DataFrame(records)], ignore_index=True)

In [9]:
# Crawl all review pages for lisinopril
def crawl_reviews_pages(review_url, reviews_df):
    '''Scrapes every review page under `review_url` into `reviews_df`.

    The first page is fetched once and used both to discover the number of
    pages (from the last `li.page-item` of `ul.pagination`) and as the
    content of page 1, so it is not downloaded a second time. Requests use
    the module-level `headers`.

    Returns the DataFrame with all parsed reviews appended. `reviews_df` is
    returned unchanged if the first page cannot be fetched or the page count
    cannot be parsed; pages that fail to download are skipped with a message.
    '''
    first_soup = get_soup(review_url, 1, headers)
    if first_soup is None:
        print("Failed to fetch the first page. Aborting crawl.")
        return reviews_df

    pages = first_soup.find('ul', class_='pagination')

    # Check if pagination exists or it's a single page of reviews
    if pages is None:
        print("No pagination found, scraping a single page.")
        return parse_reviews_page(first_soup, reviews_df)

    # Find total number of pages
    try:
        last_page = int(pages.find_all('li', class_='page-item')[-1].text.strip())
    except (ValueError, IndexError) as e:
        print(f"Failed to parse the total number of pages: {e}")
        return reviews_df

    # Iterate over each page, reusing the already-downloaded first page
    for i in tqdm(range(1, last_page + 1)):
        soup = first_soup if i == 1 else get_soup(review_url, i, headers)
        if soup is None:
            print(f"Skipping page {i} due to failed request.")
            continue
        reviews_df = parse_reviews_page(soup, reviews_df)

    return reviews_df

In [10]:
def compile_reviews(reviews_df, review_url, output_path='lisinopril_drug_webmd_reviews.csv'):
    '''Crawls all review pages and saves the result as a CSV file.

    Parameters
    ----------
    reviews_df : DataFrame to append the scraped reviews to.
    review_url : base URL of the drug's review listing.
    output_path : where the CSV is written (without the index column);
        defaults to the file name this notebook has always used.

    Returns the DataFrame containing the scraped reviews.
    '''
    reviews_df = crawl_reviews_pages(review_url, reviews_df)
    reviews_df.to_csv(output_path, index=False)
    print(f"Lisinopril reviews saved to {output_path}")
    return reviews_df
# Start every run from a fresh, empty frame so that re-executing this cell
# does not append a second copy of all reviews to the previous result.
reviews_df = compile_reviews(pd.DataFrame(columns=cols), lisinopril_url)

100%|██████████| 217/217 [04:51<00:00,  1.34s/it]

Lisinopril reviews saved to lisinopril_drug_webmd_reviews.csv



