In [10]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import re  # Import re module for regex operations

# Selenium WebDriver setup
# Chrome is configured first: headless mode (for better performance), with the
# GPU and the sandbox disabled.
options = webdriver.ChromeOptions()
for _chrome_flag in ("--headless", "--disable-gpu", "--no-sandbox"):
    options.add_argument(_chrome_flag)

# Replace with the path to your ChromeDriver
service = Service(
    r"C:\Users\Nishant shah\OneDrive\Desktop\Nishant\Software\chromedriver.exe"
)

# Single browser session shared by every scraping call below.
driver = webdriver.Chrome(options=options, service=service)

# Mapping of sheet name -> search URL.
# Each key is used as the worksheet name in the output workbook, and each value
# is the search-results URL scraped for that sheet. Every URL carries an
# `rpp=10` query parameter, which scrape_with_rpp rewrites for each request.
links = {
    "Anaesthetics": "https://www.finder.bupa.co.uk/Consultant/search/?order+=rel&ffeeAssured=1&fspecialityId%5B%5D=8&fgender%5B%5D=&fspokenLanguageId=&fhospitalNetworkId=&qk=&ql=&qlo=&qla=&qn=&order=&rpp=10&page=1&hospitalSwiftId=&fpathway=&giottoFormFlag_consultant=1#start",
    "Cardiology": "https://www.finder.bupa.co.uk/Consultant/search/?fspecialityId%5B%5D=30&qk=&ql=&qlo=&qla=&qn=&order=&rpp=10&page=1&hospitalSwiftId=&ffeeAssured=1&fexcovid=&fconsultantGroupId=&fhospitalNetworkId=&fpathway=&fgender%5B0%5D=&fhomeChemotherapy=&giottoFormFlag_consultant=1#start",
    "Trauma & Orthopaedic Surgery": "https://www.finder.bupa.co.uk/Consultant/search/?fspecialityId%5B%5D=153&qk=&ql=&qlo=&qla=&qn=&order=&rpp=10&page=1&hospitalSwiftId=&ffeeAssured=1&fexcovid=&fconsultantGroupId=&fhospitalNetworkId=&fpathway=&fgender%5B0%5D=&fhomeChemotherapy=&giottoFormFlag_consultant=1#start",
    "Physiotherapy": "https://www.finder.bupa.co.uk/Consultant/search/?fspecialityId%5B%5D=78&qk=&ql=&qlo=&qla=&qn=&order=&rpp=10&page=1&hospitalSwiftId=&ffeeAssured=1&fexcovid=&fconsultantGroupId=&fhospitalNetworkId=&fpathway=&fgender%5B0%5D=&fhomeChemotherapy=&giottoFormFlag_consultant=1#start",
}


# Function to extract data from the loaded HTML content
def extract_data(html_content):
    """Parse a results page and return one record per consultant listing.

    Args:
        html_content: HTML markup of a search-results page, e.g. the value of
            ``driver.page_source``.

    Returns:
        A list of dicts, one per ``<article class="listing">`` element, with
        the keys "Name", "Profession", "Specialties", "Phone", "Locations" and
        "Website". A missing name, profession, specialties paragraph, phone
        link or website link is reported as "N/A"; "Locations" is an empty
        string when the listing has no practices list.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    data = []

    for consultant in soup.find_all("article", class_="listing"):
        name_tag = consultant.find("h2")
        name = name_tag.get_text(strip=True) if name_tag is not None else "N/A"

        # When a listing has several <h3> headings, the last one is taken as
        # the profession.
        headings = consultant.find_all("h3")
        profession = (
            re.sub(r"\s+", " ", headings[-1].get_text(strip=True))
            if headings
            else "N/A"
        )

        # Look the paragraph up once instead of once for the check and once
        # for the value.
        about_tag = consultant.find("div", class_="listing-about")
        about_para = about_tag.find("p") if about_tag is not None else None
        specialties = (
            re.sub(r"\s+", " ", about_para.get_text(strip=True))
            if about_para is not None
            else "N/A"
        )

        # Tag.get() returns None when the attribute is absent; guard against
        # that so a phone link without `data-tel-href` yields "N/A" instead of
        # raising AttributeError on `.replace`.
        phone_tag = consultant.find("a", {"class": "phone"})
        tel_href = phone_tag.get("data-tel-href") if phone_tag is not None else None
        phone = tel_href.replace("tel:", "") if tel_href is not None else "N/A"

        practices_list_tag = consultant.find("div", class_="practices-list")
        practice_names = (
            [item.get_text(strip=True) for item in practices_list_tag.find_all("li")]
            if practices_list_tag is not None
            else []
        )
        locations = ", ".join(practice_names)

        website_tag = consultant.find("a", {"class": "launch-website"})
        website = website_tag.get("href") if website_tag is not None else "N/A"

        # Append cleaned data
        data.append(
            {
                "Name": name,
                "Profession": profession,  # Whitespace runs collapsed
                "Specialties": specialties,  # Whitespace runs collapsed
                "Phone": phone,
                "Locations": locations,
                "Website": website,
            }
        )
    return data


# Function to scrape data for all `rpp` values up to 250
def scrape_with_rpp(url, max_rpp=250):
    """Load ``url`` with growing results-per-page values and collect the records.

    The URL's ``rpp`` query parameter is set to 10, 20, ... up to ``max_rpp``,
    and each resulting page is loaded through the module-level ``driver`` and
    parsed with ``extract_data``. Only ``rpp`` changes between requests, so a
    larger page is expected to repeat the records of the smaller ones; each
    distinct record is therefore kept once, in first-seen order.

    Args:
        url: Search URL containing an ``rpp=<number>`` query parameter.
        max_rpp: Largest results-per-page value to request (inclusive).

    Returns:
        A list of unique record dicts as produced by ``extract_data``.
    """
    all_data = []
    seen_records = set()

    for rpp in range(10, max_rpp + 1, 10):  # Iterate through rpp = 10, 20, ..., max_rpp
        # Rewrite the whole `rpp` value rather than the literal "rpp=10", so a
        # URL that starts at another value (e.g. rpp=100) is handled correctly.
        updated_url = re.sub(r"(?<=[?&])rpp=\d+", f"rpp={rpp}", url)
        print(f"Scraping URL with rpp={rpp}: {updated_url}")
        driver.get(updated_url)
        time.sleep(3)  # Wait for page to load

        # Extract data from the current page
        data = extract_data(driver.page_source)
        if not data:  # Break if no data is returned
            print(f"No data found for rpp={rpp}. Stopping further requests.")
            break

        # Skip records already collected from a previous, smaller page.
        for record in data:
            record_key = tuple(record.items())
            if record_key not in seen_records:
                seen_records.add(record_key)
                all_data.append(record)

    return all_data


# Main execution: Iterate through all links and save data to Excel.
# try/finally guarantees the browser is shut down even if a scrape fails, so no
# Chrome process is left running in the background.
try:
    # The context manager saves and closes the workbook on exit, including when
    # an exception interrupts the loop.
    with pd.ExcelWriter("consultants_data.xlsx", engine="xlsxwriter") as excel_writer:
        for sheet_name, link in links.items():
            print(f"Processing {sheet_name}...")
            extracted_data = scrape_with_rpp(link, max_rpp=250)
            df = pd.DataFrame(extracted_data)
            # One worksheet per entry in `links`, named after its key.
            df.to_excel(excel_writer, sheet_name=sheet_name, index=False)

    print("Data extraction completed. Saved to 'consultants_data.xlsx'.")
finally:
    # Close the WebDriver
    driver.quit()

Processing Anaesthetics...
Scraping URL with rpp=10: https://www.finder.bupa.co.uk/Consultant/search/?order+=rel&ffeeAssured=1&fspecialityId%5B%5D=8&fgender%5B%5D=&fspokenLanguageId=&fhospitalNetworkId=&qk=&ql=&qlo=&qla=&qn=&order=&rpp=10&page=1&hospitalSwiftId=&fpathway=&giottoFormFlag_consultant=1#start
Scraping URL with rpp=20: https://www.finder.bupa.co.uk/Consultant/search/?order+=rel&ffeeAssured=1&fspecialityId%5B%5D=8&fgender%5B%5D=&fspokenLanguageId=&fhospitalNetworkId=&qk=&ql=&qlo=&qla=&qn=&order=&rpp=20&page=1&hospitalSwiftId=&fpathway=&giottoFormFlag_consultant=1#start
Scraping URL with rpp=30: https://www.finder.bupa.co.uk/Consultant/search/?order+=rel&ffeeAssured=1&fspecialityId%5B%5D=8&fgender%5B%5D=&fspokenLanguageId=&fhospitalNetworkId=&qk=&ql=&qlo=&qla=&qn=&order=&rpp=30&page=1&hospitalSwiftId=&fpathway=&giottoFormFlag_consultant=1#start
Scraping URL with rpp=40: https://www.finder.bupa.co.uk/Consultant/search/?order+=rel&ffeeAssured=1&fspecialityId%5B%5D=8&fgender%5B%