In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# HTTP headers sent with every request; the User-Agent identifies the client
# as a desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    ),
}

# Search-result URLs to scrape, each paired with the category label that is
# later used as the Excel sheet name. Every URL shares the same site prefix,
# so only the "<trade>/<location>" part is listed per entry.
urls_and_categories = [
    (f"https://www.yellowpages.com.au/find/{search_path}", category_label)
    for search_path, category_label in (
        ("plumbers-gas-fitters/canberra-act-2601", "Plumbers"),
        ("electricians-electrical-contractors/sydney-nsw-2000", "Electricians"),
        ("mechanics-motor-engineers/milton-nsw-2538", "Mechanics"),
        ("hairdressers/melbourne-vic-3000", "Hairdressers"),
        ("builders-building-contractors/brisbane-city-cbd-qld", "Builders"),
        ("doctors-medical-practitioners/mittagong-nsw-2575", "Doctors"),
        ("restaurants/townsville-qld-4810", "Restaurants"),
        ("dentist/cairns-city-qld-4870", "Dentists"),
        ("lawyers-solicitors/gold-coast-qld", "Lawyers"),
    )
]


# Helper: null-safe text extraction for a listing field
def _text_or_default(parent, css_class, default="N/A"):
    """Return the stripped text of the first ``div`` with ``css_class`` under ``parent``.

    Returns ``default`` when no such element exists, so a listing that lacks
    a field does not raise ``AttributeError``.
    """
    element = parent.find("div", class_=css_class)
    return element.text.strip() if element is not None else default


# Define a function to scrape data from a single page
def scrape_page(url, timeout=10):
    """Fetch one results page and extract its business listings.

    Args:
        url: Full URL of the results page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            indefinitely.

    Returns:
        A list of dicts with the keys "Company Name", "Work" and
        "Phone Number", one per listing card found on the page. A field whose
        element is missing from the card is reported as "N/A". Returns an
        empty list when the request fails (network error, timeout or HTTP
        error status); the failure is printed rather than raised.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    # Paid and free listings use different card classes; match either one.
    products = soup.find_all(
        "div",
        class_=[
            "Box__Div-sc-dws99b-0 iOfhmk MuiPaper-root MuiCard-root PaidListing MuiPaper-elevation1 MuiPaper-rounded",
            "Box__Div-sc-dws99b-0 iOfhmk MuiPaper-root MuiCard-root FreeListing MuiPaper-elevation1 MuiPaper-rounded",
        ],
    )
    return [
        {
            "Company Name": _text_or_default(product, "Box__Div-sc-dws99b-0 dAyAhR"),
            "Work": _text_or_default(product, "Box__Div-sc-dws99b-0 bKFqNV"),
            "Phone Number": _text_or_default(product, "Box__Div-sc-dws99b-0 drWGzL"),
        }
        for product in products
    ]


# Function to handle pagination and scrape all pages (up to max_pages pages)
def scrape_all_pages(url, max_pages=5):
    """Scrape consecutive result pages for one search URL.

    Pages are requested as ``<url>/page-1``, ``<url>/page-2``, ... and
    scraping stops at the first page that yields no listings or once
    ``max_pages`` pages have been collected.

    Args:
        url: Base search URL without the ``/page-N`` suffix. A trailing
            slash is tolerated.
        max_pages: Maximum number of pages to request (default 5). A value
            of 0 or less requests nothing.

    Returns:
        A single list with the listing dicts of all scraped pages, in page
        order.
    """
    base_url = url.rstrip("/")  # avoid ".../​/page-N" when the caller passes a trailing slash
    all_data = []
    for page_num in range(1, max_pages + 1):
        page_data = scrape_page(f"{base_url}/page-{page_num}")
        if not page_data:
            # An empty page means either the end of the results or a failed fetch.
            break
        all_data.extend(page_data)
    return all_data


# Scrape every category concurrently. executor.map yields results in the same
# order as its inputs, so results[i] belongs to urls_and_categories[i].
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(
        executor.map(scrape_all_pages, (url for url, _ in urls_and_categories))
    )

# Build one DataFrame per category, skipping categories that returned no rows
all_data = {
    category: pd.DataFrame(rows)
    for (_, category), rows in zip(urls_and_categories, results)
    if rows
}

# Save all data to a single Excel file, one sheet per category. The write is
# skipped when nothing was scraped: a workbook with no sheets cannot be saved,
# and reporting success would be misleading.
if all_data:
    with pd.ExcelWriter("yellow_pages_data_all_pages.xlsx") as writer:
        for category, frame in all_data.items():
            frame.to_excel(writer, sheet_name=category, index=False)
    print("Data scraped and saved successfully!")
else:
    print("No data was scraped; no Excel file was written.")

Data scraped and saved successfully!
