In [7]:
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.common.by import By 
from selenium.webdriver.chrome.options import Options

# Beautiful Soup 
from bs4 import BeautifulSoup

# Automates downloading and setting up of Chrome Driver
from webdriver_manager.chrome import ChromeDriverManager

# Custom Settings for the project 
import settings


In [30]:
def scrape_page(driver, current_url: str) -> list: 
    """    
    This functions scrapes a single webpage in a paginated website. 

    Args: 
        driver: a webdriver oject
        current_url: the url that will be visited by the driver

    Returns: 
        List of products scraped from a single page/current_url
    """ 

    driver.get(current_url) # Visit the web page 

    # wait for the page to fully load
    wait = WebDriverWait(driver, 4)
    wait.until(EC.presence_of_element_located((By.TAG_NAME, "body"))) 

    html_content = driver.page_source # Access the HTML source 
    
    soup = BeautifulSoup(html_content, "html.parser") # Parse using Beautiful Soup 

    products_wrapper = soup.find_all("div", {"class": "listing-item"}) # Access the products container 

    all_scrapped_products = list()

    # Loop through the products_wrapper to access each product cantainer
    # Then extract the prodcut information 
    for product in products_wrapper:
        product_name = product.find("p", {"class": "product-title"}).text 
        product_price = product.find("div", {"class": "product-price"}).text
        num_reviews = product.find("span", {"class": "reviews"}).text
        product_source = product.find("span", {"class": "tag-name"}).text # Where the product was sourced: Locally or Shipped
        
        # Wrap the produce in a dictionary and append to the list
        scrapped_product_dict = {
            "product_name": product_name,
            "product_price": product_price,
            "number_reviews": num_reviews,
            "product_source": product_source
        } 
        all_scrapped_products.append(scrapped_product_dict) 

    return all_scrapped_products



    

In [31]:

def main(BASE_URL: str, num_pages: str): 
    """   
    This is the main function that will be used to scrape the entire website. 

    Args: 
        BASE_URL: The url of the website to be scrapped. Contains the URL that will dynamically change due to pagination and search parameters such as product category 

        num_pages: The number of pages to be scrapped on the specified website.
     
    Returns: 
        A final list containing a dictionary of products scrapped from the entire website
    """ 

    # Set up the use of headless Chrome 
    chrome_browser_options = Options()
    chrome_browser_options.headless = True

    service = Service(ChromeDriverManager().install()) # Set up Chrome driver

    driver = webdriver.Chrome(service=service, options=chrome_browser_options) # Create a webdriver object

    current_page_nummber = 1 # Page Counter. Tracks the current web page being scrapped

    all_products = list() # list of dicts To store the scrapped products

    while current_page_nummber <= num_pages: 

        current_url = settings.BASE_URL.format("shoes", current_page_nummber) 

        current_products_sublist = scrape_page(driver, current_url) 
        print(current_products_sublist)
        current_page_nummber += 1

    driver.quit()

if __name__ == "__main__":
    main(settings.BASE_URL, 1)

[{'product_name': 'Cleaning Brush Automatic Liquid Discharge Soft Bristled Shoe Brush Long Handle Brush Household Cleaning Tool\n', 'product_price': 'KSh 299', 'number_reviews': '(42)', 'product_source': 'Fulfilled By Kilimall'}, {'product_name': "ASSKLO rubber flat shoes canvas shoes sports shoes classic women's shoes ladies shoes slip resistant breathable denim casual shoes women's running shoes student and girl sneakers fashion gifts", 'product_price': 'KSh 999', 'number_reviews': '(1191)', 'product_source': 'Fulfilled By Kilimall'}, {'product_name': "Size 41-45 Men's fashion sneakers mesh surface outdoor sports shoes Students comfortable  shoes Boys walking shoes hiking shoes running shoes Casual shoes", 'product_price': 'KSh 1,299', 'number_reviews': '(633)', 'product_source': 'Fulfilled By Kilimall'}, {'product_name': 'Air Cushion Men Sport Running Shoes Fashion Sneakers Jogging Trainers Casual Lace-up Breathable Mesh Outdoor', 'product_price': 'KSh 1,998', 'number_reviews': '(16

In [12]:

# Downloading and setting up Chrome Driver 
service = Service(ChromeDriverManager().install()) 

# initializing the webdriver 
driver = webdriver.Chrome(service=service) 

# Navigating to the website 
driver.get(BASE_URL) 

# Let the scrapper wait for 5 seconds to ensure page has fully loaded 
wait = WebDriverWait(driver, 5)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body"))) 

# Access the HTML content 
html_markup = driver.page_source 
soup = BeautifulSoup(html_markup, "html.parser") 

# Get all the tags containing the products listing 
products_wrapper = soup.find_all("div", {"class": "listing-item"})

# Loop through the products_wrapper 
for product in products_wrapper: 
    product_name = product.find("div", {"class": "product-price"}).text 

driver.quit()