In [71]:
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.common.by import By 
from selenium.webdriver.chrome.options import Options

from bs4 import BeautifulSoup

# Automates downloading and setting up of Chrome Driver
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Category id -> category name. Both values are substituted into BASE_URL
# when building the address of a listing page.
PRODUCT_CATEGORIES = {
    "167": "clothes",
    "1": "bags",
}

# Listing-page URL template.
# Placeholders, in order: category name, category id, page number.
BASE_URL = "https://www.kilimall.co.ke/category/{}?id={}&form=category&page={}"

In [72]:
def _text_or_none(container, tag: str, css_class: str):
    """Return the text of the first `tag` with class `css_class` inside `container`, or None if absent."""
    element = container.find(tag, {"class": css_class})
    return element.text if element is not None else None


def scrape_page(driver, current_url: str, product_category: str, timeout: float = 4) -> list:
    """
    Scrape a single web page of a paginated product listing.

    Args:
        driver: a webdriver object used to load the page
        current_url: the url that will be visited by the driver
        product_category: category label stored on every product scraped from this page
        timeout: maximum number of seconds to wait for the page <body> to be present

    Returns:
        List of dictionaries, one per "listing-item" element found on the page,
        with the keys product_name, product_price, number_reviews,
        product_category and product_source. A scraped field is None when the
        listing does not contain the corresponding element.

    Raises:
        selenium's TimeoutException if <body> is not present within `timeout` seconds.
    """

    driver.get(current_url)  # Visit the web page

    # Wait until the document body exists before reading the page source.
    # NOTE(review): this only waits for <body>; if the listings are rendered
    # later by JavaScript the page may be read before they appear - confirm.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )

    # Parse the rendered HTML with Beautiful Soup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Each product lives in its own "listing-item" container
    products_wrapper = soup.find_all("div", {"class": "listing-item"})

    all_scrapped_products = list()

    for product in products_wrapper:
        # A listing may lack some elements (for example no reviews or no
        # source tag); such fields are recorded as None rather than raising
        # AttributeError and aborting the whole scrape.
        all_scrapped_products.append(
            {
                "product_name": _text_or_none(product, "p", "product-title"),
                "product_price": _text_or_none(product, "div", "product-price"),
                "number_reviews": _text_or_none(product, "span", "reviews"),
                "product_category": product_category,
                # Where the product was sourced: Locally or Shipped
                "product_source": _text_or_none(product, "span", "tag-name"),
            }
        )

    return all_scrapped_products

In [73]:
def main(BASE_URL: str, num_pages: int) -> list:
    """
    Scrape the first `num_pages` listing pages of every category in PRODUCT_CATEGORIES.

    Args:
        BASE_URL: URL template of the website to be scraped. It is formatted with
            (category name, category id, page number) for every page visited.

        num_pages: The number of pages to be scraped per product category.
            Pages are numbered from 1; a value below 1 visits no page.

    Returns:
        A final list containing one dictionary per product scraped across all
        categories and pages.
    """

    # Set up the use of headless Chrome - No UI
    chrome_browser_options = Options()
    chrome_browser_options.add_argument("--headless")

    service = Service(ChromeDriverManager().install())  # Set up Chrome driver

    driver = webdriver.Chrome(service=service, options=chrome_browser_options)  # Create a webdriver object

    all_products = list()  # list of dicts to store the scraped products

    try:
        # Scrape one product category at a time
        for category_id, category_name in PRODUCT_CATEGORIES.items():

            # range() restarts at page 1 for every category, so no manual
            # counter reset is needed when switching categories.
            for current_page_number in range(1, num_pages + 1):

                current_url = BASE_URL.format(category_name, category_id, current_page_number)

                # Add this page's products to the final list of products
                all_products.extend(scrape_page(driver, current_url, category_name))
    finally:
        # Always close the browser, even when a page fails to scrape;
        # otherwise the Chrome process would be left running.
        driver.quit()

    return all_products

[{'product_name': '2pcs/Set Sexy lace Women Nightwear Nightgowns Nightdress Sleepwear Sleeping Night Dress Gowns Ladies Clothes', 'product_price': 'KSh 699', 'number_reviews': '(1646)', 'product_category': 'clothes', 'product_source': 'Fulfilled By Kilimall'}, {'product_name': 'Waist Trainer Slimming Belt Slim Body Shaper Corset Trimmer Sport Gym Fat Burner Elastic Shapewear Snug Fit Women Ladies Belly Slim Belt Band Body Building Sheath Flat Girdle Postpartum Control Wrap', 'product_price': 'KSh 379', 'number_reviews': '(5777)', 'product_category': 'clothes', 'product_source': 'Fulfilled By Kilimall'}, {'product_name': 'Waist Trainer Slimming Belt Slim Body Shaper Corset Trimmer Sport Gym Fat Burner Elastic Shapewear', 'product_price': 'KSh 259', 'number_reviews': '(1423)', 'product_category': 'clothes', 'product_source': 'Fulfilled By Kilimall'}, {'product_name': "SXCHEN New Men's Shorts Casual Sports Five-point Pants Loose Breathable Man Quick-drying Large Pants Beach Pants Knee Len

In [None]:
if __name__ == "__main__":
    # Scrape only the first page of each category and print the raw result.
    products = main(BASE_URL, num_pages=1)
    print(products)