**Objective**

This project involves web scraping product information from "aerocruisepilotshop.com", a specialized e-commerce site for aviation supplies. The goal is to gather structured data on various products across different categories, including Aircraft Parts, Avionics, Pilot Supplies, Pilot Training, and Safety and FBO.

Such a project is highly useful for various stakeholders in the aviation and e-commerce sectors:

* Retail Businesses & E-commerce Stores: Especially in niche markets like aviation supplies, for inventory management, competitive pricing strategies, product discovery, and purchasing.

* Market Researchers and Data Scientists: For gathering datasets for market analysis, trend identification, and building forecasting models for supply chain optimization.

* Aviation Startups: For competitive tracking and identifying market opportunities.

* Plane Owners and Pilots: To find the best prices for parts, check availability, source rare items, and optimize purchases of supplies and training materials.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin

# Basic Setup
# Root of the shop; every category and product address is resolved against it.
BASE_URL = "https://www.aerocruisepilotshop.com/"
# Categories to crawl. Each entry carries the display name, the URL path of the
# category listing (relative to BASE_URL), and the maximum number of listing
# pages to visit for that category.
CATEGORIES_CONFIG = [
    {"name": label, "slug": f"product-category/{path_part}/", "max_pages": page_limit}
    for label, path_part, page_limit in (
        ("Aircraft Parts", "aircraft-parts", 2),
        ("Avionics", "avionics", 1),
        ("Pilot Supplies", "pilot-supplies", 3),
        ("Pilot Training", "pilot-training", 2),
        ("Safety and FBO", "safety-and-fbo", 1),
    )
]

# Cleaning Up Prices
# A number is either comma-grouped thousands ("1,234.56") or plain digits
# ("1234.56"). The plain-digit alternative matters: requiring at most three
# leading digits would cut "1234.56" down to "123".
_PRICE_NUMBER = r'(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?'

# Two numbers separated by a hyphen, an en/em dash, or the word "to", with an
# optional currency symbol in front of the second number ("$10.00 – $20.00").
_PRICE_RANGE_RE = re.compile(
    '(' + _PRICE_NUMBER + r')\s*(?:[-\u2013\u2014]|to)\s*[$\u20ac\u00a3\u00a5]?\s*('
    + _PRICE_NUMBER + ')'
)
_PRICE_SINGLE_RE = re.compile(_PRICE_NUMBER)


def clean_price(price_text):
    """
    Convert the price text shown on the website into a clean value.

    Handles a single price ("$1,234.56", "1234.56") as well as a price range
    ("100 - 200", "$10.00 – $20.00", "1,000 to 2,500"). Thousands separators
    are removed in both cases.

    Args:
        price_text (str): The price as it appears on the website.

    Returns:
        float, str, or None: A single price as a float, a range as the string
        "<low> - <high>", or None when the text is empty or holds no number.
    """
    if not price_text:
        return None

    # Ranges are checked first, otherwise the single-number search would
    # return only the lower bound.
    range_match = _PRICE_RANGE_RE.search(price_text)
    if range_match:
        low = range_match.group(1).replace(',', '')
        high = range_match.group(2).replace(',', '')
        return f"{low} - {high}"

    single_match = _PRICE_SINGLE_RE.search(price_text)
    if single_match:
        # The pattern only matches digits with an optional decimal part, so
        # float() cannot fail once the commas are stripped.
        return float(single_match.group(0).replace(',', ''))

    return None

# Getting Web Page Content
def fetch_page_content(url, session, timeout=30):
    """
    Download a web page and parse it into a BeautifulSoup tree.

    Args:
        url (str): The web address to visit.
        session (requests.Session): The session used to send the request.
        timeout (float or tuple): Seconds to wait for the server before giving
            up; passed straight to ``session.get``. Without a timeout a stalled
            server would block the scraper indefinitely.

    Returns:
        BeautifulSoup object or None: The parsed page, or None when the request
        failed (connection error, timeout, or an HTTP error status).
    """
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
        return BeautifulSoup(response.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        # Timeouts are a RequestException subclass, so they land here too.
        print(f"Failed to retrieve page {url}. Error: {e}")
        return None

# Getting Details from a Product Page
def scrape_product_details(product_url, session):
    """
    Visit a product page and pull out its SKU, short description and full specs.

    Args:
        product_url (str): The web address of the product page.
        session (requests.Session): The session used to send the request.

    Returns:
        tuple: (sku, description, detailed_specs). Any value that could not be
        found stays 'N/A'; all three are 'N/A' when the page could not be fetched.
    """
    print(f"      -> Fetching details from: {product_url}")
    page = fetch_page_content(product_url, session)
    if not page:
        print(f"      Warning: Could not fetch details for {product_url}.")
        return 'N/A', 'N/A', 'N/A'

    found = {'sku': 'N/A', 'description': 'N/A', 'detailed_specs': 'N/A'}

    # (result field, tag name, search criteria, get_text options), in lookup order.
    lookups = (
        ('sku', 'span', {'class_': 'sku'}, {'strip': True}),
        (
            'description',
            'div',
            {'class_': 'woocommerce-product-details__short-description'},
            {'separator': ' ', 'strip': True},
        ),
        (
            'detailed_specs',
            'div',
            {'id': 'tab-description'},
            {'separator': ' ', 'strip': True},
        ),
    )

    try:
        for field, tag_name, criteria, text_options in lookups:
            node = page.find(tag_name, **criteria)
            if node:
                found[field] = node.get_text(**text_options)
    except Exception as e:
        # Values collected before the failure are kept; the rest stay 'N/A'.
        print(f"      Error extracting details from {product_url}: {e}")

    return found['sku'], found['description'], found['detailed_specs']

# Main Function to Get All Products
def _extract_listing_price(container):
    """Return the cleaned price shown on a listing card, or None when it has no price."""
    price_element = container.find('span', class_='price')
    if not price_element:
        return None

    # A product on sale wraps the current price in <ins>; prefer it over the
    # struck-through regular price.
    price_scope = price_element.find('ins') or price_element
    amount_span = price_scope.find('span', class_='woocommerce-Price-amount')
    price_text = (amount_span or price_scope).get_text(strip=True)
    return clean_price(price_text)


def _save_category_csv(category_name, category_products_data):
    """Write one category's product rows to its own CSV file (skipped when empty)."""
    if not category_products_data:
        print(f"  No products scraped for '{category_name}'. Skipping individual CSV creation.")
        return

    df_category = pd.DataFrame(category_products_data)
    clean_category_name_for_file = category_name.replace(" ", "_").replace("&", "and").lower()
    category_output_csv_file = f'aerocruise_{clean_category_name_for_file}_products.csv'
    try:
        df_category.to_csv(category_output_csv_file, index=False, encoding='utf-8')
        print(f"  Saved {len(df_category)} products for '{category_name}' to {category_output_csv_file}")
    except Exception as e:
        print(f"  Error saving data for '{category_name}' to CSV: {e}")


def scrape_aerocruise_pilot_shop():
    """
    Walk every configured category, scrape each listed product and collect the results.

    For each entry in CATEGORIES_CONFIG the listing pages are visited in order
    (up to the entry's "max_pages"); a category stops early when a page fails
    to load or lists no products. Every product card yields one row with its
    name, URL, price and availability, and the product page is fetched for the
    SKU, description and detailed specs. A product that raises while being
    processed is still recorded, with 'Error' in the three detail columns.
    Each category is also written to its own CSV file as a side effect.

    Returns:
        pandas.DataFrame: A table with all the product information we found
        (empty when nothing could be scraped).
    """
    print("--- Starting comprehensive web scraping for Aerocruise Pilot Shop ---")
    all_products_data = []

    with requests.Session() as session:
        # Pretend to be a web browser so we don't get blocked
        session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'})

        for category_info in CATEGORIES_CONFIG:
            category_name = category_info["name"]
            category_slug = category_info["slug"]
            max_pages = category_info["max_pages"]

            category_products_data = []

            print(f"\n--- Scraping category: {category_name} (up to {max_pages} pages) ---")
            current_category_base_url = urljoin(BASE_URL, category_slug)

            for page_num in range(1, max_pages + 1):
                # Page 1 lives at the category root; later pages under page/<n>/.
                url = f"{current_category_base_url}page/{page_num}/" if page_num > 1 else current_category_base_url
                print(f"  Attempting to scrape page {page_num}: {url}")

                soup = fetch_page_content(url, session)
                if not soup:
                    print(f"  Failed to load page {url}. Moving to next category.")
                    break

                product_containers = soup.find_all('li', class_='product')
                if not product_containers:
                    print(f"  No more products found on page {page_num} for {category_name}. Moving to next category.")
                    break

                for container in product_containers:
                    product_name = 'N/A'
                    product_url = 'N/A'
                    price = None
                    availability = 'N/A'
                    sku = 'N/A'
                    description = 'N/A'
                    detailed_specs = 'N/A'

                    try:
                        # Get the product name and URL
                        product_link_element = container.find('a', class_='woocommerce-LoopProduct-link')
                        if product_link_element:
                            relative_url = product_link_element.get('href')
                            # urljoin() returns the base URL unchanged for an empty
                            # href, which would make the home page look like a
                            # product page, so only resolve a real href.
                            if relative_url:
                                product_url = urljoin(BASE_URL, relative_url)
                            title_element = product_link_element.find('h2', class_='woocommerce-loop-product__title')
                            if title_element:
                                product_name = title_element.get_text(strip=True)
                            elif product_link_element.text:
                                # No title tag: fall back to the link text minus button labels.
                                temp_name = product_link_element.get_text(separator=' ', strip=True)
                                product_name = re.sub(r'Add to cart|Select options|View options', '', temp_name).strip()
                                if not product_name:
                                    product_name = 'N/A - Name text empty'
                            else:
                                product_name = 'N/A - Name not found'
                        else:
                            print(f"    Warning: No main product link found for a container on page {page_num} in {category_name}.")
                            product_name = 'N/A - No link'
                            product_url = 'N/A'

                        # Get the price
                        price = _extract_listing_price(container)

                        # Check if the product is in stock
                        out_of_stock_element = container.find('p', class_='stock out-of-stock')
                        availability = 'Out of Stock' if out_of_stock_element else 'In Stock'

                        # Get more details from the product page
                        if product_url != 'N/A':
                            sku, description, detailed_specs = scrape_product_details(product_url, session)
                            time.sleep(0.5)  # Be polite between product-page requests

                    except Exception as e:
                        print(f"  Error processing a product on page {page_num} in {category_name}: {e}")
                        # Keep whatever listing fields were already collected and
                        # flag the detail columns so the row is easy to spot.
                        sku = description = detailed_specs = 'Error'

                    category_products_data.append({
                        'Category': category_name,
                        'Product Name': product_name,
                        'Product URL': product_url,
                        'Price': price,
                        'Availability': availability,
                        'SKU/Part Number': sku,
                        'Description': description,
                        'Detailed Specs': detailed_specs
                    })

                time.sleep(1)  # Pause between listing pages

            all_products_data.extend(category_products_data)

            # Save the data for this category to a CSV file
            _save_category_csv(category_name, category_products_data)

    print("\n--- Comprehensive scraping complete! ---")
    return pd.DataFrame(all_products_data)

#  Execution Block
if __name__ == "__main__":
    # Run the full scrape, then write the combined table next to the
    # per-category files produced during the run.
    df_all_products = scrape_aerocruise_pilot_shop()
    if df_all_products.empty:
        print("No data was scraped across all categories for the combined CSV.")
    else:
        output_csv_file = 'aerocruise_products_all_categories.csv'
        try:
            df_all_products.to_csv(output_csv_file, index=False, encoding='utf-8')
            total_rows = len(df_all_products)
            print(f"\nSuccessfully saved {total_rows} products from ALL categories to {output_csv_file}")
            print("\nFirst 10 rows of the combined scraped data for verification:")
            preview = df_all_products.head(10)
            print(preview)
        except Exception as e:
            print(f"Error saving combined data to CSV: {e}")


--- Starting comprehensive web scraping for Aerocruise Pilot Shop ---

--- Scraping category: Aircraft Parts (up to 2 pages) ---
  Attempting to scrape page 1: https://www.aerocruisepilotshop.com/product-category/aircraft-parts/
      -> Fetching details from: https://www.aerocruisepilotshop.com/product/36-yard-roll/
      -> Fetching details from: https://www.aerocruisepilotshop.com/product/bearing-needle/
      -> Fetching details from: https://www.aerocruisepilotshop.com/product/brake-lining/
      -> Fetching details from: https://www.aerocruisepilotshop.com/product/bushing/
Failed to retrieve page https://www.aerocruisepilotshop.com/product/bushing/. Error: 500 Server Error: Internal Server Error for url: https://www.aerocruisepilotshop.com/product/bushing/
      -> Fetching details from: https://www.aerocruisepilotshop.com/product/cessna-cowl-chafe-seal/
      -> Fetching details from: https://www.aerocruisepilotshop.com/product/cessna-strut-seal-kit/
      -> Fetching details fr