In [1]:
# Import modules for regular expressions, time, file path matching, SQLite interaction, operating system interaction, HTML parsing, Selenium web scraping, and custom functions for database and web scraping tasks.
import re
import time
import platform
import glob
import sqlite3
import os
from lxml import html
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, WebDriverException, TimeoutException
from selenium.webdriver.support.ui import Select

# Import custom functions related to database operations
from db_functions_EA import *

# Import custom functions related to general web scraping tasks
from functions_EA import *



### Configures a Firefox WebDriver to use a local SOCKS5 proxy, starts the browser, and navigates to the passports category page of a .onion site.

In [2]:
# Firefox preferences shared by every session: send traffic through the SOCKS5
# proxy listening on 127.0.0.1:9150, resolve DNS through that proxy, and skip images.
firefox_preferences = {
    'network.proxy.type': 1,
    'network.proxy.socks': '127.0.0.1',
    'network.proxy.socks_port': 9150,
    'network.proxy.socks_version': 5,
    'network.proxy.socks_remote_dns': True,
    'permissions.default.image': 2,  # deactivate the images on the browser
}

common_options = Options()
for preference_name, preference_value in firefox_preferences.items():
    common_options.set_preference(preference_name, preference_value)

# Look for a "*.default-release" Firefox profile in the macOS profile directory
profile_paths = glob.glob(os.path.expanduser("~/Library/Application Support/Firefox/Profiles/*.default-release"))

# Abort early when no such profile exists on this machine
if not profile_paths:
    raise Exception("No Firefox profiles found.")

# The first match is the one that gets used
profile_path = profile_paths[0]

# NOTE(review): this stores the path under a preference literally named 'profile';
# it does not appear to be the documented way to attach a Firefox profile
# (Options.profile) — confirm whether the profile is actually being loaded.
common_options.set_preference('profile', profile_path)

# Start Firefox with the options assembled above
driver = webdriver.Firefox(options=common_options)

# Open the passports product-category page with all products listed on one page
driver.get('http://3hoqqrwr3fnk32rkbrui2ir7ahfrwjgdzhk5i4twrvcoiv46nst5weyd.onion/product-category/passports/?products-per-page=all')


### Calls the custom function save_html() to capture and store the HTML content of the current web page in a file named 'first_page_client_code.html'.

In [3]:
# Save the HTML of the page currently loaded in `driver` to 'first_page_client_code.html'
# using the custom save_html() helper. The helper is defined outside this notebook,
# so anything it does beyond this (driver, filename) call is not visible here — TODO confirm.
save_html(driver, 'first_page_client_code.html')

### Defines a screenshot() function that captures a full-page screenshot, then uses it to save the first page as 'First_page_pass_EA.png'.

In [4]:
# Function to capture a screenshot of the web page using Selenium
def screenshot(driver, outputPath, width=None):
    """Resize the browser window to the document's size and save a screenshot.

    Args:
        driver: Selenium WebDriver showing the page to capture.
        outputPath: Path handed to ``driver.save_screenshot``.
        width: Window width to use; when None, the document's ``scrollWidth``
            is queried from the page and used instead.

    Note:
        The window is left at the new size after the call.
    """
    # Identity test: None is a singleton, and `== None` can be fooled by a
    # custom __eq__ on the value passed in.
    if width is None:
        width = driver.execute_script('return document.documentElement.scrollWidth')

    # Full document height, so the whole page fits in the window
    heightMax = driver.execute_script('return document.documentElement.scrollHeight')

    # Resize first, then capture
    driver.set_window_size(width, heightMax)
    driver.save_screenshot(outputPath)

# Capture the page currently loaded in `driver` and save it as 'First_page_pass_EA.png'.
# Passing None for width makes screenshot() use the document's scroll width.
screenshot(driver, 'First_page_pass_EA.png', None)


### Uses the lxml library to parse the HTML content from 'first_page_client_code.html' and assigns the root element to the variable html, facilitating information extraction.

In [5]:
# Parse the saved page with lxml and keep its root element for later extraction.
# The import cell only runs `from lxml import html`, which does not bind the name
# `lxml`, so `lxml.html.parse` raises NameError unless a star import happens to
# provide it. Import the package explicitly so this cell works on its own.
import lxml.html

# NOTE: this rebinds `html` from the lxml.html module (imported in the first cell)
# to the parsed root element; the cells below use `html` as that element.
html = lxml.html.parse('first_page_client_code.html').getroot()

### Calls the custom function initialize_database() to establish a database connection, assigning the resulting connection object to the variable connection.

In [6]:
# Obtain the database handle from the custom initialize_database() helper
# (defined outside this notebook — presumably in db_functions_EA; TODO confirm).
# Every create_table_* / insert_* call in the cells below receives `connection`.
connection = initialize_database()

### Creates a database table, extracts a category from HTML, counts titles, and inserts the information into the database.

In [7]:
# Make sure the category table exists before anything is written to it.
create_table_category(connection)

# Category name: text of the first element matching 'trail-item trail-end',
# or None when the page has no such element.
category_elements = html.find_class('trail-item trail-end')
if category_elements:
    category = category_elements[0].text_content()
else:
    category = None

# Number of elements on the page carrying the class 'title'
title_elements = html.find_class('title')
number_of_titles = len(title_elements)

# Store the category together with its title count
insert_category(connection, category, number_of_titles)

### Creates a database table for sorting options, extracts sorting options from an HTML select element, and inserts them into the database if found. Silently passes without printing any message if no sorting options are found.

In [8]:
# Make sure the sorting-options table exists
create_table_sorting_options(connection)

# Locate the sorting <select> in the parsed page (None when it is absent)
sorting_options = html.find('.//select[@class="orderby hasCustomSelect"]')

# Compare against None explicitly, then store the text of each child of the
# <select>. When the element is missing nothing is inserted and nothing is printed.
if sorting_options is not None:
    for sorting_option in sorting_options:
        insert_sorting_options(connection, sorting_option.text)


### Creates a directory, iterates sorting options, saves HTML code, and handles StaleElementReferenceException.

In [9]:
# Create the directory if it doesn't exist in the current folder
directory_name = "datasets/dataset_1/sorting_html"
create_or_check_directory(directory_name)

# Collect the option labels up front; the loop below re-locates each option by
# its text instead of reusing the element references gathered here.
sorting_options = driver.find_elements(By.XPATH, '//select[@class="orderby hasCustomSelect"]/option')
sorting_option_texts = [option.text for option in sorting_options]

# Loop through each sorting option text
for sorting_option_text in sorting_option_texts:
    try:
        # Click on the sorting option
        driver.find_element(By.XPATH, f'//select[@class="orderby hasCustomSelect"]/option[text()="{sorting_option_text}"]').click()

        time.sleep(10)  # Wait for the page to load

        filename = f'{directory_name}/{sorting_option_text}.html'
        # Use the save_html function to save the HTML code
        save_html(driver, filename)

    except StaleElementReferenceException as error:
        # The option is skipped (no retry happens here). Report it so a missing
        # HTML file in the dataset can be traced back to this failure.
        print(f"Skipping sorting option {sorting_option_text!r}: element went stale ({error})")
        continue

### Processes HTML files, collects product data with sorting options, and inserts into corresponding tables.

In [10]:
# Function to extract and collect product data using lxml
def collect_product_data(html_root):
    """Collect product rows and the active sorting option from one parsed page.

    Args:
        html_root: Parsed page root exposing ``find_class``, ``xpath`` and
            ``find`` (an lxml HTML element in this notebook).

    Returns:
        A ``(data, sorting_option_text)`` tuple. ``data`` is a list of dicts with
        the keys 'Passport Type', 'Price' and 'Link'. ``sorting_option_text`` is
        the selected option's text with every run of non-word characters replaced
        by '_', or "UnknownSortingOption" when the sorting select or its selected
        option is missing.
    """
    data = []

    for product_grid in html_root.find_class('products oceanwp-row clr grid'):
        title_nodes = product_grid.find_class('title')
        price_nodes = product_grid.find_class('price')
        anchor_nodes = product_grid.xpath('.//li[@class="title"]/h2/a')

        # zip() pairs the three lists positionally and stops at the shortest one
        data.extend(
            {
                'Passport Type': title_node.text_content(),
                'Price': price_node.text_content(),
                'Link': anchor_node.get("href"),
            }
            for title_node, price_node, anchor_node in zip(title_nodes, price_nodes, anchor_nodes)
        )

    # No sorting <select> on the page
    sorting = html_root.find('.//select[@class="orderby hasCustomSelect"]')
    if sorting is None:
        return data, "UnknownSortingOption"

    # The <select> exists but no option is marked as selected
    selected_option = sorting.find('.//option[@selected="selected"]')
    if selected_option is None:
        return data, "UnknownSortingOption"

    # Collapse every run of non-word characters in the label into one underscore
    return data, re.sub(r'\W+', '_', selected_option.text_content())
    
# Parse every saved HTML file in `directory_name` with the custom helper
html_files_data = process_html_files(directory_name)

# Maps each passport type to the first link seen for it
product_dict = {}

# One pass per parsed file
for html_file, html_root in html_files_data.items():
    # Product rows plus the sorting option that was active on this page
    product_data, sorting_option_text = collect_product_data(html_root)

    # Make sure a table for this sorting option exists
    create_table_sorting_options_separate(connection, sorting_option_text)

    for item in product_data:
        # Store the row in the table belonging to this sorting option
        insert_table_sorting_options_separate(connection, sorting_option_text, item)

        # setdefault keeps the first link recorded for a passport type
        product_dict.setdefault(item['Passport Type'], item['Link'])


### Creates a database table for different passport types, saves HTML files, and inserts data into the database.

In [11]:
# Creates a table for the passport types offered in each product's dropdown,
# saves the HTML of every product page and variation, and inserts the data.
create_table_passport_type(connection)

# Save directory for product HTML files
products_directory = "datasets/dataset_2/products_html"
create_or_check_directory(products_directory)

# Save directory for product types HTML files
product_types_directory = "datasets/dataset_2/products_types_html"
create_or_check_directory(product_types_directory)

# Maximum number of attempts to load a product page
max_retries = 3

# Iterate through each product in product_dict
for product_name, link in product_dict.items():

    # Retry loop for driver.get(link)
    for _ in range(max_retries):
        try:
            # Visit the link using Selenium
            driver.get(link)

            # If successful, break out of the loop
            break
        except Exception as e:
            print(f"Error: {e}")
            print("Retrying...")
            time.sleep(10)  # Adjust the sleep duration based on your specific case
    else:
        # Every attempt failed, so the browser may still show an earlier page.
        # Saving its HTML under this product's name would mislabel the data.
        print(f"Giving up on {link} after {max_retries} attempts.")
        continue

    # Create a filename based on the product name and save the HTML code
    filename = f"{products_directory}/{product_name}.html"
    save_html(driver, filename)

    # Extract the title
    try:
        title_element = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, '//*[@class="single-post-title product_title entry-title"]'))
        )
        title = title_element.text

    except (TimeoutException, NoSuchElementException):
        # WebDriverWait.until raises TimeoutException when the condition never
        # holds; catching only NoSuchElementException let timeouts abort the loop.
        print("Title not found. Skipping this product...")
        time.sleep(5)  # Adjust the sleep duration based on your specific case
        continue

    # Find the dropdown element by ID
    try:
        dropdown_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "type"))
        )
    except (TimeoutException, NoSuchElementException):
        # No "type" dropdown on this product page: nothing more to collect
        continue

    # Get the list of option texts in the dropdown
    dropdown_option_texts = [option.text for option in dropdown_element.find_elements(By.TAG_NAME, "option")]

    # Loop through each option in the dropdown
    for dropdown_option_text in dropdown_option_texts:
        # Skip the placeholder entry
        if dropdown_option_text == "Choose an option":
            continue

        # Select an option by visible text
        try:
            dropdown_element.click()
            driver.find_element(By.XPATH, f'//select[@id="type"]/option[text()="{dropdown_option_text}"]').click()
        except NoSuchElementException:
            continue

        # Wait for prices to update
        time.sleep(10)

        # find_elements returns an empty list when nothing matches; fall back to
        # a placeholder instead of raising IndexError on [0].
        variation_blocks = driver.find_elements(By.CLASS_NAME, 'woocommerce-variation.single_variation')
        price = variation_blocks[0].text if variation_blocks else 'N/A'

        # Insert data into the ProductDetails table
        insert_table_passport_type(connection, title, dropdown_option_text, price)

        # Extract and save HTML for each passport type
        filename = f"{product_types_directory}/{title}_{dropdown_option_text}.html"
        save_html(driver, filename)


### Extracts title and description for each passport from HTML files, creates a database table, and inserts the data.

In [12]:

# Extracts title and description for each passport from HTML files, creates a table, and inserts data into a database.
def extract_title(html_root):
    """Return the product title with runs of non-word characters replaced by '_'.

    The title is the text of the first element matching
    'single-post-title product_title entry-title'. When no element matches,
    the placeholder "N/A" is used, which the same substitution turns into "N_A".
    """
    matches = html_root.find_class('single-post-title product_title entry-title')
    if matches:
        raw_title = matches[0].text_content()
    else:
        raw_title = "N/A"
    return re.sub(r'\W+', '_', raw_title)

def extract_description(html_root):
    """Return the cleaned text of the first element with id "tab-description".

    Leading and trailing whitespace is stripped, and each stretch of blank or
    whitespace-only lines is collapsed into a single newline. Returns None when
    the page has no such element.
    """
    matches = html_root.xpath('(//*[@id="tab-description"])[1]')
    if not matches:
        return None

    stripped_text = matches[0].text_content().strip()
    return re.sub(r'\n\s*\n', '\n', stripped_text)


# Make sure the descriptions table exists
create_table_passport_description(connection)

# Directory holding the product pages saved by the previous cell
directory_name = 'datasets/dataset_2/products_html'
create_or_check_directory(directory_name)

# Parse every HTML file found in that directory
html_files_data = process_html_files(directory_name)

# One row per parsed product page: sanitized title plus cleaned description
for _, html_root in html_files_data.items():
    insert_table_passport_description(
        connection,
        extract_title(html_root),
        extract_description(html_root),
    )


### Adds a product to the cart, saves HTML after adding to the cart, clicks "View Cart," and saves HTML for the cart view.

In [13]:
# Save directory for checkout data HTML
directory_name = 'datasets/dataset_3/checkout_html'
create_or_check_directory(directory_name)

# NOTE: this cell does not navigate anywhere; it acts on whatever product page
# the earlier scraping cell left open in `driver`.

# Wait until the "Add to Cart" button is clickable (not merely present in the
# DOM) before clicking it, matching the wait used for the "View cart" link below.
add_to_cart_button = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.XPATH, '//*[@class="single_add_to_cart_button button alt wp-element-button"]'))
)
add_to_cart_button.click()

# Save HTML after adding to cart
filename = f"{directory_name}/add_to_cart.html"
save_html(driver, filename)

# Locate and click the "View Cart" link
view_cart_link = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.XPATH, '//a[@class="button wc-forward wp-element-button" and contains(text(), "View cart")]'))
)
view_cart_link.click()

# Save HTML for the cart view
filename = f"{directory_name}/bag.html"
save_html(driver, filename)



### Extracts cart total, creates a database table, inserts data, performs checkout, and saves HTML after checkout.

In [14]:

# Wait for the cart totals to be present
wait = WebDriverWait(driver, 10)
cart_totals_element = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'cart_totals')))

# Extract information using the driver
title_element = cart_totals_element.find_element(By.TAG_NAME, 'h2')
title = title_element.text.strip()

subtotals = cart_totals_element.find_elements(By.CLASS_NAME, 'cart-subtotal')
shipping_info = cart_totals_element.find_element(By.CLASS_NAME, 'woocommerce-shipping-totals')
total = cart_totals_element.find_element(By.CLASS_NAME, 'order-total')

# Keep the price of the last subtotal row. Start from a placeholder so an empty
# `subtotals` list cannot leave `price` undefined, or silently reuse the value
# an earlier cell left in the same variable.
price = 'N/A'
for subtotal in subtotals:
    price = subtotal.find_element(By.TAG_NAME, 'td').text.strip()

shipping_subtitle = shipping_info.find_element(By.TAG_NAME, 'th').text.strip()
shipping_details = shipping_info.find_element(By.TAG_NAME, 'td').text.strip()

total_subtitle = total.find_element(By.TAG_NAME, 'th').text.strip()
total_price = total.find_element(By.TAG_NAME, 'td').text.strip()

# Remove "CHANGE ADDRESS" from the stored text
# NOTE(review): this is applied to the order-total header, not to
# shipping_details — confirm which field actually contains that text.
total_subtitle = total_subtitle.replace("CHANGE ADDRESS", "")

# Create and insert data into the table
create_table_shipping_details(connection)
insert_table_shipping_details(connection, price, shipping_details, total_subtitle, total_price)

# Checkout
check_out_button = driver.find_element(By.XPATH, '//*[@class="wc-proceed-to-checkout"]')
check_out_button.click()
time.sleep(20)

# Save HTML after checkout (`directory_name` still holds the checkout directory
# set by the add-to-cart cell)
filename = f"{directory_name}/checkout.html"
save_html(driver, filename)

### Creates a database table and extracts input fields from a checkout HTML file, saving details and inserting data into the database.

In [15]:
# Creates a table in the database, extracts the input fields of the saved
# checkout HTML, and inserts one row per field.
create_table_billing_form_details(connection)

# Parse the checkout page saved by the previous cell (`directory_name` is
# expected to still point at the checkout directory)
html_root = parse_single_html_file(directory_name, "checkout.html")

# All <input> and <select> fields within both columns of the "customer_details" div
billing_fields = html_root.xpath('//div[@id="customer_details"]//div[contains(@class, "col-1") or contains(@class, "col-2")]//input | //div[@id="customer_details"]//div[contains(@class, "col-1") or contains(@class, "col-2")]//select')

# The additional-information <textarea> in the second column
additional_info_fields = html_root.xpath('//div[@id="customer_details"]//div[@class="col-2"]//p[@class="form-row notes"]//textarea')

input_fields = billing_fields + additional_info_fields

# Insert data into the billing details table
for field in input_fields:
    field_type = field.tag
    field_name = field.get('name', '')
    placeholder = field.get('placeholder', '')

    # True when the field's own class attribute contains the text "required".
    # NOTE(review): this inspects the class, not a `required` attribute —
    # confirm that the page marks required fields this way.
    is_required = 'required' in field.get('class', '')

    # Insert into the database
    insert_table_billing_form_details(connection, field_type, field_name, placeholder, is_required)


### Extracts countries to which the website can ship from the checkout HTML, to be iterated later for client code extraction.

In [16]:

# Create the shipping country table
create_table_shipping_country(connection)

# Find the checkout form in the parsed checkout HTML
checkout_form = html_root.find('.//form[@class="checkout woocommerce-checkout"]')
if checkout_form is None:
    # find() returns None on no match; fail with a clear message instead of an
    # AttributeError on the next line.
    raise ValueError("Checkout form not found in the parsed checkout HTML.")

# Find the country dropdown field
country_dropdown = checkout_form.find('.//select[@id="billing_country"]')
if country_dropdown is None:
    raise ValueError("Country dropdown 'billing_country' not found in the checkout form.")

# Get all options within the dropdown
options = country_dropdown.findall('.//option')

# Read the "Free shipping" label text from the live page (not the saved HTML)
# without clicking it
free_shipping_label = driver.find_element(By.XPATH, '//label[@for="shipping_method_0_free_shipping1"]').text

# Store every option's text together with the shipping label
for option in options:
    # Get the text content of each option
    option_text = option.text_content()

    # Insert the shipping country into the table
    insert_table_shipping_country(connection, option_text, free_shipping_label)

### Fills in the billing form with provided information to place an order on the website and proceeds to payment options.

In [17]:
# Directory that will receive the payment-option pages
directory_name = "datasets/dataset_3/payment_options_html"
create_or_check_directory(directory_name)

# Wait for the billing form container on the checkout page
billing_form = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'customer_details'))
)

# Name fields come first
for field_name, value in (
    ('billing_first_name', "Ever"),
    ('billing_last_name', "Green"),
):
    billing_form.find_element(By.NAME, field_name).send_keys(value)

# Country is chosen before the address fields are typed
country_dropdown = Select(billing_form.find_element(By.NAME, 'billing_country'))
country_dropdown.select_by_visible_text("United States (US)")

for field_name, value in (
    ('billing_address_1', "Evergreen Avenue"),
    ('billing_postcode', "35004"),
    ('billing_city', "Moody"),
):
    billing_form.find_element(By.NAME, field_name).send_keys(value)

# Wait (scoped to the form) until the state dropdown is clickable, then pick
# the state by its visible text
state_dropdown = Select(
    WebDriverWait(billing_form, 10).until(
        EC.element_to_be_clickable((By.NAME, 'billing_state'))
    )
)
state_dropdown.select_by_visible_text("Alabama")

for field_name, value in (
    ('billing_phone', "54465465"),
    ('billing_email', "tzhhu@dfg.com"),
):
    billing_form.find_element(By.NAME, field_name).send_keys(value)

# Fixed pause, then wait for the submit button to become clickable
time.sleep(20)
payment_option_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '//button[@class="button alt wp-element-button"]'))
)

# Proceed to the payment options
payment_option_button.click()


### Iterates through cryptocurrencies, extracts payment information, saves HTML content, and inserts data into the database.

In [18]:
# Waits for the page to load
time.sleep(15)

cryptocurrencies = [
    {"name": "Bitcoin", "symbol": "BTC", "data_attribute": "btc"},
    {"name": "Litecoin", "symbol": "LTC", "data_attribute": "ltc"},
    {"name": "Bitcoin Cash", "symbol": "BCH", "data_attribute": "bch"}
]

# Creates the payment-info table before iterating through each cryptocurrency
create_table_payment_info(connection)

# The output directory does not change between iterations, so it is set once.
# No trailing slash: the f-string below adds the separator, so paths no longer
# contain '//'.
directory_name = 'datasets/dataset_3/payment_options_html'

for crypto_info in cryptocurrencies:
    crypto_name, crypto_symbol, crypto_attribute = crypto_info["name"], crypto_info["symbol"], crypto_info["data_attribute"]

    # Waits for a short duration
    time.sleep(5)

    # Finds and clicks the cryptocurrency button
    crypto_button = driver.find_element(By.CSS_SELECTOR, f'div[data-cryptocurrency="{crypto_attribute}"]')
    crypto_button.click()

    # Waits for the page to load
    time.sleep(10)

    # Extracts cryptocurrency address and total amount
    address_element = driver.find_element(By.CSS_SELECTOR, '.bxc-pay-address .bxc-title')
    crypto_address = address_element.text.strip()

    total_amount_element = driver.find_element(By.CSS_SELECTOR, '.bxc-pay-amount .bxc-title')
    crypto_total_amount = total_amount_element.text.strip()

    # Saves HTML content to file
    filename = f"{directory_name}/{crypto_name.replace(' ', '_')}.html"
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(driver.page_source)

    # Split once. The first two whitespace-separated tokens form the stored
    # amount; 'N/A' is stored when there are fewer than two tokens.
    amount_parts = crypto_total_amount.split()
    crypto_amount = f"{amount_parts[0]} {amount_parts[1]}" if len(amount_parts) >= 2 else 'N/A'

    # Inserts data into the database
    payment_info = {
        'Summary': f"{crypto_name} ( {crypto_symbol} )",
        'CryptoAmount': crypto_amount,
        'Address': crypto_address
    }
    insert_table_payment_info(connection, payment_info)

    # Finds and clicks the "Back" button
    back_button = driver.find_element(By.CSS_SELECTOR, '.bxc-btn.bxc-btn-border.bxc-back')
    back_button.click()

    time.sleep(5)

    # Finds and clicks the "Yes, I'm sure" button
    yes_sure_button = driver.find_element(By.ID, 'bxc-confirm-cancel')
    yes_sure_button.click()


In [19]:
# Quits the WebDriver, closing the browser.
# NOTE(review): `connection` from initialize_database() is not closed (or explicitly
# committed) in any cell shown here — confirm whether the db helpers take care of that.
driver.quit()
