In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymongo
from selenium.webdriver.support.ui import WebDriverWait, Select
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import time

In [2]:
# Load database credentials from json file and open the MongoDB connection.
# quote_plus is imported here because only this cell needs it.
from urllib.parse import quote_plus

with open('dbconfig.json') as config_file:
    config = json.load(config_file)

username = config['username']
password = config['password']
db_url = config['db_url']

# database string
# The credentials are percent-escaped so that reserved URI characters
# ('@', ':', '/', '%', ...) in the username or password do not break parsing
# of the connection string. dbconfig.json is expected to hold the raw,
# unescaped values.
CNX_STR = (
    f"mongodb+srv://{quote_plus(str(username))}:{quote_plus(str(password))}"
    f"@{db_url}/?connectTimeoutMS=50000"
)
client = pymongo.MongoClient(CNX_STR)

# Bind `db` unconditionally so later cells never hit a NameError when the
# database is missing; the existence check below is purely informational.
db = client.manage2sail

# Check if the database exists
if "manage2sail" in client.list_database_names():
    print("Database exists!")
else:
    print("Database does not exist!")

Database exists!


In [3]:
def scrape_eventdetails(wait, driver):
    """Read the event details table of the currently loaded page into a dict.

    Args:
        wait: WebDriverWait used to block until the details table is visible.
        driver: Selenium WebDriver with the event page already loaded.

    Returns:
        dict mapping each row's cleaned first-cell text to its cleaned
        second-cell text. Rows with fewer than two <td> cells are skipped.
    """
    table_css = '#details > table'
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, table_css)))
    details_table = driver.find_element(By.CSS_SELECTOR, table_css)

    details = {}
    for tr in details_table.find_elements(By.TAG_NAME, 'tr'):
        cells = tr.find_elements(By.TAG_NAME, 'td')
        if len(cells) < 2:
            continue  # not a label/value row

        # Label: drop surrounding colons/spaces, then collapse inner whitespace.
        label = ' '.join(cells[0].text.strip(': ').split())
        # Value: collapse whitespace, then remove the calendar link caption.
        text = ' '.join(cells[1].text.split())
        details[label] = text.replace('Add to calendar', '').strip()

    return details


In [4]:
def scrape_resultdetails(wait, driver):
    """Parse the scoring-info box of the results view into a dict.

    Args:
        wait: WebDriverWait used to block until the scoring-info box is visible.
        driver: Selenium WebDriver showing the results view.

    Returns:
        dict built from every "key: value" line of the box's text (lines
        without a colon are ignored), or the string "No result details" if
        anything goes wrong while locating or reading the element.
    """
    scoring_css = '#results > div > div > div:nth-child(3) > div:nth-child(5) > div:nth-child(2) > div:nth-child(3) > div.resultInfo > div.pull-left.scoring-info'
    try:
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, scoring_css)))
        info_text = driver.find_element(By.CSS_SELECTOR, scoring_css).text

        parsed = {}
        for raw_line in info_text.split('\n'):
            # Split on the first colon only, so values may contain colons.
            label, colon, remainder = raw_line.partition(':')
            if colon:
                parsed[' '.join(label.split())] = ' '.join(remainder.split())
        return parsed
    except Exception:
        # Best-effort: a missing/unreadable box is reported as a marker string.
        return "No result details"



In [5]:
def scrape_publishdetails(wait, driver):
    """Parse the publishing-info box of the results view into a dict.

    Args:
        wait: WebDriverWait used to block until the publishing-info box is visible.
        driver: Selenium WebDriver showing the results view.

    Returns:
        dict with one entry per "key: value" line of the box's text; any line
        without a colon is stored (cleaned) under the key 'Type', a later such
        line overwriting an earlier one. Returns the string
        "No publish details found" if locating or reading the element fails.
    """
    publishing_css = '#results > div > div > div:nth-child(3) > div:nth-child(5) > div:nth-child(2) > div:nth-child(3) > div.resultInfo > div.pull-right.publishing-info'
    try:
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, publishing_css)))
        info_text = driver.find_element(By.CSS_SELECTOR, publishing_css).text

        parsed = {}
        for raw_line in info_text.split('\n'):
            # Split on the first colon only, so values may contain colons.
            label, colon, remainder = raw_line.partition(':')
            if colon:
                parsed[' '.join(label.split())] = ' '.join(remainder.split())
            else:
                parsed['Type'] = ' '.join(raw_line.split())
        return parsed
    except Exception:
        # Best-effort: a missing/unreadable box is reported as a marker string.
        return "No publish details found"



In [6]:
def get_table_headers(driver, wait, header_selector):
    """Return the text of the <th> cells under the header element.

    Args:
        driver: unused here; kept so the signature stays unchanged.
        wait: WebDriverWait used to block until the header element is visible.
        header_selector: CSS selector of the element containing the <th> cells.

    Returns:
        list of header texts, skipping cells whose class attribute contains
        'ng-hide'. An empty list is returned if any error occurs.
    """
    try:
        thead = wait.until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, header_selector))
        )
        visible_headers = []
        for th in thead.find_elements(By.TAG_NAME, 'th'):
            if 'ng-hide' not in th.get_attribute('class'):
                visible_headers.append(th.text)
        return visible_headers
    except Exception:
        # Errors are deliberately swallowed; callers treat [] as "no headers".
        return []



In [7]:
def replace_column_keys(data, headers):
    """Re-key positional row dicts with the given header names.

    Args:
        data: iterable of dicts whose values are strings, in column order.
        headers: sequence of header names, paired positionally with each
            record's values.

    Returns:
        A new list with one dict per record. Each value has runs of
        whitespace (including line breaks) collapsed to single spaces.
        Pairing stops at the shorter of record/headers, and also right after
        a header that equals "N" once stripped (that column is included).
    """
    renamed_rows = []
    for row in data:
        renamed = {}
        for raw_value, column_name in zip(row.values(), headers):
            renamed[column_name] = ' '.join(raw_value.split())
            if column_name.strip() == "N":
                break  # everything after the "N" column is dropped
        renamed_rows.append(renamed)
    return renamed_rows



In [8]:
def _scrape_displayed_results(driver, wait, table_selector, header_selector):
    """Scrape the results table currently shown, plus its scoring/publishing info.

    Returns a dict with the keys 'results', 'resultdetails' and
    'publishdetails'. Raises if the table does not become visible or if no
    headers could be read.
    """
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, table_selector)))
    table = driver.find_element(By.CSS_SELECTOR, table_selector)

    headers = get_table_headers(driver, wait, header_selector)
    if not headers:
        raise Exception("Header length is zero")

    row_data_list = []
    for row in table.find_elements(By.TAG_NAME, 'tr'):
        columns = row.find_elements(By.TAG_NAME, 'td')
        # get_attribute() may return None when the attribute is absent;
        # treat that as "no classes" instead of failing the whole table.
        row_data = {
            f"column_{i}": col.text
            for i, col in enumerate(columns)
            if 'ng-hide' not in (col.get_attribute('class') or '')
        }
        # Skip rows without any visible text (e.g. the header row has no <td>).
        if any(value.strip() for value in row_data.values()):
            row_data_list.append(row_data)

    return {
        'results': replace_column_keys(row_data_list, headers),
        'resultdetails': scrape_resultdetails(wait, driver),
        'publishdetails': scrape_publishdetails(wait, driver),
    }


def scrape_eventresults(base_url, driver, wait):
    """Scrape the results of an event, one entry per class.

    Opens ``base_url + '#!/results'``. If a class dropdown is present, every
    non-empty option is selected in turn and its table scraped; otherwise the
    single displayed table is scraped and keyed by the regatta name.

    Args:
        base_url: URL of the event page.
        driver: Selenium WebDriver used for navigation and lookups.
        wait: WebDriverWait bound to ``driver``.

    Returns:
        dict mapping the option text (or the regatta name) to either a dict
        with the keys 'results', 'resultdetails' and 'publishdetails', or the
        string "no results" when scraping that entry failed.

    Raises:
        Exception: if the dropdown exists but has no non-empty options.
        selenium exceptions: if, without a dropdown, the regatta name element
            cannot be found. Previously this surfaced as an UnboundLocalError
            raised from the except handler, hiding the real cause.
    """
    results = {}
    # Define result URL and open it with the WebDriver
    results_url = base_url + '#!/results'
    driver.get(results_url)

    # Define CSS selectors
    regatta_name_selector = '#results > div > div > div.regattaName'
    table_selector = '#results > div > div > div:nth-child(3) > div:nth-child(5) > div:nth-child(2) > div:nth-child(3) > table:nth-child(4)'
    header_selector = table_selector + ' > thead'
    dropdown_selector = '#results > div > div > select'

    # Check if the dropdown exists
    dropdown_elements = driver.find_elements(By.CSS_SELECTOR, dropdown_selector)

    if dropdown_elements:
        # Dropdown exists, process each option
        select = Select(dropdown_elements[0])
        option_texts = [option.text for option in select.options if option.text]

        if not option_texts:
            raise Exception("No options found in dropdown")

        for option_text in option_texts:
            try:
                # Re-locate the dropdown on every iteration rather than
                # reusing the element found before the previous selection.
                select = Select(wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, dropdown_selector))
                ))
                select.select_by_visible_text(option_text)
                results[option_text] = _scrape_displayed_results(
                    driver, wait, table_selector, header_selector
                )
            except Exception:
                results[option_text] = "no results"
    else:
        # No dropdown, scrape only the current page.
        # The name lookup stays outside the try block: the except handler
        # needs regatta_name as its key, so it must already be bound.
        regatta_name = driver.find_element(By.CSS_SELECTOR, regatta_name_selector).text
        try:
            results[regatta_name] = _scrape_displayed_results(
                driver, wait, table_selector, header_selector
            )
        except Exception:
            results[regatta_name] = "no results"

    return results



In [9]:
def process_document(document):
    """Scrape event details and results for one event document.

    A fresh Chrome WebDriver is started for the call and always quit
    afterwards.

    Args:
        document: mapping with at least the keys 'link' (event URL) and '_id'.

    Returns:
        {'_id', 'eventdetails', 'resultsByClass'} on success, or
        {'_id', 'error'} carrying the exception text if scraping raised.
    """
    event_url = document['link']
    chrome_options = Options()
    chrome_service = Service('/usr/local/bin/chromedriver')
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
    wait = WebDriverWait(driver, 5)

    try:
        driver.get(event_url)
        details = scrape_eventdetails(wait, driver)
        results_by_class = scrape_eventresults(event_url, driver, wait)
        return {
            '_id': document['_id'],
            'eventdetails': details,
            'resultsByClass': results_by_class,
        }
    except Exception as exc:
        return {'_id': document['_id'], 'error': str(exc)}
    finally:
        # Always release the browser, whether scraping succeeded or not.
        driver.quit()

In [10]:
def process_document_with_retry(document, max_retries=3):
    """Run process_document, retrying failed attempts up to max_retries times.

    process_document reports scraping failures by returning a dict with an
    'error' key instead of raising, so such a result is treated as a failed
    attempt here as well; otherwise those failures would never be retried.

    Args:
        document: mapping passed through to process_document; its '_id' is
            used in log messages and in the failure result.
        max_retries: maximum number of attempts.

    Returns:
        The first result without an 'error' key, or
        {'_id': ..., 'error': 'Max retries exceeded ...'} (including the last
        error seen, if any) once all attempts have failed.
    """
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            result = process_document(document)
            if 'error' not in result:
                return result
            last_error = result['error']
        except Exception as e:
            last_error = e

        print(f"Error processing document {document['_id']}: {last_error}. Retrying {attempt}/{max_retries}...")
        if attempt < max_retries:
            time.sleep(1)  # Wait a bit before retrying; pointless after the last attempt

    message = 'Max retries exceeded'
    if last_error is not None:
        message = f"{message} (last error: {last_error})"
    return {'_id': document['_id'], 'error': message}

if __name__ == "__main__":
    # Each worker creates its own WebDriver inside process_document, so no
    # shared Options/Service objects are needed here.
    collection = db.events
    documents = list(collection.find())

    # Use ThreadPoolExecutor to run tasks concurrently
    max_workers = 25
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_document_with_retry, document) for document in documents]
        # Write each result back as soon as its scrape finishes.
        for future in as_completed(futures):
            result = future.result()
            if 'error' in result:
                print(f"An error occurred for document {result['_id']}: {result['error']}")
            else:
                collection.update_one(
                    {'_id': result['_id']},
                    {'$set': {'eventdetails': result['eventdetails'], 'resultsByClass': result['resultsByClass']}}
                )
                print(f"Document {result['_id']} updated successfully.")


Document 6679e81d6454fb6fd4724d57 updated successfully.
Document 6679e81d6454fb6fd4724ba6 updated successfully.
Document 6679e81d6454fb6fd4724b7e updated successfully.
Document 6679e81d6454fb6fd4724d5d updated successfully.
Document 6679e81d6454fb6fd4724b8c updated successfully.
Document 6679e81d6454fb6fd4724bd6 updated successfully.
Document 6679e81d6454fb6fd4724b81 updated successfully.
Document 6679e81d6454fb6fd4724d58 updated successfully.
Document 6679e81d6454fb6fd4724b89 updated successfully.
Document 6679e81d6454fb6fd4724bb1 updated successfully.
Document 6679e81d6454fb6fd4724b7b updated successfully.
Document 6679e81d6454fb6fd4724d3d updated successfully.
Document 6679e81d6454fb6fd4724d47 updated successfully.
Document 6679e81d6454fb6fd4724d55 updated successfully.
Document 6679e81d6454fb6fd4724d3f updated successfully.
Document 6679e81d6454fb6fd4724d5e updated successfully.
Document 6679e81d6454fb6fd4724b80 updated successfully.
Document 6679e81d6454fb6fd4724c49 updated succes