In [1]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Filesystem path to the locally installed chromedriver executable
# (machine-specific: edit this to match your own installation).
CHROMEDRIVER_PATH = '/Users/marclambertes/Python/chromedriver'
# URL of the rankings page that scrape_opta_club_rankings() loads in the browser.
OPTA_URL = 'https://dataviz.theanalyst.com/opta-power-rankings/'

def _extract_table_rows(table):
    """Return one list per data row of *table*: stripped cell texts plus an image id.

    Rows that contain no ``<td>`` cell (such as a header row made only of
    ``<th>`` cells) are skipped. The image id is whatever follows the last
    ``&id=`` in the ``src`` of the row's first ``<img>``; it is an empty
    string when the row has no image.
    """
    extracted = []
    for tr in table.find_all('tr'):
        cells = [td.text.strip() for td in tr.find_all('td')]
        if not cells:
            # Must be checked before the id is appended, otherwise the
            # list is never empty and header rows slip through.
            continue
        img = tr.select_one('img')
        src = img.get('src', '') if img is not None else ''
        cells.append(src.split('&id=')[-1])
        extracted.append(cells)
    return extracted


def scrape_opta_club_rankings(
    max_pages=23,
    excel_filename='opta_club_rankings_womens_23112024.xlsx',
    headless=False,
):
    """Scrape the WOMENS club rankings table from OPTA_URL into an Excel file.

    Opens the page in Chrome, clicks the "womens" tab, then walks through the
    paginated table by clicking the ">" button, collecting every data row.
    The collected rows are written to *excel_filename*.

    Args:
        max_pages: Maximum number of table pages to scrape.
        excel_filename: Path of the Excel file to write.
        headless: When true, start Chrome with ``--headless``.

    Returns:
        None. Progress and errors are reported with ``print``. Nothing is
        saved when the WOMENS tab cannot be opened or no table is found.
    """
    # Local imports so this block brings its own new dependencies into scope.
    import os
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.chrome.service import Service

    chrome_options = webdriver.ChromeOptions()
    if headless:
        chrome_options.add_argument("--headless")

    # The driver location belongs on the Service; passing it as a Chrome
    # command-line argument does not select the driver binary. If the
    # configured file is missing, let Selenium locate a driver itself.
    if os.path.isfile(CHROMEDRIVER_PATH):
        driver = webdriver.Chrome(
            service=Service(executable_path=CHROMEDRIVER_PATH),
            options=chrome_options,
        )
    else:
        driver = webdriver.Chrome(options=chrome_options)

    headers = None
    rows = []
    try:
        driver.get(OPTA_URL)

        # Give the page time to render before looking for the tab
        time.sleep(5)

        try:
            # Locate the "WOMENS" div by its text content
            womens_tab = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.XPATH, "//div[text()='womens']"))
            )

            # Scroll the tab into view so the click is not intercepted
            driver.execute_script("arguments[0].scrollIntoView(true);", womens_tab)
            time.sleep(1)  # Wait for scrolling to finish

            womens_tab.click()
            print('Successfully switched to WOMENS rankings.')

            # Allow time for the table to reload after switching tabs
            time.sleep(3)

            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, 'table'))
            )
        except WebDriverException as e:
            print(f"Error occurred while trying to switch to WOMENS: {e}")
            return

        for page_num in range(1, max_pages + 1):
            print(f'Scraping page {page_num}')

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table')
            if table is None:
                print(f"No table found on page {page_num}")
                break

            # Column names are taken once, from the first table seen
            if headers is None:
                headers = [th.text.strip() for th in table.find_all('th')]
                headers.append('id')  # Extra column for the image id

            rows.extend(_extract_table_rows(table))

            if page_num == max_pages:
                break

            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//button[text()='>']"))
                )
                next_button.click()
                time.sleep(3)  # Wait for the next page to load
            except WebDriverException:
                # Timeout or failed click: treat as the end of pagination
                print(f"No more pages after page {page_num}")
                break

        print('Done scraping WOMENS club rankings.')
    finally:
        # Always close the browser, including on early return or error
        driver.quit()

    time.sleep(1)  # Short pause kept from the original flow after quitting

    if headers is None:
        print('No table data was scraped; nothing saved.')
        return

    df = pd.DataFrame(rows, columns=headers)
    if 'team' in df.columns:
        df.dropna(subset=['team'], inplace=True)

    df.to_excel(excel_filename, index=False)
    print(f'Data saved to {excel_filename}')

# Run the scraper only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    scrape_opta_club_rankings()


Successfully switched to WOMENS rankings.
Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
Scraping page 21
Scraping page 22
Scraping page 23
Done scraping WOMENS club rankings.
Data saved to opta_club_rankings_womens_23112024.xlsx
