In [1]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Specify the path to your installed chromedriver executable
CHROMEDRIVER_PATH = '/Users/marclambertes/Python/chromedriver'
MAX_OPTA_PAGE_NUM = 30
OPTA_URL = 'https://dataviz.theanalyst.com/opta-power-rankings/'

def scrape_opta_club_rankings():
    chrome_options = webdriver.ChromeOptions()    
    options = [
        '--headless',
    ]

    for option in options:
        chrome_options.add_argument(option)

    # Use the directly installed Chromedriver
    chrome_options.add_argument(f"webdriver.chrome.driver={CHROMEDRIVER_PATH}")
    driver = webdriver.Chrome(options=chrome_options)

    driver.get(OPTA_URL)
    time.sleep(3)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table')
    headers = [th.text.strip() for th in table.find_all('th')]
    headers = headers + ['id']

    rows = []
    page_num = 1

    while page_num <= MAX_OPTA_PAGE_NUM:
        print(f'Scraping page {page_num}')
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.find('table')

        for tr in table.find_all('tr'):
            row = [td.text.strip() for td in tr.find_all('td')]
            img = tr.select_one('img')
            if img is None:
                img_id = ''
            else:
                img_id = img['src'].split('&id=')[-1]
            row.append(img_id)
            if row:
                rows.append(row)
        
        if page_num < MAX_OPTA_PAGE_NUM:
            buttons = driver.find_elements(by=By.CSS_SELECTOR, value='button')
            last_button = buttons[-1]
            last_button.click()
            
        time.sleep(2)
        page_num += 1

    print('Done scraping Opta club rankings.')
    driver.quit()
    time.sleep(1)
    
    df = pd.DataFrame(rows, columns=headers)
    df.dropna(subset=['team'], inplace=True)

    # Save data to Excel file
    excel_filename = 'opta_club_rankings_04122024.xlsx'
    df.to_excel(excel_filename, index=False)
    print(f'Data saved to {excel_filename}')

if __name__ == "__main__":
    scrape_opta_club_rankings()


Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
Scraping page 21
Scraping page 22
Scraping page 23
Scraping page 24
Scraping page 25
Scraping page 26
Scraping page 27
Scraping page 28
Scraping page 29
Scraping page 30
Done scraping Opta club rankings.
Data saved to opta_club_rankings_04122024.xlsx


In [2]:
import pandas as pd

# Step 1: Load the Excel file into a DataFrame
df = pd.read_excel('opta_club_rankings_21092024.xlsx')

# Step 2: Sort the DataFrame by Rating in descending order (highest ratings first)
df_sorted = df.sort_values(by='rating', ascending=False)

# Step 3: Create a new column 'Tier' with 10 tiers, where the highest ratings are in Tier 1
df_sorted['Tier'] = pd.qcut(df_sorted['rating'], 10, labels=False)

# Invert the tiers so that 0 becomes 10, 1 becomes 9, ..., and 9 becomes 1
df_sorted['Tier'] = 10 - df_sorted['Tier']

# Optional: Sort the DataFrame by the Tier column if needed
df_sorted = df_sorted.sort_values(by='Tier')

# Step 4: Save the updated DataFrame back to an Excel file (optional)
df_sorted.to_excel('your_file_with_tiers.xlsx', index=False)

# Display the first few rows to check
print(df_sorted.head())


     rank               team  rating  ranking change 7 days  \
0       1    Manchester City   100.0                      0   
201   202  Raków Częstochowa    79.3                     20   
200   201         Moreirense    79.3                    -23   
199   200      Vasco da Gama    79.3                      4   
198   199            Levante    79.3                      7   

                            id  Tier  
0    a3nyxabgsqlnqfkeg41m6tnpp     1  
201  1sxaf8l7fknmucd72fdretogm     1  
200  4a3yqn3kt1l18oklr7zxo4f1s     1  
199  5ponlslulpugdlvd93n9yqu2b     1  
198  4grc9qgcvusllap8h5j6gc5h5     1  


In [3]:
import pandas as pd
from tabulate import tabulate

df = pd.read_excel('opta_club_rankings_21092024.xlsx')


# Assuming you have a DataFrame named df containing the team data

# Enter the team name
team_name = input("Enter the team name: ")

# Filter the DataFrame to get the data for the specified team
team_data = df[df['team'] == team_name]

# Get the rating and ranking of the input team
input_team_rating = team_data['rating'].values[0]
input_team_ranking = team_data['rank'].values[0]

# Print the rating and ranking for the input team
print("Input Team:")
print(f"Team: {team_name}")
print(f"Rating: {input_team_rating}")
print(f"Rank: {input_team_ranking}")
print()

# Calculate the absolute difference between the input team's rating and the ratings of all other teams
df['rating_difference'] = abs(df['rating'] - input_team_rating)

# Sort the DataFrame by the rating difference in ascending order
sorted_teams = df.sort_values('rating_difference')

# Get the 15 teams closest to the input team based on ranking
closest_teams = sorted_teams.iloc[1:16]

# Sort the closest_teams DataFrame based on the "ranking" column
closest_teams_sorted = closest_teams.sort_values('rank')

# Create a new DataFrame with the closest teams, including their rating and ranking
team_data = closest_teams_sorted[['team', 'rating', 'rank']]

# Convert the DataFrame to a table format
table = tabulate(team_data, headers='keys', tablefmt='presto')

# Print the table
print(table)

Enter the team name: Ipswich Town
Input Team:
Team: Ipswich Town
Rating: 80.6
Rank: 152

     | team           |   rating |   rank
-----+----------------+----------+--------
 144 | Leicester City |     80.7 |    145
 145 | Cincinnati     |     80.7 |    146
 146 | Lech Poznań    |     80.7 |    147
 147 | Nordsjælland   |     80.6 |    148
 148 | Fluminense     |     80.6 |    149
 149 | Leeds United   |     80.6 |    150
 150 | Houston Dynamo |     80.6 |    151
 152 | Racing Club    |     80.5 |    153
 153 | Molde          |     80.5 |    154
 154 | Brøndby        |     80.5 |    155
 155 | Ludogorets     |     80.5 |    156
 156 | Orlando City   |     80.5 |    157
 157 | Monterrey      |     80.5 |    158
 158 | Malmö FF       |     80.4 |    159
 159 | Boca Juniors   |     80.4 |    160


In [4]:
## Enter the first team name
team1_name = input("Enter the first team name: ")

# Filter the DataFrame to get the data for the first team
team1_data = df[df['team'] == team1_name]

# Get the rating and ranking of the first team
team1_rating = team1_data['rating'].values[0]
team1_ranking = team1_data['rank'].values[0]

# Enter the second team name
team2_name = input("Enter the second team name: ")

# Filter the DataFrame to get the data for the second team
team2_data = df[df['team'] == team2_name]

# Get the rating and ranking of the second team
team2_rating = team2_data['rating'].values[0]
team2_ranking = team2_data['rank'].values[0]

# Calculate the difference in ranking between the two teams
ranking_difference = abs(team1_ranking - team2_ranking)

# Print the information for both teams
print("Team 1:")
print(f"Team: {team1_name}")
print(f"Rating: {team1_rating}")
print(f"Ranking: {team1_ranking}")
print()

print("Team 2:")
print(f"Team: {team2_name}")
print(f"Rating: {team2_rating}")
print(f"Ranking: {team2_ranking}")
print()

# Determine the relationship between the two teams based on ranking
if team1_ranking < team2_ranking:
    relationship = f"{team1_name} is ranked higher than {team2_name}"
elif team1_ranking > team2_ranking:
    relationship = f"{team1_name} is ranked lower than {team2_name}"
else:
    relationship = f"{team1_name} and {team2_name} are ranked equally"

# Print the relationship between the teams
print(f"Ranking Relationship: {relationship}")
print(f"Difference in Ranking: {ranking_difference} place(s)")

Enter the first team name: Wycombe Wanderers
Enter the second team name: Mladá Boleslav
Team 1:
Team: Wycombe Wanderers
Rating: 69.3
Ranking: 970

Team 2:
Team: Mladá Boleslav
Rating: 73.3
Ranking: 533

Ranking Relationship: Wycombe Wanderers is ranked lower than Mladá Boleslav
Difference in Ranking: 437 place(s)


In [None]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Specify the path to your installed chromedriver executable
CHROMEDRIVER_PATH = '/Users/marclambertes/Python/chromedriver'
MAX_OPTA_PAGE_NUM = 30
OPTA_URL = 'https://dataviz.theanalyst.com/opta-power-rankings/'

def scrape_opta_club_rankings():
    chrome_options = webdriver.ChromeOptions()
    
    # Remove '--headless' to allow viewing the browser for debugging
    # chrome_options.add_argument("--headless")  # Uncomment if you want headless mode again
    
    # Initialize the Chrome driver
    chrome_options.add_argument(f"webdriver.chrome.driver={CHROMEDRIVER_PATH}")
    driver = webdriver.Chrome(options=chrome_options)

    # Load the URL
    driver.get(OPTA_URL)

    try:
        # Use JavaScript to click the WOMENS tab if normal click doesn't work
        womens_tab = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'WOMENS')]"))
        )
        
        # Scroll to the WOMENS tab to ensure it's visible and ready to be clicked
        driver.execute_script("arguments[0].scrollIntoView(true);", womens_tab)
        time.sleep(1)  # Wait for scrolling to finish

        # Use JavaScript to click the button
        driver.execute_script("arguments[0].click();", womens_tab)
        print('Successfully switched to WOMENS rankings.')
        
        # Wait for the table to load after switching to WOMENS
        time.sleep(3)  # Adjust this delay as necessary depending on load time

        # Wait for the table to be fully loaded and visible
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'table'))
        )
    
    except Exception as e:
        print(f"Error occurred while trying to switch to WOMENS: {e}")
        driver.quit()
        return

    # Parse the page content after clicking the WOMENS tab
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table')
    
    # Ensure the table exists and extract headers
    if not table:
        print("Could not find the table on the page.")
        driver.quit()
        return
    
    # Extract headers from the table
    headers = [th.text.strip() for th in table.find_all('th')]
    headers.append('id')  # Add an id column for image id or other unique data if needed

    rows = []
    page_num = 1

    while page_num <= MAX_OPTA_PAGE_NUM:
        print(f'Scraping page {page_num}')
        
        # Re-parse the page content on each iteration in case it changes
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.find('table')

        # Extract rows of data from the table
        for tr in table.find_all('tr'):
            row = [td.text.strip() for td in tr.find_all('td')]
            img = tr.select_one('img')
            img_id = img['src'].split('&id=')[-1] if img else ''  # Extract image ID if exists
            row.append(img_id)
            if row:
                rows.append(row)
        
        # Check if there's another page to click through
        if page_num < MAX_OPTA_PAGE_NUM:
            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'next-button-class')]"))
                )
                next_button.click()  # Click the next page button
                time.sleep(2)  # Wait for the next page to load
            except:
                print(f"No more pages after page {page_num}")
                break
            
        page_num += 1

    print('Done scraping WOMENS club rankings.')
    driver.quit()
    time.sleep(1)
    
    # Save the data to a pandas DataFrame
    df = pd.DataFrame(rows, columns=headers)
    df.dropna(subset=['team'], inplace=True)

    # Save the DataFrame to an Excel file
    excel_filename = 'opta_club_rankings_womens_03102024.xlsx'
    df.to_excel(excel_filename, index=False)
    print(f'Data saved to {excel_filename}')

if __name__ == "__main__":
    scrape_opta_club_rankings()
