# Scraping Interactive Match Links From Main Page

In [1]:
import os
import re
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Set up the WebDriver.
# The ChromeDriver location is machine-specific, so it can be overridden with the
# CHROMEDRIVER_PATH environment variable; when that variable is unset the original
# Windows path below is used unchanged.
driver_path = os.environ.get(
    'CHROMEDRIVER_PATH',
    'C:/Users/99451/AppData/Local/Programs/Python/Python312/Lib/site-packages/selenium/webdriver/chromedriver-win64/chromedriver.exe',
)

# Initialize the driver. `service` is kept as a module-level name because the
# match-page cell further down reuses it to start a second browser session.
service = Service(driver_path)
driver = webdriver.Chrome(service=service)

# Open the OddsPortal results page for the Premier League
driver.get('https://www.oddsportal.com/football/england/premier-league/results/')

# Function to scroll the page and load more matches
def scroll_to_load_matches(pause=2, max_scrolls=None, browser=None):
    """Scroll to the bottom of the page until its height stops growing.

    Each pass scrolls the window to ``document.body.scrollHeight``, waits
    ``pause`` seconds, and then re-reads the height. The loop ends as soon as
    two consecutive readings are equal, or after ``max_scrolls`` passes.

    Args:
        pause: Seconds to sleep after each scroll so new content can load.
            Defaults to 2, the value the function originally hard-coded.
        max_scrolls: Optional upper bound on the number of scroll passes.
            ``None`` (the default) keeps the original unbounded behaviour;
            pass an integer to guarantee termination on pages that keep growing.
        browser: Object exposing ``execute_script``. When ``None`` the
            notebook-level ``driver`` is used, as in the original version.

    Returns:
        None.
    """
    if browser is None:
        browser = driver  # fall back to the WebDriver created at notebook level

    last_height = browser.execute_script("return document.body.scrollHeight")
    scrolls_done = 0

    while max_scrolls is None or scrolls_done < max_scrolls:
        # Scroll down to the bottom
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1
        time.sleep(pause)  # Wait for the new content to load

        # An unchanged height means the last scroll loaded nothing new
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Scroll to load all matches, then harvest the href of every anchor on the page.
# The work is wrapped in try/finally so the browser is always closed, even when
# scrolling or element lookup raises; otherwise a Chrome process is left behind.
try:
    scroll_to_load_matches()

    # Find all links on the page
    links = driver.find_elements(By.TAG_NAME, 'a')

    # Extract URLs from the link elements. get_attribute() is a WebDriver call,
    # so it is issued once per link and anchors without an href are dropped.
    urls = [
        href
        for href in (link.get_attribute('href') for link in links)
        if href
    ]
finally:
    # Close the driver
    driver.quit()


# Keep only truthy entries (drops None and empty strings)
urls = list(filter(None, urls))

# Regex pattern to identify match URLs (this may need to be adjusted based on the actual URL structure)
match_pattern = re.compile(r'/football/.+/.+/[^/]+-[^/]+-[\w-]+/')

# Maps each match identifier to the longest URL seen for it
longest_urls = {}

for url in urls:
    # Skip anything that does not look like a match URL
    if not match_pattern.search(url):
        continue

    # The second-to-last '/'-separated piece is used as the match identifier
    match_identifier = url.split('/')[-2]

    # Replace the stored URL only when this one is strictly longer
    stored_url = longest_urls.get(match_identifier)
    if stored_url is None or len(url) > len(stored_url):
        longest_urls[match_identifier] = url

# (identifier, url) pairs in dictionary insertion order
longest_url_items = list(longest_urls.items())


def _print_matches(heading, rows):
    """Print a heading followed by one line per (identifier, url) pair."""
    print(heading)
    for match, long_url in rows:
        print(f"Match Identifier: {match}, Longest URL: {long_url}")


# First five entries
_print_matches("First 5 Matches:", longest_url_items[:5])

# Visual separator between the two sections
print("\n...\n")

# Last five entries
_print_matches("Last 5 Matches:", longest_url_items[-5:])


First 5 Matches:
Match Identifier: brighton-tottenham-fsjksfhd, Longest URL: https://www.oddsportal.com/football/england/premier-league/brighton-tottenham-fsjksfhd/
Match Identifier: aston-villa-manchester-united-ltBAlx0M, Longest URL: https://www.oddsportal.com/football/england/premier-league/aston-villa-manchester-united-ltBAlx0M/
Match Identifier: chelsea-nottingham-nywy5cp3, Longest URL: https://www.oddsportal.com/football/england/premier-league/chelsea-nottingham-nywy5cp3/
Match Identifier: everton-newcastle-utd-l0Zi1FqS, Longest URL: https://www.oddsportal.com/football/england/premier-league/everton-newcastle-utd-l0Zi1FqS/
Match Identifier: arsenal-southampton-SQM1jbV9, Longest URL: https://www.oddsportal.com/football/england/premier-league/arsenal-southampton-SQM1jbV9/

...

Last 5 Matches:
Match Identifier: everton-bournemouth-x6HuiG3U, Longest URL: https://www.oddsportal.com/football/england/premier-league/everton-bournemouth-x6HuiG3U/
Match Identifier: ipswich-fulham-SUdkBJln

# Opening a Match Link and Extracting Its Odds

In [85]:
from bs4 import BeautifulSoup

# Fragment appended to each match URL before it is opened
extensions = ['#1X2;2']
full_time_links = [link + extensions[0] for link in longest_urls.values()]
if not full_time_links:
    raise RuntimeError("No match links available; run the link-scraping cell first.")

# CSS selector for the cells to extract. It is a raw string so the backslashes
# that escape '[', '#', ']' and ':' reach the CSS parser unchanged and Python
# does not emit a SyntaxWarning about invalid escape sequences.
ODDS_SELECTOR = r'.text-\[\#2F2F2F\] .height-content , .max-sm\:gap-1'
PAGE_LOAD_TIMEOUT = 10  # seconds to keep polling for the bet365 row
POLL_INTERVAL = 0.5     # seconds between polls

driver = webdriver.Chrome(service=service)
try:
    driver.get(full_time_links[0])

    # Give time for dynamic content to load. implicitly_wait() only configures
    # the timeout of later element lookups and does not pause by itself, so the
    # page source is re-read until the bet365 row shows up or the timeout expires.
    deadline = time.monotonic() + PAGE_LOAD_TIMEOUT
    while True:
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        elements = soup.select(ODDS_SELECTOR)

        # Find the index of the element containing the text "bet365"
        bet365_index = next(
            (index for index, element in enumerate(elements) if 'bet365' in str(element)),
            None,
        )
        if bet365_index is not None or time.monotonic() >= deadline:
            break
        time.sleep(POLL_INTERVAL)
finally:
    # Always release the browser, including when loading or parsing fails
    driver.quit()

print(f"Found {len(elements)} elements")  # This should give you the correct count

if bet365_index is None:
    # Without this check the next statement fails with an opaque TypeError (None + 1)
    raise RuntimeError(f"No bet365 entry found on {full_time_links[0]}")

# The three elements right after the bet365 element are read as the odds
odds_indexes = [bet365_index + 1, bet365_index + 2, bet365_index + 3]

# Extracting float values from the corresponding elements (.text is the text inside the Tag)
odds = [float(elements[index].text) for index in odds_indexes]

print(odds)  # This will print the list of float values for the odds

  elements = soup.select('.text-\[\#2F2F2F\] .height-content , .max-sm\:gap-1')


Found 53 elements
[2.9, 3.9, 2.15]
