# Scraping Interactive Match Links From Main Page

In [28]:
import os
import re
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Set up the WebDriver.
# The chromedriver location is machine-specific, so it can be overridden with
# the CHROMEDRIVER_PATH environment variable; the fallback is the path this
# notebook was originally run with.
driver_path = os.environ.get(
    'CHROMEDRIVER_PATH',
    'C:/Users/99451/AppData/Local/Programs/Python/Python312/Lib/site-packages/selenium/webdriver/chromedriver-win64/chromedriver.exe',
)

# Results page whose anchors are harvested for individual match links
RESULTS_URL = 'https://www.oddsportal.com/football/england/premier-league/results/'

# Initialize the driver; `service` is kept as a notebook-level name because
# the odds-scraping cell further down starts a second browser with it.
service = Service(driver_path)
driver = webdriver.Chrome(service=service)

# Open the OddsPortal page
driver.get(RESULTS_URL)

# Function to scroll the page and load more matches
def scroll_to_load_matches(pause_seconds=2, max_scrolls=None, browser=None):
    """Scroll to the bottom of the page until its height stops growing.

    After each scroll the function sleeps, re-reads
    ``document.body.scrollHeight`` and stops as soon as two consecutive
    readings are equal, i.e. no additional content was appended.

    Args:
        pause_seconds: Seconds to sleep after each scroll so newly requested
            content has time to render. Defaults to 2, the original value.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps the original behaviour of scrolling until
            the height is stable; pass an integer to guarantee termination
            on pages that never stop growing.
        browser: WebDriver to operate on. ``None`` (the default) uses the
            notebook-level ``driver``.

    Returns:
        None
    """
    if browser is None:
        browser = driver  # fall back to the notebook-level WebDriver

    height_script = "return document.body.scrollHeight"
    last_height = browser.execute_script(height_script)

    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        # Scroll down to the bottom
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1
        time.sleep(pause_seconds)  # Wait for the new content to load

        # An unchanged height means nothing new was appended
        new_height = browser.execute_script(height_script)
        if new_height == last_height:
            break
        last_height = new_height

# Scroll until the page stops growing, then harvest every link on it.
# The try/finally guarantees the browser is closed even when scrolling or
# element lookup raises, so a failed run does not leave a browser process behind.
try:
    scroll_to_load_matches()

    # Find all links on the page
    links = driver.find_elements(By.TAG_NAME, 'a')

    # Read each href exactly once (every get_attribute call is a round trip
    # to the browser) and keep only the non-empty ones. This must happen
    # before quit(): the elements are unusable once the session is closed.
    hrefs = (link.get_attribute('href') for link in links)
    urls = [href for href in hrefs if href]
finally:
    # Close the driver
    driver.quit()

# Maps each match identifier to the longest URL seen for it so far
longest_urls = {}

# Regex pattern to identify match URLs (this may need to be adjusted based on the actual URL structure)
match_pattern = re.compile(r'/football/.+/.+/[^/]+-[^/]+-[\w-]+/')

for url in urls:
    # Anything that does not look like a match page is ignored
    if not match_pattern.search(url):
        continue

    # The second-to-last '/'-separated piece is used as the match identifier
    match_identifier = url.split('/')[-2]

    # Keep this URL only if it is the first, or longer than the stored one
    best_so_far = longest_urls.get(match_identifier)
    if best_so_far is None or len(url) > len(best_so_far):
        longest_urls[match_identifier] = url

# Snapshot the (identifier, url) pairs as a list so they can be sliced
longest_url_items = list(longest_urls.items())

# Preview: the first five and the last five entries, with a "..." line between
preview_sections = (
    ("First 5 Matches:", longest_url_items[:5]),
    ("Last 5 Matches:", longest_url_items[-5:]),
)

for position, (heading, rows) in enumerate(preview_sections):
    if position:
        # Separator printed between sections, never before the first one
        print("\n...\n")
    print(heading)
    for match, long_url in rows:
        print(f"Match Identifier: {match}, Longest URL: {long_url}")


First 5 Matches:
Match Identifier: bournemouth-arsenal-ShJjaSDs, Longest URL: https://www.oddsportal.com/football/england/premier-league/bournemouth-arsenal-ShJjaSDs/
Match Identifier: fulham-aston-villa-Y5lic6rf, Longest URL: https://www.oddsportal.com/football/england/premier-league/fulham-aston-villa-Y5lic6rf/
Match Identifier: ipswich-everton-tWiaepD6, Longest URL: https://www.oddsportal.com/football/england/premier-league/ipswich-everton-tWiaepD6/
Match Identifier: manchester-united-brentford-UiYgHRZP, Longest URL: https://www.oddsportal.com/football/england/premier-league/manchester-united-brentford-UiYgHRZP/
Match Identifier: newcastle-utd-brighton-n9EQB5cm, Longest URL: https://www.oddsportal.com/football/england/premier-league/newcastle-utd-brighton-n9EQB5cm/

...

Last 5 Matches:
Match Identifier: manchester-city-brentford-tzSnG5Wr, Longest URL: https://www.oddsportal.com/football/england/premier-league/manchester-city-brentford-tzSnG5Wr/
Match Identifier: southampton-manches

# Visiting Each Match Link to Scrape Full-Time Odds

In [29]:
from bs4 import BeautifulSoup
import pandas as pd

# URL fragments that select a market tab on a match page. Presumably, in order:
# ['Full Time', 'Double Chance', 'Half Time', 'HT Double Chance', 'BTTS']
extensions = ['#1X2;2', '#double;2', '#1X2;3', '#double;3', '#bts;2']
full_time_links = [link + extensions[0] for link in longest_urls.values()]

# CSS selectors for the match page. ODDS_SELECTOR is a raw string so the
# backslashes that escape '[', '#', ']' and ':' for the CSS parser reach it
# unchanged and Python does not emit an invalid-escape-sequence SyntaxWarning.
ODDS_SELECTOR = r'.text-\[\#2F2F2F\] .height-content , .max-sm\:gap-1'
TEAMS_SELECTOR = '.leading-4 p'
BOOKMAKER = 'bet365'

driver = webdriver.Chrome(service=service)

# The implicit wait is a session-wide setting, so it is configured once here.
# It only affects element lookups; it does not delay page_source by itself.
driver.implicitly_wait(10)

# One entry per successfully parsed match, kept in lockstep across the lists
home, away, ft1, ftx, ft2 = [], [], [], [], []

# try/finally so the browser is closed even if a page fails mid-scrape
try:
    for link in full_time_links:
        driver.get(link)

        # With the implicit wait set, this lookup polls until the team header
        # exists (or the timeout elapses), giving dynamic content time to load
        # before the page source is captured.
        driver.find_elements(By.CSS_SELECTOR, TEAMS_SELECTOR)

        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        elements = soup.select(ODDS_SELECTOR)
        teams = soup.select(TEAMS_SELECTOR)

        if not teams:
            print(f"Skipping {link}: team names not found")
            continue

        # The first matched element holds both team names separated by '-'
        match_text = teams[0].get_text()
        names = [team.strip() for team in match_text.split('-')]
        if len(names) != 2:
            # e.g. a hyphenated team name; skip rather than guess the split
            print(f"Skipping {link}: cannot split teams from {match_text!r}")
            continue
        home_team, away_team = names

        # Find the index of the element containing the text "bet365"
        bet365_index = next(
            (index for index, element in enumerate(elements) if BOOKMAKER in str(element)),
            None,
        )
        if bet365_index is None:
            print(f"Skipping {link}: no {BOOKMAKER} row found")
            continue

        # The three elements after the bookmaker element are read as the
        # 1 / X / 2 odds. ValueError covers both a short slice (too few
        # values to unpack) and text that is not a number.
        odds_cells = elements[bet365_index + 1:bet365_index + 4]
        try:
            odds_1, odds_x, odds_2 = (float(cell.text) for cell in odds_cells)
        except ValueError:
            print(f"Skipping {link}: could not read three {BOOKMAKER} odds")
            continue

        # Append only after everything parsed, so the lists never go out of step
        home.append(home_team)
        away.append(away_team)
        ft1.append(odds_1)
        ftx.append(odds_x)
        ft2.append(odds_2)
finally:
    driver.quit()

# Create a DataFrame from the lists
df = pd.DataFrame({
    'Home': home,
    'Away': away,
    'FT1': ft1,
    'FTX': ftx,
    'FT2': ft2
})

df

  elements = soup.select('.text-\[\#2F2F2F\] .height-content , .max-sm\:gap-1')


Unnamed: 0,Home,Away,FT1,FTX,FT2
0,Bournemouth,Arsenal,4.75,3.7,1.75
1,Fulham,Aston Villa,2.45,3.4,2.8
2,Ipswich,Everton,2.38,3.4,2.9
3,Manchester Utd,Brentford,1.62,4.2,5.0
4,Newcastle,Brighton,1.73,4.0,4.5
5,Southampton,Leicester,2.15,3.6,3.25
6,Tottenham,West Ham,1.45,5.25,5.75
7,Brighton,Tottenham,2.9,3.9,2.15
8,Aston Villa,Manchester Utd,2.1,3.9,3.1
9,Chelsea,Nottingham,1.48,4.5,6.5


In [30]:
# Exploratory check: show the type of `teams`, which still holds the value
# assigned from soup.select('.leading-4 p') during the scraping loop above.
type(teams)

bs4.element.ResultSet