## Web scraping automation with Selenium: Forebet.com Under/Over 2.5 goals predictions

#### Import libraries

In [1]:
import warnings

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

warnings.simplefilter(action='ignore', category=FutureWarning)

#### Activate ChromeDriver and open a new session of Chrome

In [2]:
# Locations of the ChromeDriver executable and the Chrome binary.
# Both are raw strings so the Windows backslashes stay literal; a non-raw
# 'C:\WebDrivers\...' only works because '\W' and '\c' happen to be
# unrecognised escape sequences, which Python reports with a warning.
chromedriver_path = r'C:\WebDrivers\chromedriver.exe'
chrome_binary_path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'

# Service object that launches ChromeDriver from the given executable
service = Service(executable_path=chromedriver_path)

# Browser options: point Selenium at the installed Chrome binary
options = Options()
options.binary_location = chrome_binary_path
#options.add_argument('--headless') # Run Chrome in headless mode

# Start a new Chrome session driven by the custom service and options
driver = webdriver.Chrome(service=service, options=options)

#### Open the URL

In [3]:
# Load the Forebet Under/Over 2.5 goals predictions page in the Chrome session
predictions_url = 'https://www.forebet.com/en/football-predictions/under-over-25-goals'
driver.get(predictions_url)

#### Press the "Consent" button. Use one of the options below

In [4]:
# 1st way (disabled). Wait up to 10 seconds for the consent button's <p> label,
# located by CSS selector, to become clickable, then click it.
#WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.fc-button.fc-cta-consent.fc-primary-button > p'))).click()

In [4]:
# 2nd way. Wait up to 10 seconds for the first consent label to be clickable,
# then press it. An immediate find_element would raise NoSuchElementException
# whenever the cookie dialog has not been rendered yet.
consent_locator = (By.XPATH, '(//p[@class="fc-button-label"])[1]')
consent_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(consent_locator)
)
consent_button.click()

#### Find a way to bypass the iframe. Work in progress...
For the moment, this step has to be done manually: click the iframe arrow when the pop-up appears at the bottom of the page.

In [1]:
# Placeholder: automatic handling of the iframe pop-up is not implemented yet.
...

Ellipsis

#### Find and click the "More" button to load all the games

In [5]:
# Keep pressing the "More" control until the page stops offering it.
MAX_MORE_CLICKS = 50        # upper bound on how many times the list is expanded
MORE_BUTTON_TIMEOUT = 10    # seconds to wait for the control before giving up

for _ in range(MAX_MORE_CLICKS):
    try:
        # Wait until the control is both visible and enabled, then click it
        more_button = WebDriverWait(driver, MORE_BUTTON_TIMEOUT).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#mrows span"))
        )
        more_button.click()
    except TimeoutException:
        # The control did not show up in time: everything is loaded
        print("No 'more' buttons to click!")
        break
    except WebDriverException as exc:
        # Any other browser-side failure (e.g. an intercepted click) also stops
        # the loop, but is reported instead of being mistaken for "all loaded".
        # KeyboardInterrupt is deliberately not caught, so the cell can be stopped.
        print(f"Stopped loading more games: {type(exc).__name__}: {exc.msg}")
        break
else:
    # The loop ran out of iterations without the control disappearing
    print(f"Reached the limit of {MAX_MORE_CLICKS} clicks; some games may not be loaded.")

No more buttons to click!


#### Define the data frame columns

In [6]:
# Empty frame carrying the output schema; the scraped rows are added later
prediction_columns = [
    'Date Time',
    'Home Team',
    'Away Team',
    'Under Prob',
    'Over Prob',
    'U/O',
    'Predicted Score',
    'Odds',
]
df_pred = pd.DataFrame(columns=prediction_columns)

#### Extract the data and compare the length of each column

In [7]:
# Date/time cells: the <span> inside every <time itemprop="startDate">
date_xpath = '//time[@itemprop="startDate"]/span'
date = driver.find_elements(By.XPATH, date_xpath)

In [8]:
# Number of date/time elements matched on the page
len(date)

1565

In [9]:
# Home team names: first <span> child of every <span class="homeTeam">
home_team_xpath = '(//span[@class="homeTeam"])/span[1]'
home_team = driver.find_elements(By.XPATH, home_team_xpath)

In [10]:
# Number of home-team elements matched; should equal len(date)
len(home_team)

1565

In [11]:
# Away team names: first <span> child of every <span class="awayTeam">
away_team_xpath = '(//span[@class="awayTeam"])/span[1]'
away_team = driver.find_elements(By.XPATH, away_team_xpath)

In [12]:
# Number of away-team elements matched; should equal len(home_team)
len(away_team)

1565

In [13]:
# "Under" probabilities: first <span> child of every <div class="fprc">
under_prob_xpath = '(//div[@class="fprc"])/span[1]'
under_prob = driver.find_elements(By.XPATH, under_prob_xpath)

In [14]:
# Number of "Under" probability elements matched.
# NOTE(review): the recorded run shows 1563 here versus 1565 for date, home_team
# and away_team. Rows are later paired by list position, so if the two extra
# entries are not at the very end, teams and probabilities would be misaligned
# - TODO confirm where the unmatched entries sit.
len(under_prob)

1563

In [15]:
# "Over" probabilities: second <span> child of every <div class="fprc">
over_prob_xpath = '(//div[@class="fprc"])/span[2]'
over_prob = driver.find_elements(By.XPATH, over_prob_xpath)

In [16]:
# Number of "Over" probability elements matched; should equal len(under_prob)
len(over_prob)

1563

In [17]:
# Under/Over labels: <span> children of every <span> whose class starts with "forepr "
u_o_xpath = '(//span[starts-with(@class, "forepr ")])/span'
u_o = driver.find_elements(By.XPATH, u_o_xpath)

In [18]:
# Number of Under/Over label elements matched
len(u_o)

1563

In [19]:
# Predicted scores: fifth <div> child of every <div> whose class starts with "rcnt "
predicted_score_xpath = '(//div[starts-with(@class, "rcnt ")])/div[5]'
predicted_score = driver.find_elements(By.XPATH, predicted_score_xpath)

In [20]:
# Number of predicted-score elements matched
len(predicted_score)

1563

In [21]:
# Odds: <span> inside the eighth <div> child of every <div> whose class starts with "rcnt "
odds_xpath = '(//div[starts-with(@class, "rcnt ")])/div[8]/span'
odds = driver.find_elements(By.XPATH, odds_xpath)

In [22]:
# Number of odds elements matched; this length drives the row loop further down
len(odds)

1563

In [23]:
# NOTE(review): repeats the len(u_o) check already made in an earlier cell
len(u_o)

1563

#### Append all the data to the Data Frame and preview the df_pred
Here there is room for improvement. Right now it takes a bit of time to append due to the amount of data it needs to scrape.

In [24]:
# Collect every row first and build the DataFrame once.
# DataFrame.append was removed in pandas 2.0, and calling it in a loop copied
# the whole frame on every iteration. Building the frame from scratch also
# makes this cell safe to re-run without duplicating rows.
scraped_elements = {
    'Date Time': date,
    'Home Team': home_team,
    'Away Team': away_team,
    'Under Prob': under_prob,
    'Over Prob': over_prob,
    'U/O': u_o,
    'Predicted Score': predicted_score,
    'Odds': odds,
}

# Rows are paired purely by list position, so differing element counts mean
# some rows may combine values from different matches. Make that visible.
element_counts = {column: len(elements) for column, elements in scraped_elements.items()}
if len(set(element_counts.values())) > 1:
    warnings.warn(f"Scraped element counts differ, rows may be misaligned: {element_counts}")

# Only positions present in every list can form a complete row
row_count = min(element_counts.values())

# Each .text access is a round trip to the browser, so this still takes a while
records = [
    {column: elements[i].text for column, elements in scraped_elements.items()}
    for i in range(row_count)
]

# Passing columns explicitly keeps the schema even when nothing was scraped
df_pred = pd.DataFrame(records, columns=list(scraped_elements))

In [25]:
# Preview the assembled predictions table
df_pred

Unnamed: 0,Date Time,Home Team,Away Team,Under Prob,Over Prob,U/O,Predicted Score,Odds
0,3/6/2023 16:00,Manchester City,Manchester United,52,48,Under,1 - 0,2.30
1,3/6/2023 21:00,Troyes AC,Lille OSC,58,42,Under,0 - 2,3.00
2,3/6/2023 21:00,FC Nantes,Angers SCO,46,54,Over,2 - 1,1.67
3,3/6/2023 21:00,OGC Nice,Lyon,51,49,Under,0 - 2,-
4,3/6/2023 21:00,AC Ajaccio,Olympique Marseille,48,52,Over,1 - 2,1.62
...,...,...,...,...,...,...,...,...
1558,4/6/2023 15:00,Green Fuel,Platinum FC,90,10,Under,0 - 1,-
1559,4/6/2023 15:00,Dynamos FC (ZIM),ZPC Kariba,82,18,Under,2 - 0,-
1560,4/6/2023 15:00,Hwange Colliery,Manica Diamonds FC,62,38,Under,1 - 1,-
1561,4/6/2023 15:00,Highlanders FC,Bulawayo Chiefs,59,41,Under,0 - 0,-


#### Split the "Date Time" and "Predicted Score" columns and drop the originals

In [26]:
# Split "Date Time" into date and time, and "Predicted Score" into the two
# goal counts. str.extract always returns exactly one column per capture group
# (NaN where a value does not match), so the two-column assignments cannot fail
# on an unexpected number of pieces. The score pattern also discards the spaces
# around the dash: splitting "1 - 0" on '-' would leave "1 " and " 0".
date_time_parts = df_pred['Date Time'].str.extract(r'^\s*(\S+)\s+(\S+)\s*$')
score_parts = df_pred['Predicted Score'].str.extract(r'^\s*(\d+)\s*-\s*(\d+)\s*$')

df_pred[['Date', 'Time']] = date_time_parts
df_pred[['Score Home', 'Score Away']] = score_parts

# drop the combined columns now that their parts are stored separately
df_pred = df_pred.drop(columns=['Date Time', 'Predicted Score'])

#### Re-arrange the columns

In [27]:
# Put the columns into their final display/export order
ordered_columns = [
    'Date', 'Time',
    'Home Team', 'Away Team',
    'Under Prob', 'Over Prob', 'U/O',
    'Score Home', 'Score Away',
    'Odds',
]
df_pred = df_pred.loc[:, ordered_columns]

In [29]:
# Preview the final table.
# NOTE(review): the saved output of this cell appears to come from a different
# run than the earlier preview (index up to 2208 with dates 18-27/5/2023, versus
# index up to 1561 with dates 3-4/6/2023), and its execution count (29) precedes
# the export cell's (28). Restart the kernel and run all cells to refresh it.
df_pred

Unnamed: 0,Date,Time,Home Team,Away Team,Under Prob,Over Prob,U/O,Score Home,Score Away,Odds
0,18/5/2023,20:30,Newcastle United,Brighton,59,41,Under,0,2,2.30
1,20/5/2023,13:30,Tottenham,Brentford,60,40,Under,1,1,2.10
2,20/5/2023,16:00,Bournemouth,Manchester United,38,62,Over,1,3,1.53
3,20/5/2023,16:00,Fulham,Crystal Palace,37,63,Over,2,1,2.00
4,20/5/2023,16:00,Liverpool,Aston Villa,49,51,Over,2,1,1.53
...,...,...,...,...,...,...,...,...,...,...
2205,27/5/2023,0:00,Lumwana Radiants,Man Utd Zambia,42,58,Over,1,2,-
2206,27/5/2023,0:00,Prison Leopards,Napsa Stars,56,44,Under,0,2,-
2207,27/5/2023,0:00,Green Buffaloes,Zanaco FC,56,44,Under,2,0,-
2208,27/5/2023,0:00,Nkana FC,Kansanshi Dynamos,81,19,Under,0,0,-


#### Store the data in CSV format 

In [28]:
# Write the final table to a UTF-8 CSV file, leaving out the index column
csv_path = 'Extract O-U 2.5.csv'
df_pred.to_csv(csv_path, index=False, encoding='utf-8')