Imports

In [10]:
import time
import unidecode
import csv
import threading
from selenium import webdriver
from datetime import datetime

Path to the ChromeDriver executable that Selenium uses to start Chrome. The pages I am about to scrape are JavaScript-based, and with the help of Google I found out that it is more convenient (in fact necessary) to use Selenium in order to view the rendered page without refreshing it and to be able to scrape it.

In [2]:
# Location of the ChromeDriver executable; every scraper passes it to webdriver.Chrome().
# It is a relative path, so it is resolved against the current working directory.
path = 'chromedriver.exe'

Moreover, some of the websites I am about to scrape (iFortuna, synottip) display many matches and need to be scrolled down to the bottom of the page before all of the matches can be scraped - again, I used the help of Google to tackle this problem.

In [3]:
def scroll(driver, pause_time=0.5, max_scrolls=None):
    """Scroll the page to the bottom repeatedly until its height stops growing.

    After every scroll the function waits ``pause_time`` seconds and re-reads
    ``document.body.scrollHeight``; it stops as soon as two consecutive
    readings are equal.

    Args:
        driver: object exposing ``execute_script`` (a Selenium WebDriver).
        pause_time: seconds to wait after each scroll before the height is
            measured again. Defaults to the previously hard-coded 0.5.
        max_scrolls: optional upper bound on the number of scrolls, so that a
            page which keeps growing cannot loop forever. ``None`` (default)
            keeps the original unbounded behaviour.
    """
    # Height of the document before any scrolling takes place.
    last_height = driver.execute_script("return document.body.scrollHeight")

    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        # Jump to the current bottom of the page.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1

        # Wait before measuring the height again.
        time.sleep(pause_time)

        # An unchanged height means the scroll did not extend the page: stop.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

Later on I found out that synottip uses a different kind of scroller, so I used the help of Google again to find a way to scroll down an inner element of a page.

In [4]:
def scroll_synottip(driver, pause_time=1, max_scrolls=None):
    """Scroll synottip's inner list element down until no new rows appear.

    The element with class ``simplebar-scroll-content`` inside
    ``content-container`` is scrolled instead of the window. After every
    scroll the function waits ``pause_time`` seconds and re-reads the
    ``event-list__item`` rows; it stops when the row list equals the one
    from the previous pass.

    Args:
        driver: Selenium WebDriver with the synottip page already loaded.
        pause_time: seconds to wait after each scroll. Defaults to the
            previously hard-coded 1.
        max_scrolls: optional upper bound on the number of scrolls; ``None``
            (default) keeps the original unbounded behaviour.
    """
    row_xpath = '//div[@data-test-role="event-list__item"]'

    # The scrollable element is nested inside the content container.
    container = driver.find_element_by_class_name('content-container')
    scroll_area = container.find_element_by_class_name('simplebar-scroll-content')

    # Rows present before scrolling; used as the baseline for comparison.
    rows = driver.find_elements_by_xpath(row_xpath)

    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        # Scroll the inner element (not the window) to its bottom.
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scroll_area)
        scrolls_done += 1

        # Wait before looking at the rows again.
        time.sleep(pause_time)

        # Same rows as in the previous pass means nothing new appeared: stop.
        new_rows = driver.find_elements_by_xpath(row_xpath)
        if new_rows == rows:
            break
        rows = new_rows


Now we are finally ready to start scraping individual sports from the individual pages. I decided to choose tennis and volleyball, as these two sports have (almost) only win-lose bets, and thus it will be easier to both scrape and evaluate the data. The code for the two sports is basically identical.

With the help of Google I added threads to be able to run all the scrapers at the same time.

In [9]:
def scrape_tenis():
    """Run the four bookmaker scrapers for tennis in parallel threads.

    Each scraper receives the sport name "tenis" (which it uses for its .csv
    file name) and the tennis URL of its own site. The function returns once
    all four threads have finished.
    """
    # One (scraper, url) pair per betting site.
    jobs = (
        (scrape_sport_tipsport, "https://www.tipsport.cz/kurzy/tenis-43?limit=500"),
        (scrape_sport_ifortuna, "https://www.ifortuna.cz/sazeni/tenis"),
        (scrape_sport_synottip, "https://sport.synottip.cz/#/zapasy/19?categoryId=19"),
        (scrape_sport_chance, "https://www.chance.cz/kurzy/tenis-43?limit=500"),
    )
    workers = [
        threading.Thread(target=scraper, args=("tenis", site_url))
        for scraper, site_url in jobs
    ]
    # Start every thread before waiting on any of them, so they run concurrently.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    
def scrape_voleyball():
    """Run the four bookmaker scrapers for volleyball in parallel threads.

    Each scraper receives the sport name "voleyball" (which it uses for its
    .csv file name) and the volleyball URL of its own site. The function
    returns once all four threads have finished.
    """
    # One (scraper, url) pair per betting site.
    jobs = (
        (scrape_sport_tipsport, "https://www.tipsport.cz/kurzy/volejbal-47?limit=500"),
        (scrape_sport_ifortuna, "https://www.ifortuna.cz/sazeni/volejbal"),
        (scrape_sport_synottip, "https://sport.synottip.cz/#/zapasy/23?categoryId=23"),
        (scrape_sport_chance, "https://www.chance.cz/kurzy/volejbal-47?limit=500"),
    )
    workers = [
        threading.Thread(target=scraper, args=("voleyball", site_url))
        for scraper, site_url in jobs
    ]
    # Start every thread before waiting on any of them, so they run concurrently.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


Each betting company's webpage has its own scraper; the scrapers are very similar in general. Two of them (synottip and ifortuna) use the aforementioned scrollers, and each of them has to deal with a unique page structure. As I have mentioned before, tennis and volleyball have mostly only win-lose options, which made the scraping slightly easier (the data will be cleaned of other kinds of observations in the next section), and thus I built the individual scrapers by inspecting the individual pages.
For future analysis, it is important to save the dates from all the sources in the same format.

In [6]:
def scrape_sport_chance(sport, url):
    """Scrape match names, dates and odds from a chance.cz page into a CSV file.

    Every ``o-matchRow__main`` row yields one record ``[name, date, home,
    draw, away]``. Rows without a date, or without both a home and an away
    rate, are skipped. The result overwrites ``chance/<sport>.csv`` (the
    ``chance`` directory must already exist).

    Args:
        sport: sport name, used as the CSV file name.
        url: address of the page listing the matches.

    Raises:
        ValueError: if a date text does not match the expected format.
    """
    driver = webdriver.Chrome(path)
    # Rows collected here are written to the CSV file at the end.
    matches = []
    try:
        driver.get(url)
        # Fixed 3 s pause to give Chrome and the page time to load.
        time.sleep(3)
        rows = driver.find_elements_by_class_name('o-matchRow__main')

        for row in rows:
            # The left side holds the match name and the date (play time).
            left = row.find_element_by_class_name('o-matchRow__leftSide')
            # unidecode strips diacritics from the stored name.
            name = unidecode.unidecode(left.find_element_by_class_name('o-matchRow__matchName').text)
            # Look the date spans up once and reuse them below.
            date_spans = left.find_element_by_class_name('o-matchRow__dateClosed').find_elements_by_xpath('span')
            if not date_spans:
                # No date information at all: skip the row instead of raising IndexError.
                continue
            if len(date_spans) > 1:
                # Date and play time live in separate spans; join them before parsing.
                date = datetime.strptime(date_spans[0].text + ' ' + date_spans[1].text, '%d.%m.%Y %H:%M')
            else:
                # Only the date is available. strptime needs the span's text,
                # not the element itself (the original passed the element).
                date = datetime.strptime(date_spans[0].text, '%d.%m.%Y')

            # The right side holds the rate buttons.
            right = row.find_element_by_class_name('o-matchRow__rightSide')
            rates = right.find_element_by_class_name('o-matchRow__rightSideInner').find_element_by_class_name(
                'm-matchRowOdds').find_elements_by_class_name('btnRate')

            # Zero marks "not found"; draw is kept for sports that offer it.
            home = away = draw = 0
            if len(rates) == 2:
                home, away = rates[0].text, rates[1].text
            elif len(rates) == 3:
                home, draw, away = rates[0].text, rates[1].text, rates[2].text

            # Keep only observations that have both a home and an away rate.
            if home != 0 and away != 0:
                matches.append([name, date, home, draw, away])
    finally:
        # Always close Chrome, even when scraping fails part-way.
        driver.quit()

    # Overwrite the existing data file with the freshly scraped matches.
    with open('chance/' + sport + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['name', 'date', 'home', 'draw', 'away'])
        writer.writerows(matches)

#same as above, except for using scroller
def scrape_sport_synottip(sport, url):
    """Scrape match names, dates and odds from a synottip page into a CSV file.

    The inner list is scrolled to the bottom first via ``scroll_synottip``.
    Every ``event-list__item`` row with two or three rates yields one record
    ``[name, date, home, draw, away]`` with the odds converted to float. Rows
    that cannot be read or parsed, or whose home or away rate is zero, are
    skipped. The result overwrites ``synottip/<sport>.csv`` (the ``synottip``
    directory must already exist).

    Args:
        sport: sport name, used as the CSV file name.
        url: address of the page listing the matches.
    """
    driver = webdriver.Chrome(path)
    matches = []
    try:
        driver.get(url)
        # Fixed 3 s pause to give Chrome and the page time to load.
        time.sleep(3)
        scroll_synottip(driver)
        rows = driver.find_elements_by_xpath(
            '//div[@data-test-role="event-list__item"]')

        for row in rows:
            odds = row.find_elements_by_class_name('rate')
            # Only rows with two (home/away) or three (home/draw/away) rates are stored.
            if len(odds) not in (2, 3):
                continue
            try:
                name = unidecode.unidecode(row.find_element_by_class_name('match-label').text)
                # Date and time are separated by a newline on the page.
                date_text = row.find_element_by_class_name('v-center').text.replace('\n', ' ')
                date = datetime.strptime(date_text, '%d.%m.%y %H:%M')
                # The page uses a decimal comma; convert every rate, including the third one.
                values = [float(rate.text.replace(",", ".")) for rate in odds]
            except Exception:
                # Best effort per row: a row that cannot be read or parsed is
                # skipped, but KeyboardInterrupt/SystemExit are no longer swallowed.
                continue

            if len(values) == 2:
                home, away = values
                draw = 0
            else:
                home, draw, away = values

            if home != 0 and away != 0:
                matches.append([name, date, home, draw, away])
    finally:
        # Always close Chrome, even when scraping fails part-way.
        driver.quit()

    with open('synottip/' + sport + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['name', 'date', 'home', 'draw', 'away'])
        writer.writerows(matches)

#same as above, except for using different scroller
def scrape_sport_ifortuna(sport, url):
    """Scrape match names, dates and odds from an ifortuna page into a CSV file.

    The page is scrolled to the bottom first via ``scroll``. Every table row
    yields one record ``[name, date, home, draw, away]``; the date is read
    from the ``data-value`` attribute of the ``col-date`` cell, which the code
    divides by 1000 before handing it to ``datetime.fromtimestamp``. Rows that
    cannot be read, or that lack a home or away rate, are skipped.
    ``ifortuna/<sport>.csv`` is only (over)written when at least one match was
    scraped; otherwise an existing file is left untouched.

    Args:
        sport: sport name, used as the CSV file name.
        url: address of the page listing the matches.
    """
    driver = webdriver.Chrome(path)
    matches = []
    try:
        driver.get(url)
        # Fixed 3 s pause to give Chrome and the page time to load.
        time.sleep(3)
        scroll(driver)
        rows = driver.find_elements_by_xpath("//table/tbody/tr")

        for row in rows:
            try:
                # unidecode strips diacritics, as the other scrapers do.
                name = unidecode.unidecode(row.find_element_by_class_name('col-title').text)
                date = datetime.fromtimestamp(
                    float(row.find_element_by_class_name('col-date').get_attribute("data-value")) / 1000)
                rates = row.find_elements_by_class_name('col-odds')

                # Zero marks "not found"; draw is kept for sports that offer it.
                home = draw = away = 0
                if len(rates) == 2:
                    home, away = rates[0].text, rates[1].text
                elif len(rates) >= 3:
                    home, draw, away = rates[0].text, rates[1].text, rates[2].text
            except Exception:
                # Best effort per row: table rows that are not matches (or cannot
                # be read) are skipped, but KeyboardInterrupt/SystemExit propagate.
                continue

            # Same rule as the other scrapers: both home and away must be present.
            if home != 0 and away != 0:
                matches.append([name, date, home, draw, away])
    finally:
        # Always close Chrome, even when scraping fails part-way.
        driver.quit()

    if matches:
        with open('ifortuna/' + sport + '.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['name', 'date', 'home', 'draw', 'away'])
            writer.writerows(matches)

# identical to chance (most likely the same owner - same webpage, same rates, as we find out later on)
def scrape_sport_tipsport(sport, url):
    """Scrape match names, dates and odds from a tipsport.cz page into a CSV file.

    Every ``o-matchRow__main`` row yields one record ``[name, date, home,
    draw, away]``. Rows without a date, or without both a home and an away
    rate, are skipped. The result overwrites ``tipsport/<sport>.csv`` (the
    ``tipsport`` directory must already exist).

    Args:
        sport: sport name, used as the CSV file name.
        url: address of the page listing the matches.

    Raises:
        ValueError: if a date text does not match the expected format.
    """
    driver = webdriver.Chrome(path)
    matches = []
    try:
        driver.get(url)
        # Fixed 3 s pause to give Chrome and the page time to load.
        time.sleep(3)
        rows = driver.find_elements_by_class_name('o-matchRow__main')

        for row in rows:
            # The left side holds the match name and the date (play time).
            left = row.find_element_by_class_name('o-matchRow__leftSide')
            # unidecode strips diacritics from the stored name.
            name = unidecode.unidecode(left.find_element_by_class_name('o-matchRow__matchName').text)
            # Look the date spans up once and reuse them below.
            date_spans = left.find_element_by_class_name('o-matchRow__dateClosed').find_elements_by_xpath('span')
            if not date_spans:
                # No date information at all: skip the row instead of raising IndexError.
                continue
            if len(date_spans) > 1:
                # Date and play time live in separate spans; join them before parsing.
                date = datetime.strptime(date_spans[0].text + ' ' + date_spans[1].text, '%d.%m.%Y %H:%M')
            else:
                # Only the date is available. strptime needs the span's text,
                # not the element itself (the original passed the element).
                date = datetime.strptime(date_spans[0].text, '%d.%m.%Y')

            # The right side holds the rate buttons.
            right = row.find_element_by_class_name('o-matchRow__rightSide')
            rates = right.find_element_by_class_name('o-matchRow__rightSideInner').find_element_by_class_name(
                'm-matchRowOdds').find_elements_by_class_name('btnRate')

            # Zero marks "not found"; draw is kept for sports that offer it.
            home = away = draw = 0
            if len(rates) == 2:
                home, away = rates[0].text, rates[1].text
            elif len(rates) == 3:
                home, draw, away = rates[0].text, rates[1].text, rates[2].text

            # Keep only observations that have both a home and an away rate.
            if home != 0 and away != 0:
                matches.append([name, date, home, draw, away])
    finally:
        # Always close Chrome, even when scraping fails part-way.
        driver.quit()

    # Overwrite the existing data file with the freshly scraped matches.
    with open('tipsport/' + sport + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['name', 'date', 'home', 'draw', 'away'])
        writer.writerows(matches)


Run scrapers for both sports for each of the webpages

In [7]:
# Run the scrapers sport by sport: volleyball first, then tennis.
for run_sport_scrapers in (scrape_voleyball, scrape_tenis):
    run_sport_scrapers()