In [30]:
import json
import os
import time

from selenium import webdriver

# Let's first get all the URLs for matches from 2009-2017

In [18]:
def get_season_url(year):
    """Collect the match URLs of one season and save them as JSON.

    Opens a Chrome session, navigates to the season's fixture list with
    ``get_all_fixtures``, asks ``get_week_urls`` for rounds 1 to 20 and keeps
    every non-empty result under the key ``week_<n>``. The resulting mapping
    is handed to ``write_data`` with the prefix ``Data/URLS/``.

    Args:
        year: Season identifier forwarded to ``get_all_fixtures``,
            ``get_week_urls`` and ``write_data``. The calls in this notebook
            pass strings such as ``'2017'``.

    Returns:
        None. Nothing is written if scraping raises.
    """
    driver = webdriver.Chrome()
    try:
        get_all_fixtures(driver, year)  # make sure we are on the landing page to select the week
        time.sleep(5)  # make sure the webpage is loaded

        season_urls = {}

        for w in range(20):  # 20 is an upper bound so that all rounds are visited
            urls = get_week_urls(driver, w+1, year)
            if len(urls) > 0:
                season_urls['week_%s' % str(w+1)] = urls
                print('week '+str(w+1)+' urls extracted')
    finally:
        # Always end the browser session, also when scraping fails half-way.
        # quit() shuts the driver down, whereas close() only closes the window.
        driver.quit()

    write_data(season_urls, 'Data/URLS/', year)

In [3]:
def get_all_fixtures(driver, year):
    """Navigate ``driver`` to the fixture list of the given season.

    Loads the archives page, clicks the link of the requested season and then
    clicks the ``#fixtures`` link on the season page.

    Args:
        driver: Selenium WebDriver used for the navigation.
        year: Season inserted into the archive link
            (``/superrugby/fixtures/archives/<year>-super-rugby/``).

    Raises:
        Whatever ``driver.find_element`` raises when one of the two links is
        not present on the page.
    """
    main_page = 'https://sanzarrugby.com/superrugby/fixtures/archives/'
    driver.get(main_page)  # go to the archives page

    # find_element(by, value) is used instead of find_element_by_xpath: the
    # find_element_by_* helpers no longer exist in current Selenium releases,
    # while this form is available in old and new versions. "xpath" is the
    # value of selenium's By.XPATH constant.
    link_season = "/superrugby/fixtures/archives/%s-super-rugby/" % year
    season = driver.find_element("xpath", '//a[@href="'+link_season+'"]')
    season.click()  # go to season page

    link_fixtures = "#fixtures"
    fixtures = driver.find_element("xpath", '//a[@href="'+link_fixtures+'"]')
    fixtures.click()  # go to all matches for the season

In [9]:
def get_week_urls(driver, week, year):
    """Return the match URLs listed for one round.

    Switches the fixture list to ``week`` through ``select_week`` and then
    lets ``get_match_urls`` read the links with the same driver. ``year`` is
    accepted but not used here.
    """
    select_week(driver, week)
    week_urls = get_match_urls(driver, week)
    return week_urls

In [31]:
def select_week(driver, week):
    """Open the round drop-down and switch the fixture list to ``week``.

    The entry is only clicked when its label contains ``Week`` or ``Round``;
    other entries are left alone because only the league stage is of interest
    for now. The label is printed as a progress message.

    Args:
        driver: Selenium WebDriver showing the season's fixture page.
        week: Round number used to build the ``#round<week>`` link.

    Returns:
        None in every case.
        NOTE(review): callers cannot tell from the return value whether the
        round was actually selected.

    Raises:
        Whatever ``driver.find_element`` raises when the ``roundFilter``
        element itself is missing. Errors while locating or clicking the
        round entry are ignored (best effort).
    """
    # find_element(by, value) replaces the find_element_by_* helpers, which
    # no longer exist in current Selenium releases. "id" and "xpath" are the
    # values of selenium's By.ID and By.XPATH constants.
    round_menu = driver.find_element("id", "roundFilter")
    round_menu.click()  # open the drop-down menu to select the week

    link_round = "#round%s" % week

    try:
        round_ = driver.find_element("xpath", '//a[@href="'+link_round+'"]')
        label = round_.text  # read once; every access asks the browser again
        print(label)
        # we are interested only in the league stage for now
        if 'Week' not in label and 'Round' not in label:
            return None
        round_.click()  # go to the week
    except Exception:
        # Week number is out of range (or the entry cannot be clicked).
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # still stop the scraper.
        pass

In [6]:
def get_match_urls(driver, week):
    """Collect the match links shown in the fixture table of ``week``.

    Probes the first 20 ``tbody`` sections of the table inside the element
    with id ``Opta_<week>`` and reads the ``href`` of the link in the ninth
    cell of each one.

    Args:
        driver: Selenium WebDriver showing the season's fixture page.
        week: Round number used in the ``Opta_<week>`` element id.

    Returns:
        List of ``href`` values in table order; empty when no link is found.
    """
    # get match links
    link_path = '//*[@id="Opta_%s"]/div/div/div/table/tbody[%s]/tr/td[9]/a'
    match_links = []
    for i in range(20):  # 20 makes sure all rows of the fixture table are captured
        ln_p = link_path % (str(week), str(i+1))  # xpath for the match

        try:
            # find_element(by, value) replaces find_element_by_xpath, which no
            # longer exists in current Selenium releases ("xpath" == By.XPATH).
            ln = driver.find_element("xpath", ln_p)
            match_links.append(ln.get_attribute('href'))
        except Exception:
            # Element not found: the table has fewer rows than probed.
            # Only Exception is caught so that KeyboardInterrupt / SystemExit
            # still stop the scraper.
            continue

    return match_links

In [31]:
def write_data(data, filepath, year):
    """Serialise ``data`` as JSON to ``<filepath><year>.json``.

    Args:
        data: JSON-serialisable object.
        filepath: Path prefix that is concatenated with the file name, so a
            directory has to end with a separator (e.g. ``'Data/URLS/'``).
        year: Season used as the file name; converted with ``str`` so that
            both ``'2017'`` and ``2017`` work.

    The target directory is created when it does not exist yet, so a finished
    scrape is not lost because of a missing folder.
    """
    target = filepath + str(year) + '.json'

    folder = os.path.dirname(target)
    if folder:  # empty when the file goes into the current directory
        os.makedirs(folder, exist_ok=True)

    with open(target, 'w') as fp:
        json.dump(data, fp)

In [None]:
# Scrape and store the match URLs of every listed season, newest first.
for y in ('2017', '2016', '2015', '2014', '2013', '2012'):
    get_season_url(y)

## Let's now get the data for the matches using the URLs

In [32]:
def get_season(year):
    """Scrape the match data of one season and save it as JSON.

    Reads the match URLs from ``Data/URLS/<year>.json`` and the field
    definitions from ``match_variables.json``, calls ``get_match_data`` for
    every URL and stores the results as
    ``season['week_<n>']['match_<m>']``. The result is handed to
    ``write_data`` with the prefix ``Data/Match_info/``.

    Failure handling: when a match cannot be scraped, its position is
    remembered and scraping resumes from that match, keeping everything
    collected so far. After five failures within one browser session the
    browser is restarted. There is no upper bound on the number of restarts,
    so a match that fails permanently keeps this function running until it
    is interrupted.

    Args:
        year: Season as a string (it is concatenated into the file name and
            compared with ``'2013'``).

    NOTE(review): weeks and matches are numbered by their position in the URL
    file (``i+1`` / ``j+1``), not by the ``week_<n>`` key stored in that file.
    Confirm this is intended if the URL file can have gaps.
    """
    # Inputs do not change between browser restarts, so read them once and
    # before any browser is started.
    urls = get_json_data('Data/URLS/'+year+'.json')
    match_variables = get_json_data('match_variables.json')
    for k, v in match_variables.items():
        # undo the quote replacement that was made to keep the file valid JSON
        match_variables[k] = v.replace("'", '"')

    complete = False
    resume_week = 0   # index of the week to resume from after a failure
    resume_match = 0  # index of the match to resume from inside that week
    season = {}

    while not complete:
        failures = 0  # failures seen with the current browser session
        driver = webdriver.Chrome()
        try:
            failed = False
            while not complete:
                for i, week in enumerate(urls):

                    if i < resume_week:  # make sure earlier weeks are not repeated
                        continue

                    print('Data extraction started for week_%s' % str(i+1))

                    matches = {}
                    for j, u in enumerate(urls[week]):  # one url at a time

                        if i <= resume_week and j < resume_match:
                            # make sure previous matches are not repeated
                            continue

                        # special case: website not responding
                        if year == '2013' and i == 6 and j == 6:
                            continue

                        try:
                            matches['match_%s' % str(j+1)] = get_match_data(driver, u,
                                                                            match_variables)
                            print('Data extracted match_%s' % str(j+1))
                        except Exception:
                            # Remember where we stopped and retry from here.
                            # KeyboardInterrupt / SystemExit are deliberately
                            # not caught so the retry loop can be stopped.
                            failed = True
                            resume_week = i
                            resume_match = j
                            break

                    # make sure all the matches scraped so far are stored;
                    # merges with the partial result of an earlier attempt
                    season.setdefault('week_%s' % str(i+1), {}).update(matches)

                    if failed:
                        break

                if failed:
                    failed = False
                    failures += 1
                    if failures == 5:
                        break  # restart the browser
                else:
                    complete = True
        finally:
            # Always end the browser session, also on unexpected errors.
            # quit() shuts the driver down, whereas close() only closes the window.
            driver.quit()

    write_data(season, 'Data/Match_info/', year)

In [33]:
def get_json_data(filename):
    """Read the file ``filename`` and return its parsed JSON content."""
    with open(filename, 'r') as source:
        raw_text = source.read()
        parsed = json.loads(raw_text)
    return parsed

In [49]:
def get_match_data(driver, url, match_variables):
    """Load a match page and read all configured values from it.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the match page.
        match_variables: Mapping of result key -> XPath of the element whose
            text is stored under that key.

    Returns:
        Dict with one entry per key of ``match_variables`` plus the entries
        added by ``get_possession_info``.

    Raises:
        Whatever ``driver.find_element`` raises when one of the XPaths does
        not match an element on the page.
    """
    driver.get(url)
    time.sleep(3)  # fixed pause before the page is read

    match = {}

    for k, v in match_variables.items():
        # find_element(by, value) replaces find_element_by_xpath, which no
        # longer exists in current Selenium releases ("xpath" == By.XPATH).
        match[k] = driver.find_element("xpath", v).text

    match = get_possession_info(driver, match)  # possession data has a different structure

    return match

In [50]:
def get_possession_info(driver, match):
    """Add territory and possession values of the loaded page to ``match``.

    The texts of the elements with class ``Opta-Territories-value`` are
    stored, in page order, under the eight keys listed in ``var``. The
    overall possession of the home and away side is stored under
    ``overall_pos1`` and ``overall_pos2``.

    Args:
        driver: Selenium WebDriver with the match page loaded.
        match: Dict to extend; it is modified in place.

    Returns:
        The same ``match`` dict.

    Raises:
        Whatever ``driver.find_element`` raises when one of the two overall
        possession elements is missing.
    """
    # find_element(s)(by, value) replaces the find_element(s)_by_* helpers,
    # which no longer exist in current Selenium releases. "class name" and
    # "css selector" are the values of selenium's By.CLASS_NAME and
    # By.CSS_SELECTOR constants.
    pos = driver.find_elements("class name", 'Opta-Territories-value')

    var = ["team1_25", "team2_op_25", "team1_40", "team2_op_40",
           "team1_op_40", "team2_40", "team1_op_25", "team2_25"]

    # zip stops at the shorter sequence: fewer elements fill only the first
    # keys (as before), surplus elements are ignored instead of raising
    # IndexError.
    for name, p in zip(var, pos):
        match[name] = p.text

    match["overall_pos1"] = driver.find_element(
        "css selector", ".Opta-Possession-value.Opta-Home").text
    match["overall_pos2"] = driver.find_element(
        "css selector", ".Opta-Possession-value.Opta-Away").text

    return match

In [None]:
# Scrape and store the match data of every listed season, newest first.
for y in ('2017', '2016', '2015', '2014', '2013', '2012'):
    get_season(y)