In [None]:
import csv
import os
import re
from time import sleep

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

In [None]:
#initiating selenium Chrome driver
#the chromedriver location can be overridden with the CHROMEDRIVER_PATH
#environment variable; the original machine-specific path is the fallback
CHROMEDRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    'C:/Users/Nik/OneDrive/dataScience/DSProjects/ChromeDriver/chromedriver.exe',
)
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)

In [None]:
#functions

def getYearMatchURLs(year = 2009):
    """Return the match-page links found on one season's fixtures archive.

    Loads the season's archive page in the module-level Selenium ``driver``,
    counts the ``#round...`` anchors, opens the Fixtures tab, picks the round
    whose number equals that count from the dropdown, and then collects the
    ``href`` of the parent element of every ``fa fa-chevron-right`` icon.

    Parameters
    ----------
    year : int, default 2009
        Season inserted into the archive URL.

    Returns
    -------
    list
        One ``href`` value per chevron icon, in page order.
    """
    archiveURL = ''.join(['https://super.rugby/superrugby/fixtures/archives/', str(year), '-super-rugby/'])
    driver.get(archiveURL)
    sleep(3)  #give time for javascript to load more HTML

    #snapshot the rendered page so the round anchors can be counted
    pageHTML = driver.find_element_by_tag_name('html').get_attribute('innerHTML')
    soup = BeautifulSoup(pageHTML, 'html.parser')
    roundAnchors = soup.findAll("a", {"href" : re.compile('#round.*')})
    lastRound = len(roundAnchors)

    #switch to the Fixtures tab
    driver.find_element_by_xpath("//a[@data-toggle='tab' and contains(text(),'Fixtures')]").click()
    sleep(3)

    #open the round dropdown and choose the last round
    #(original author's note: loading the last round page loads links for all the year's games)
    driver.find_element_by_xpath("//button[@data-toggle='dropdown']").click()
    driver.find_element_by_xpath("//a[@href='#round" + str(lastRound) + "']").click()
    sleep(3)

    #each chevron icon sits inside the anchor that carries the link
    chevrons = driver.find_elements_by_xpath("//i[@class='fa fa-chevron-right']")
    return [icon.find_element_by_xpath("./..").get_attribute('href') for icon in chevrons]

def getMatchData(matchURL):
    """Scrape teams, scores, referee and penalty counts from one match page.

    The page is loaded in the module-level Selenium ``driver`` and parsed with
    BeautifulSoup. Any field that cannot be extracted is filled with
    ``matchURL`` itself, so incomplete rows can be spotted and traced back to
    their page afterwards.

    Parameters
    ----------
    matchURL : str
        Address of the match page.

    Returns
    -------
    list
        ``[matchYear, matchID, leftTeam, leftScore, rightTeam, rightScore,
        refName, leftPenalties, rightPenalties]``.
    """
    driver.get(matchURL)

    sleep(3)  #give the page's javascript time to render

    soup = BeautifulSoup(driver.find_element_by_tag_name('html').get_attribute('innerHTML'), 'html.parser')

    #fixed-position slices of the URL
    #NOTE(review): presumably the 4-digit season and a 3-character match id;
    #this depends on the exact URL layout - confirm against real links
    matchYear = matchURL[52:56]
    matchID = matchURL[-3:]

    #Pre-fill the penalty fields with the "missing" marker. Without this, a
    #page that has no 'Penalties conceded' row left both names unassigned and
    #the return statement raised UnboundLocalError.
    leftPenalties = matchURL
    rightPenalties = matchURL

    try:
        matchHeader = soup.find("table", {"class" : re.compile('Opta-MatchHeader Opta-MatchHeader-Crested.*')})

        leftTeam = matchHeader.find("td", {"class" : re.compile('Opta-Team-Left Opta-TeamName.*')}).renderContents().strip().decode()
        rightTeam = matchHeader.find("td", {"class" : re.compile('Opta-Team-Right Opta-TeamName.*')}).renderContents().strip().decode()

        scores = matchHeader.findAll("td", {"class": re.compile('Opta-Score.*')})
        leftScore = scores[0].find("span").renderContents().strip().decode()
        rightScore = scores[1].find("span").renderContents().strip().decode()

        refName = soup.find("div", {"class" : "Opta-Matchdata"}).find("dd").renderContents().strip().decode()

        #a tbody that contains a 'Penalties conceded' row carries the two
        #counts in the first and third cells of its second row
        for tbody in soup('tbody'):
            rows = tbody('tr')
            if any(tr.text == 'Penalties conceded' for tr in rows):
                cells = rows[1]('td')
                leftPenalties = int(cells[0].text)
                rightPenalties = int(cells[2].text)
    except Exception:
        #Deliberate best effort: mark every scraped field with the URL rather
        #than abort the whole run. `Exception` (not a bare except) so that
        #Ctrl-C / kernel interrupt still stops a long scrape.
        leftTeam = matchURL
        rightTeam = matchURL
        leftScore = matchURL
        rightScore = matchURL
        refName = matchURL
        leftPenalties = matchURL
        rightPenalties = matchURL

    return [matchYear, matchID, leftTeam, leftScore, rightTeam, rightScore, refName, leftPenalties, rightPenalties]

In [None]:
#scrape the match links of every season from 2009 up to and including 2019
yearURLs_dict = dict()

for year in range(2009, 2020):
    yearURLs_dict.update({year: getYearMatchURLs(year)})

In [None]:
#check we have all URLs: print how many links were collected per season
for seasonURLs in yearURLs_dict.values():
    print(len(seasonURLs))

In [None]:
#Save URLs as csv just in case
#one row per year, in dict order, so the dict can be rebuilt row by row
#from https://thispointer.com/python-how-to-append-a-new-row-to-an-existing-csv-file/

#The file is opened once, in write mode: re-running this cell now replaces
#the saved rows instead of appending a second copy after the old ones (the
#reload cell only reads the first rows, so appended rows were never seen).
with open('yearURLs.csv', 'w', newline='') as file:
    # Create a writer object from csv module
    csv_writer = csv.writer(file)
    # Write each year's list of URLs as one row
    csv_writer.writerows(yearURLs_dict.values())

In [None]:
#recreating dict from the csv file
#every year starts with an empty list; a year keeps it if the file has
#fewer rows than there are years
yearURLs_dict = {year: [] for year in range(2009, 2020)}

#csv.reader undoes whatever quoting csv.writer applied and strips the line
#terminator, which a plain split(",") does not; rows are matched to years
#purely by position in the file
with open('yearURLs.csv', newline='') as file:
    for year, row in zip(range(2009, 2020), csv.reader(file)):
        yearURLs_dict[year] = row

In [None]:
#one empty list per output column; insertion order fixes the column order
data_dict = {
    column: []
    for column in (
        'matchYear',
        'matchID',
        'leftTeam',
        'leftScore',
        'rightTeam',
        'rightScore',
        'refName',
        'leftPenalties',
        'rightPenalties',
    )
}

In [None]:
#Get data from each match page

#Column that receives each position of the list returned by getMatchData.
#NOTE: positions 3 and 4 hold leftScore and rightTeam, yet they are stored
#under 'rightTeam' and 'leftScore'. This mislabel is kept as-is because the
#later "correct column headers" rename swaps the two labels back.
targetColumns = [
    'matchYear',
    'matchID',
    'leftTeam',
    'rightTeam',
    'leftScore',
    'rightScore',
    'refName',
    'leftPenalties',
    'rightPenalties',
]

for seasonURLs in yearURLs_dict.values():
    for URL in seasonURLs:
        matchStats = getMatchData(URL)
        for column, value in zip(targetColumns, matchStats):
            data_dict[column].append(value)

print(data_dict)

In [None]:
#turn the per-column lists into a DataFrame (one row per scraped match)
data_df = pd.DataFrame(data=data_dict)

In [None]:
#save df as csv to manually find missing data
#(this overwrites any existing data.csv in the working directory)
data_df.to_csv(path_or_buf='data.csv', index=False)

In [None]:
#recreate df from updated csv file
data_df = pd.read_csv(filepath_or_buffer='data.csv')

In [None]:
#correct column headers
#the scraping loop stored scores under 'rightTeam' and team names under
#'leftScore', so the two labels are exchanged here
#NOTE: running this cell a second time on the same frame swaps them back
headerFix = {'rightTeam' : 'leftScore', 'leftScore' : 'rightTeam'}
data_df.rename(columns = headerFix, inplace = True)

In [None]:
#absolute difference between the two scores
scoreGap = data_df['leftScore'].sub(data_df['rightScore'])
data_df['winningMargin'] = scoreGap.abs()

In [None]:
#signed penalty difference: left side minus right side
data_df['penaltyDiff(l-r)'] = data_df['leftPenalties'].sub(data_df['rightPenalties'])

In [None]:
#Get array of unique ref names
uniqueRefs = pd.unique(data_df['refName'])
refName_series = pd.Series(uniqueRefs)
#save as csv to manually add refs' countries
#NOTE: this rewrites refNames.csv with the names only; the cell that builds
#refCountry_dict reads a second column from the same file, so countries must
#be added by hand after this runs, and re-running this cell discards them
refName_series.to_csv('refNames.csv', index = False, header = False)

In [None]:
def _readPairs(csvPath):
    """Return {first column: second column} for every row of the CSV at csvPath."""
    with open(csvPath, mode='r') as file:
        return {rows[0]:rows[1] for rows in csv.reader(file)}

#create refCountry_dict to get ref's country in data_df
refCountry_dict = _readPairs('refNames.csv')

#do the same with team countries
teamCountry_dict = _readPairs('teamCountries.csv')

In [None]:
#look up each referee's country; names missing from the dict map to NaN
refCountries = data_df['refName'].map(refCountry_dict)
data_df['refCountry'] = refCountries

In [None]:
#look up the country of the left team, then of the right team
for side in ('left', 'right'):
    data_df[side + 'Country'] = data_df[side + 'Team'].map(teamCountry_dict)