# Match Stat Scraper

When I originally worked on this project, I only scraped data for the 2018 regular season and part of 2019. The remainder of 2018 was finished in another notebook. The code below scrapes all data that is still missing for 2013-2019.

In [1]:
#Scraping Imports
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
#SQL Imports
import mysql.connector
#Pandas imports
import pandas as pd

import os

In [2]:
#DB Connection
# Connection settings are read from environment variables so that no password
# has to be stored in the notebook. Each variable falls back to the value that
# used to be hard-coded here, so an unconfigured environment behaves as before.
mydb = mysql.connector.connect(
  host=os.environ.get("NRL_DB_HOST", "localhost"),
  user=os.environ.get("NRL_DB_USER", "root"),
  passwd=os.environ.get("NRL_DB_PASSWORD", ""),
  database=os.environ.get("NRL_DB_NAME", "NRL_data")
)
# buffered=True is passed through to the connector's cursor factory; the rest
# of the notebook shares this single cursor for every query.
mycursor = mydb.cursor(buffered=True)

ProgrammingError: 1045 (28000): Access denied for user 'root'@'localhost' (using password: NO)

## 1. Get URLs of all matches that have not been scraped

In [171]:
#Load every row of the Matches table into a DataFrame
all_matches_query = pd.read_sql_query(
    'SELECT id, date, url, home_team_id, away_team_id FROM Matches;',
    mydb,
)
all_match_df = pd.DataFrame(
    all_matches_query,
    columns=['id', 'date', 'url', 'home_team_id', 'away_team_id'],
)

#Collect the ids of matches that already have rows in PlayerMatchStats
already_scraped = 'SELECT DISTINCT match_id FROM PlayerMatchStats;'
mycursor.execute(already_scraped,)
results = mycursor.fetchall()
already_scraped_list = [row[0] for row in results]

#Keep only the matches whose id is not among the already scraped ones
not_yet_scraped = set(all_match_df['id']).difference(already_scraped_list)
is_unscraped = all_match_df['id'].isin(not_yet_scraped)
not_yet_scraped_df = all_match_df[is_unscraped]
not_yet_scraped_df

Unnamed: 0,id,date,url,home_team_id,away_team_id
1398,1467,2013-09-13,https://www.nrl.com/draw/nrl-premiership/2013/...,13,7
1399,1468,2013-09-14,https://www.nrl.com/draw/nrl-premiership/2013/...,4,10
1400,1469,2013-09-14,https://www.nrl.com/draw/nrl-premiership/2013/...,15,6
1401,1470,2013-09-15,https://www.nrl.com/draw/nrl-premiership/2013/...,3,8
1402,1471,2013-09-20,https://www.nrl.com/draw/nrl-premiership/2013/...,6,4
1403,1472,2013-09-21,https://www.nrl.com/draw/nrl-premiership/2013/...,7,8
1404,1473,2013-09-27,https://www.nrl.com/draw/nrl-premiership/2013/...,13,6
1405,1474,2013-09-28,https://www.nrl.com/draw/nrl-premiership/2013/...,15,8
1406,1475,2013-10-06,https://www.nrl.com/draw/nrl-premiership/2013/...,15,6


# 2. Scraping Functions

In [174]:
#Looks up a team's id from its nickname
def find_team_id(name):
    """Return the ``id`` of the Teams row whose ``nickname`` equals ``name``.

    The lookup runs on the notebook-wide ``mycursor``. When no row matches,
    ``fetchone()`` yields ``None`` and indexing it raises ``TypeError``.
    """
    mycursor.execute('SELECT id FROM Teams WHERE nickname = %s;', (name,))
    team_row = mycursor.fetchone()
    return team_row[0]

#Finds player based off name or creates entry in database for player
def find_or_create_player(first_name, last_name, team_id):
    """Return the id of the matching Players row, inserting one if none exists.

    A player matches when ``first_name`` is equal, ``last_name`` occurs
    anywhere inside the stored last name (``LIKE '%last_name%'``) and
    ``current_team`` equals ``team_id``. Only the first match is used.

    Parameters
    ----------
    first_name, last_name : str
        Name parts used for the lookup and, if needed, for the new row.
    team_id
        Value compared with / stored in ``Players.current_team``.

    Returns
    -------
    The id of the existing or newly inserted player.

    Raises
    ------
    LookupError
        If a row was inserted but its id can be determined neither from the
        cursor nor by repeating the lookup.
    """
    find_player_query = 'SELECT id FROM Players WHERE first_name = %s AND last_name LIKE %s AND current_team = %s LIMIT 1;'
    find_player_params = (first_name, '%' + last_name + '%', team_id)
    mycursor.execute(find_player_query, find_player_params)
    result = mycursor.fetchone()
    if result is not None:
        return result[0]

    insert_player_query = 'INSERT INTO Players (first_name, last_name, current_team) VALUES (%s, %s, %s);'
    mycursor.execute(insert_player_query, (first_name, last_name, team_id))
    mydb.commit()

    # Take the generated id straight from the cursor instead of recursing into
    # another lookup: if that lookup could not see the new row, the recursion
    # would insert the player again and again until the stack overflowed.
    # NOTE(review): assumes Players.id is AUTO_INCREMENT - confirm the schema.
    new_id = mycursor.lastrowid
    if new_id:
        return int(new_id)

    # No generated id reported: repeat the lookup exactly once.
    mycursor.execute(find_player_query, find_player_params)
    result = mycursor.fetchone()
    if result is None:
        raise LookupError(
            'Inserted player could not be found again: '
            + first_name + ' ' + last_name + ' (team ' + str(team_id) + ')'
        )
    return result[0]

In [156]:
#Splits the text of a player link into first and last name
def _split_player_name(name_field):
    """Return ``(first_name, last_name)`` parsed from a space-separated name.

    The first token becomes the first name and the last token the last name,
    both capitalised. If the second-to-last token is purely alphabetic it is
    prepended to the last name. A single-token name raises ``IndexError``.
    """
    parts = name_field.split(' ')
    first_name = parts[0].strip().capitalize()
    last_name = parts[-1].strip().capitalize()
    middle_name = parts[-2].strip()
    if middle_name.isalpha():
        last_name = middle_name.capitalize() + ' ' + last_name
    return first_name, last_name


#Function to scrape each match
def scrape_match(match, web_driver, timeout=10):
    """Scrape the player stat rows of both teams from one match page.

    Parameters
    ----------
    match
        Mapping (e.g. one row of ``not_yet_scraped_df``) that provides
        ``'url'``, ``'id'``, ``'home_team_id'`` and ``'away_team_id'``.
    web_driver
        Selenium WebDriver used to load the page and to read every element.
    timeout : int, optional
        Seconds to wait for each team's stats table to be present.

    Returns
    -------
    dict
        Maps ``'<first>_<last>_<team_id>'`` to the list
        ``[player_id, team_id, match id, stat, stat, ...]`` where the stats
        are the stripped ``innerText`` of the row's kept cells.

    Raises
    ------
    Anything raised while loading the page or waiting for a stats table
    propagates to the caller (presumably Selenium's timeout error when the
    table never appears - TODO confirm). Failures inside a single table row
    are swallowed and the row is skipped.
    """
    # Table cells that are never read.
    # NOTE(review): presumably blank/separator columns - confirm on the page.
    skipped_columns = {5, 7, 15, 17, 21, 34, 40, 47, 56, 64}
    # First stats table holds the home team, the second the away team.
    team_key_for_table = {'1': 'home_team_id', '2': 'away_team_id'}
    results = {}

    # Build the wait from the driver that was passed in, so the function works
    # with whichever driver the caller supplies instead of notebook globals.
    wait = WebDriverWait(web_driver, timeout)

    #Use Selenium WebDriver to scrape data and automate the process of flipping through URLs
    web_driver.get(match['url'])
    for xpath in ['1', '2']:
        table_body = '//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody'
        wait.until(EC.presence_of_element_located((By.XPATH, table_body)))
        for i in range(1, 20):
            row = table_body + '/tr[' + str(i) + ']'
            try:
                # find_element(By.XPATH, ...) is used rather than
                # find_element_by_xpath, which newer Selenium releases no
                # longer provide.
                name_field = web_driver.find_element(By.XPATH, row + '/td[2]/a').get_attribute('innerText').strip()
                first_name, last_name = _split_player_name(name_field)
                team_id = match[team_key_for_table[xpath]]
                full_name = first_name + '_' + last_name + '_' + str(team_id)
                player_id = find_or_create_player(first_name, last_name, str(team_id))

                player_stat_list = [player_id, team_id, match['id']]
                for column in range(3, 67):
                    if column in skipped_columns:
                        continue
                    stat_field = web_driver.find_element(By.XPATH, row + '/td[' + str(column) + ']')
                    player_stat_list.append(stat_field.get_attribute('innerText').strip())
                print(player_stat_list)
                results[full_name] = player_stat_list
            except Exception:
                # Best effort per row: a row that does not exist or cannot be
                # parsed is skipped. Catching Exception (not a bare except)
                # keeps Ctrl-C / kernel interrupt working.
                continue
    print('Scraping Success: ' + match['url'])
    return results

In [157]:
#Function to save each match's data to a CSV for insertion into the DB
def save_data_to_csv(match, match_dict):
    """Write one match's scraped player stats to a CSV file.

    The file is written to
    ``./csv_files/<year>/<month>/<year>_<month>_MatchID_<id>.csv``; missing
    directories (including ``./csv_files`` itself) are created.

    Parameters
    ----------
    match
        Mapping providing ``'date'`` (an object with ``.year`` and ``.month``)
        and ``'id'``.
    match_dict : dict
        Maps a player key to a list with one value per entry of
        ``column_names`` below (57 values), as returned by ``scrape_match``.

    Cells equal to ``'-'`` and missing values are stored as ``0``.
    """
    column_names = ['player_id', 'team_id', 'match_id', 'number', 'position', 'minutes_played', 'points', 'tries',
                'conversions','conversion_attempts', 'penalty_goals', 'conversion_percentage','field_goals',
                'fantasy_points', 'total_runs', 'total_run_metres', 'kick_return_metres', 'post_contact_metres',
                'line_breaks', 'line_break_assists', 'try_assists', 'line_engaged_runs', 'tackle_breaks', 'hit_ups',
                'play_the_ball', 'average_play_the_ball_seconds', 'dummy_half_runs', 'dummy_half_run_metres', 
                'steals', 'offloads', 'dummy_passes', 'passes', 'receipts', 'pass_to_run_ratio', 'tackle_percentage',
                'tackles_made', 'tackles_missed', 'ineffective_tackles', 'intercepts', 'kicks_defused', 'kicks',
                'kicking_metres', 'forced_drop_outs', 'bomb_kicks', 'grubbers', 'fourty_twenty',
                'cross_field_kicks', 'kicked_dead', 'errors', 'handling_errors', 'one_on_ones_lost', 'penalties',
                'on_report', 'sin_bins', 'send_offs', 'stint_one', 'stint_two']
    year = match['date'].year
    month = match['date'].month
    match_id = match['id']
    csv_filename = str(year) + '_' + str(month) + '_' + 'MatchID_' + str(match_id) + '.csv'

    # One row per player; reset_index() turns the player key into an 'index' column.
    csv_data = pd.DataFrame.from_dict(match_dict, orient='index', columns=column_names).reset_index()
    print(csv_data)
    # fillna(0) replaces the former replace({pd.np.nan: 0}); the pd.np alias
    # no longer exists in current pandas releases.
    csv_data = csv_data.replace('-', 0).fillna(0)

    # makedirs creates every missing level at once and tolerates existing
    # directories, so a missing ./csv_files folder is no longer an error.
    monthdir = './csv_files/' + str(year) + '/' + str(month)
    os.makedirs(monthdir, exist_ok=True)
    csv_data.to_csv(monthdir + '/' + csv_filename)
    print("CSV success: " + str(match_id))

# 3. Scraping Process

In [None]:
#Set Up WebDriver
# The value returned by ChromeDriverManager().install() is handed to
# webdriver.Chrome as its first positional argument (presumably the path of
# the chromedriver binary it installed - TODO confirm against the
# webdriver_manager version in use).
driver = webdriver.Chrome(ChromeDriverManager().install())
# Explicit-wait helper bound to this driver, constructed with a timeout of 10.
wait = WebDriverWait(driver, 10)

In [159]:
# URLs of matches whose scrape raised an error; filled by the scraping loop
# and later used to look the affected matches up in the database.
scraping_errors = []
# ids of matches that were scraped but could not be written to a CSV file.
csv_conversion_errors = []

In [None]:
# iterrows() yields (index label, row); only the row is needed here.
for _row_label, match_row in not_yet_scraped_df.iterrows():
    #For each match, run the scraping function from step 2
    try:
        match_data = scrape_match(match_row, driver)
    except Exception:
        #If the scraping function fails at any step, save URL to scraping_error list
        # Exception (rather than a bare except) lets a keyboard/kernel
        # interrupt stop the run instead of being logged as a failed match.
        print('scraping error: ' + str(match_row['url']))
        scraping_errors.append(match_row['url'])
        continue

    #If successful, run the save_data_to_csv function from step 2
    try:
        save_data_to_csv(match_row, match_data)
    except Exception:
        #If unsuccessful, save match id to a CSV conversion error list
        print("CSV error: " + str(match_row['id']))
        csv_conversion_errors.append(match_row['id'])

# 3. Update URLs for matches which could not be scraped

In [169]:
#Get match ids of errored URLs for manual updating
# The URLs are passed as query parameters (one %s placeholder per URL) rather
# than formatted into the SQL text: formatting a Python tuple yields invalid
# SQL for a single URL ("('x',)") and for none ("()"), and depends on Python's
# repr quoting matching SQL string syntax.
if scraping_errors:
    placeholders = ', '.join(['%s'] * len(scraping_errors))
    find_incorrect_url_matches = 'SELECT id, url FROM Matches WHERE url IN ({});'.format(placeholders)
    mycursor.execute(find_incorrect_url_matches, tuple(scraping_errors))
    results = mycursor.fetchall()
else:
    # Nothing failed, so there is nothing to look up.
    results = []
print(results)

[(1467, 'https://www.nrl.com/draw/nrl-premiership/2013/finals-week-1/rabbitohs-vs-storm/'), (1468, 'https://www.nrl.com/draw/nrl-premiership/2013/finals-week-1/sharks-vs-cowboys/'), (1469, 'https://www.nrl.com/draw/nrl-premiership/2013/finals-week-1/roosters-vs-sea-eagles/'), (1470, 'https://www.nrl.com/draw/nrl-premiership/2013/finals-week-1/bulldogs-vs-knights/'), (1471, 'https://www.nrl.com/draw/nrl-premiership/2013/finals-week-2/sea-eagles-vs-sharks/'), (1472, 'https://www.nrl.com/draw/nrl-premiership/2013/finals-week-2/storm-vs-knights/'), (1473, 'https://www.nrl.com/draw/nrl-premiership/2013/finals-week-3/rabbitohs-vs-sea-eagles/'), (1474, 'https://www.nrl.com/draw/nrl-premiership/2013/finals-week-3/roosters-vs-knights/'), (1475, 'https://www.nrl.com/draw/nrl-premiership/2013/grand-final/roosters-vs-sea-eagles/')]


In [170]:
#Manually update URLs which could not be scraped
# Each entry pairs the corrected URL with the id of the match it belongs to.
corrected_match_urls = [
    ("https://www.nrl.com/draw/nrl-premiership/2013/round-27/rabbitohs-vs-storm/", "1467"),
    ("https://www.nrl.com/draw/nrl-premiership/2013/round-27/sharks-vs-cowboys/", "1468"),
    ("https://www.nrl.com/draw/nrl-premiership/2013/round-27/roosters-vs-sea-eagles/", "1469"),
    ("https://www.nrl.com/draw/nrl-premiership/2013/round-27/knights-vs-bulldogs/", "1470"),
    ("https://www.nrl.com/draw/nrl-premiership/2013/round-28/sea-eagles-vs-sharks/", "1471"),
    ("https://www.nrl.com/draw/nrl-premiership/2013/round-28/storm-vs-knights/", "1472"),
    ("https://www.nrl.com/draw/nrl-premiership/2013/round-29/rabbitohs-vs-sea-eagles/", "1473"),
    ("https://www.nrl.com/draw/nrl-premiership/2013/round-29/roosters-vs-knights/", "1474"),
    ("https://www.nrl.com/draw/nrl-premiership/2013/round-30/roosters-vs-sea-eagles/", "1475"),
]
update_url_statement = "UPDATE Matches SET url = %s WHERE id = %s"
mycursor.executemany(update_url_statement, corrected_match_urls)
mydb.commit()

# 4. Repeat process from step 1

After several iterations, I successfully created CSV files of data for each match. The two issues that caused errors were:
1. Incorrect URL -> Resolution was to update the URL
2. Selenium timeout error on the URL -> Resolution was to simply re-scrape the page

In [149]:
#Release everything the notebook opened: the cursor first, then the database
#connection it belongs to, and finally the Chrome session started for scraping.
mycursor.close()
mydb.close()
driver.quit()

True