# Scrape Match Data - 2018

When I originally worked on this project, I only scraped data for the regular season of 2018 and part of 2019. The below code finishes scraping for the 2018 post-season.

In [3]:
#Scraping Imports
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
#SQL Imports
import mysql.connector
#Pandas imports
import pandas as pd

In [4]:
#DB Connection
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  passwd="",
  database="NRL_data"
)
mycursor = mydb.cursor(buffered=True)

## Get all URL's of matches with missing data

In [5]:
#Get all matches
all_matches_query = pd.read_sql_query('SELECT id, date, url, home_team_id, away_team_id FROM Matches;', mydb)
all_match_df = pd.DataFrame(all_matches_query, columns=['id', 'date', 'url', 'home_team_id', 'away_team_id'])

#Find matches that were already scraped
already_scraped = 'SELECT DISTINCT match_id FROM PlayerMatchStats;'
mycursor.execute(already_scraped,)
results = mycursor.fetchall()
already_scraped_list = list(map(lambda x: x[0], results))

#Remove matches which were already scraped and save remaining match info to not_yet_scraped_df
not_yet_scraped = set(list(all_match_df['id'])) - set(already_scraped_list)
not_yet_scraped_df = all_match_df[all_match_df['id'].isin(not_yet_scraped)]
#print(not_yet_scraped_df)

In [6]:
#Instantiate a blank dictionary of matches that need to be scraped by year
scraping_dict = {}

#Convert 'date' column of not yet_scraped_df to datetime
not_yet_scraped_df['date'] = pd.to_datetime(not_yet_scraped_df['date'])

#Populate scraping_dict with key = year and values = matches for that year
scraping_dict = {}
for year in list(not_yet_scraped_df['date'].dt.year.unique()):
    scraping_dict[year] = not_yet_scraped_df[not_yet_scraped_df['date'].dt.year == year]

#Validate output with 2018 as an example
#scraping_dict[2018]

In [7]:
#Create dictionary to store scraping results
player_match_stats = dict.fromkeys(scraping_dict.keys(), {})

{}

# Scraping Process

## Create functions to identify team and players by id

In [8]:
def find_team_id(name):
    find_team_query = 'SELECT id FROM Teams WHERE nickname = %s;'
    mycursor.execute(find_team_query, (name,))
    return mycursor.fetchone()[0]

def find_or_create_player(first_name, last_name, team_id):
    find_player_query = 'SELECT id FROM Players WHERE first_name = %s AND last_name LIKE %s AND current_team = %s LIMIT 1;'
    mycursor.execute(find_player_query, (first_name, '%' + last_name + '%', team_id))
    result = mycursor.fetchone()
    if result is None:
        insert_player_query = 'INSERT INTO Players (first_name, last_name, current_team) VALUES (%s, %s, %s);'
        data = (first_name, last_name, team_id)
        #print(data)
        mycursor.execute(insert_player_query, data)
        mydb.commit()
        result = find_or_create_player(first_name, last_name, team_id)
        return int(result)
    else:
        result = result[0]
        #print('id = ' + str(result))
        return result

In [None]:
#Set Up WebDriver
driver = webdriver.Chrome(ChromeDriverManager().install())
wait = WebDriverWait(driver, 10)

In [None]:
for year in [2018]:
    year_dict = {}
    
    #Create a blank list to store URLs where scraping fails
    year_dict['errors'] = []
    for match in scraping_dict[year].iterrows():
        match = match[1]
        
        #Create match key in the format "round + home_team + _ + away_team"
        match_key = match['url'].split(str(year) + '/')[1][:-1]
        for char in ['-vs-', '-v-', '/', '-']:
            match_key = match_key.replace(char, '_')
        year_dict[match_key] = {}
        
        try:
            print(match['url'])
            
            #Use Selenium WebDriver to scrape data and automate the process of flipping through URLs
            driver.get(match['url'])
            #home_xpath_div = '1', away_xpath_div = '2'
            for xpath in ['1', '2']:
                wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody')))
                for i in range(1, 18):
                    name_field = driver.find_element_by_xpath('//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody/tr['+ str(i) +']/td[2]/a').get_attribute('innerText').strip()
                    first_name = name_field.split(' ')[0].strip().capitalize()
                    last_name = name_field.split(' ')[-1].strip().capitalize()
                    middle_name = name_field.split(' ')[-2].strip()
                    if middle_name.isalpha():
                        last_name = middle_name.capitalize() + ' ' + last_name
                    if xpath == '1':
                        team_id = match['home_team_id']
                    elif xpath == '2':
                        team_id = match['away_team_id']
                    full_name = first_name + '_' + last_name + '_' + str(team_id)
                    player_id = find_or_create_player(first_name, last_name, str(team_id))
                    
                    player_stat_list = []
                    player_stat_list.append(player_id)
                    player_stat_list.append(team_id)
                    player_stat_list.append(match['id'])
                    
                    for column in range(3, 67):
                        if column in [5, 7, 15, 17, 21, 34, 40, 47, 56, 64]:
                            continue
                        else:
                            stat_field = driver.find_element_by_xpath('//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody/tr[' + str(i) + ']/td[' + str(column) + ']')
                            player_stat_list.append(stat_field.get_attribute('innerText').strip())
                    print(player_stat_list)
                    year_dict[match_key][full_name] = player_stat_list
        except:
            print('error: ' + match['url'])
            year_dict['errors'].append(match['url'])
        print(year_dict[match_key])
    
    csv_data = pd.DataFrame.from_dict(year_dict, orient='index').replace('-', 0)
    #csv_data = pd.DataFrame.from_dict(year_dict, orient='index', columns=column_names).replace('-', 0)
    csv_data = csv_data.replace({pd.np.nan: 0})

    #print(csv_data)
    #print(year_dict)
    player_match_stats[year] = year_dict

## Manually update error URLs

In [None]:
#Get match ids of errored URLs for manual updating
errors = player_match_stats[2018]['errors']

find_incorrect_url_matches = 'SELECT id, url FROM Matches WHERE url IN {};'.format(tuple(errors))
mycursor.execute(find_incorrect_url_matches,)
results = mycursor.fetchall()
#print(results)

In [11]:
#Manually update URLs which couldn't be scraped

mycursor.executemany("UPDATE Matches SET url = %s WHERE id = %s",
                    [("https://www.nrl.com/draw/nrl-premiership/2018/finals-week-2/game-1/", "466"),
                    ("https://www.nrl.com/draw/nrl-premiership/2018/finals-week-2/game-12/", "467"),
                    ("https://www.nrl.com/draw/nrl-premiership/2018/finals-week-3/game-1/", "468"),
                    ("https://www.nrl.com/draw/nrl-premiership/2018/finals-week-3/game-12/", "469")])
mydb.commit()

In [16]:
#Remove errors from match dictionary
player_match_stats[2018].keys()
player_match_stats[2018] = player_match_stats[2018].pop('errors')

In [None]:
#convert dictionary into Pandas dataframe for easy manipulation and upload
player_match_stats[2018] = year_dict
player_match_stats[2018]
stats_dict = pd.DataFrame()
for match in player_match_stats[2018].keys():
    match_df = pd.DataFrame.from_dict(player_match_stats[2018][match], orient='index')
    stats_dict = stats_dict.append(match_df)
stats_dict

## Run scraping code for updated match URLs

In [32]:
errored_match_query = pd.read_sql_query('SELECT id, date, url, home_team_id, away_team_id FROM Matches WHERE id IN (466, 467, 468, 469);', mydb)
errored_match_df = pd.DataFrame(errored_match_query, columns=['id', 'date', 'url', 'home_team_id', 'away_team_id'])

In [33]:
errored_match_df

Unnamed: 0,id,date,url,home_team_id,away_team_id
0,466,2018-09-14,https://www.nrl.com/draw/nrl-premiership/2018/...,4,12
1,467,2018-09-15,https://www.nrl.com/draw/nrl-premiership/2018/...,13,14
2,468,2018-09-21,https://www.nrl.com/draw/nrl-premiership/2018/...,7,4
3,469,2018-09-22,https://www.nrl.com/draw/nrl-premiership/2018/...,15,13


In [12]:
#Re-run scraping query and append results to the DataFrame used to upload data

errored_match_query = pd.read_sql_query('SELECT id, date, url, home_team_id, away_team_id FROM Matches WHERE id = 465;', mydb)
errored_match_df = pd.DataFrame(errored_match_query, columns=['id', 'date', 'url', 'home_team_id', 'away_team_id'])
for match in errored_match_df.iterrows():
        match = match[1]
        match_key = match['url'].split(str(year) + '/')[1][:-1]
        for char in ['-vs-', '-v-', '/', '-']:
            match_key = match_key.replace(char, '_')
        year_dict[match_key] = {}
        
        try:
            print(match['url'])
            driver.get(match['url'])
            #home_xpath_div = '1', away_xpath_div = '2'
            for xpath in ['1', '2']:
                wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody')))
                for i in range(1, 18):
                    name_field = driver.find_element_by_xpath('//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody/tr['+ str(i) +']/td[2]/a').get_attribute('innerText').strip()
                    first_name = name_field.split(' ')[0].strip().capitalize()
                    last_name = name_field.split(' ')[-1].strip().capitalize()
                    middle_name = name_field.split(' ')[-2].strip()
                    if middle_name.isalpha():
                        last_name = middle_name.capitalize() + ' ' + last_name
                    if xpath == '1':
                        team_id = match['home_team_id']
                    elif xpath == '2':
                        team_id = match['away_team_id']
                    full_name = first_name + '_' + last_name + '_' + str(team_id)
                    player_id = find_or_create_player(first_name, last_name, str(team_id))
                    
                    player_stat_list = []
                    player_stat_list.append(player_id)
                    player_stat_list.append(team_id)
                    player_stat_list.append(match['id'])
                    
                    for column in range(3, 67):
                        if column in [5, 7, 15, 17, 21, 34, 40, 47, 56, 64]:
                            continue
                        else:
                            stat_field = driver.find_element_by_xpath('//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody/tr[' + str(i) + ']/td[' + str(column) + ']')
                            player_stat_list.append(stat_field.get_attribute('innerText').strip())
                    print(player_stat_list)
                    year_dict[match_key][full_name] = player_stat_list
        except:
            #print('error: ' + match['url'])
            year_dict['errors'].append(match['url'])
        #print(year_dict[match_key])

https://www.nrl.com/draw/nrl-premiership/2018/finals-week-1/broncos-vs-dragons/


In [None]:
player_match_stats[2018] = year_dict
player_match_stats[2018]
stats_dict = pd.DataFrame()
for match in player_match_stats[2018].keys():
    match_df = pd.DataFrame.from_dict(player_match_stats[2018][match], orient='index')
    stats_dict = stats_dict.append(match_df)
#stats_dict

In [49]:
#Update column names
column_names = ['player_id', 'team_id', 'match_id', 'number', 'position', 'minutes_played', 'points', 'tries',
                'conversions','conversion_attempts', 'penalty_goals', 'conversion_percentage','field_goals',
                'fantasy_points', 'total_runs', 'total_run_metres', 'kick_return_metres', 'post_contact_metres',
                'line_breaks', 'line_break_assists', 'try_assists', 'line_engaged_runs', 'tackle_breaks', 'hit_ups',
                'play_the_ball', 'average_play_the_ball_seconds', 'dummy_half_runs', 'dummy_half_run_metres', 
                'steals', 'offloads', 'dummy_passes', 'passes', 'receipts', 'pass_to_run_ratio', 'tackle_percentage',
                'tackles_made', 'tackles_missed', 'ineffective_tackles', 'intercepts', 'kicks_defused', 'kicks',
                'kicking_metres', 'forced_drop_outs', 'bomb_kicks', 'grubbers', 'fourty_twenty',
                'cross_field_kicks', 'kicked_dead', 'errors', 'handling_errors', 'one_on_ones_lost', 'penalties',
                'on_report', 'sin_bins', 'send_offs', 'stint_one', 'stint_two']

player_match_stats[2018] = year_dict
player_match_stats[2018]
year_df = pd.DataFrame()

#Export final results to CSV for insertion to db
for match in player_match_stats[2018].keys():
    match_df = pd.DataFrame.from_dict(player_match_stats[2018][match], orient='index', columns=column_names).reset_index()
    year_df = year_df.append(match_df, ignore_index = True)
year_df = year_df.replace('-', 0).replace({pd.np.nan: 0})
#year_df
year_df.to_csv('./csv_files/' + str(year) + '_data.csv')



In [50]:
mycursor.close()

True