# Match Stat Scraper

In [1]:
#Scraping Imports
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
#SQL Imports
import mysql.connector
#Pandas imports
import pandas as pd

In [2]:
#DB Connection
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  passwd="NYg1@nts",
  database="NRL_data"
)
mycursor = mydb.cursor(buffered=True)

## Get all URL's of matches with missing data

In [3]:
#Get all matches
all_matches_query = pd.read_sql_query('SELECT id, date, url, home_team_id, away_team_id FROM Matches;', mydb)
all_match_df = pd.DataFrame(all_matches_query, columns=['id', 'date', 'url', 'home_team_id', 'away_team_id'])

#Find matches that were already scraped
already_scraped = 'SELECT DISTINCT match_id FROM PlayerMatchStats;'
mycursor.execute(already_scraped,)
results = mycursor.fetchall()
already_scraped_list = list(map(lambda x: x[0], results))

#Remove matches which were already scraped and save remaining match info to not_yet_scraped_df
not_yet_scraped = set(list(all_match_df['id'])) - set(already_scraped_list)
not_yet_scraped_df = all_match_df[all_match_df['id'].isin(not_yet_scraped)]
#print(not_yet_scraped_df)

In [4]:
#Instantiate a blank dictionary of matches that need to be scraped by year
scraping_dict = {}

#Convert 'date' column of not yet_scraped_df to datetime
not_yet_scraped_df['date'] = pd.to_datetime(not_yet_scraped_df['date'])

#Populate scraping_dict with key = year and values = matches for that year
scraping_dict = {}
for year in list(not_yet_scraped_df['date'].dt.year.unique()):
    scraping_dict[year] = not_yet_scraped_df[not_yet_scraped_df['date'].dt.year == year]

#Validate output with 2018 as an example
#scraping_dict[2018]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [5]:
#Create dictionary to store scraping results
player_match_stats = dict.fromkeys(scraping_dict.keys(), {})
player_match_stats

{2019: {}, 2017: {}, 2016: {}, 2015: {}, 2014: {}, 2013: {}}

# Scraping Process

## Create functions to identify team and players by id

In [6]:
def find_team_id(name):
    find_team_query = 'SELECT id FROM Teams WHERE nickname = %s;'
    mycursor.execute(find_team_query, (name,))
    return mycursor.fetchone()[0]

def find_or_create_player(first_name, last_name, team_id):
    find_player_query = 'SELECT id FROM Players WHERE first_name = %s AND last_name LIKE %s AND current_team = %s LIMIT 1;'
    mycursor.execute(find_player_query, (first_name, '%' + last_name + '%', team_id))
    result = mycursor.fetchone()
    if result is None:
        insert_player_query = 'INSERT INTO Players (first_name, last_name, current_team) VALUES (%s, %s, %s);'
        data = (first_name, last_name, team_id)
        #print(data)
        mycursor.execute(insert_player_query, data)
        mydb.commit()
        result = find_or_create_player(first_name, last_name, team_id)
        return int(result)
    else:
        result = result[0]
        #print('id = ' + str(result))
        return result

In [27]:
def scrape_match(match, results_dict, web_driver):
    print(match)
    match = match[1]
    print(match)
    try:
        match_key = match['url'].split(str(year) + '/')[1][:-1]
        for char in ['-vs-', '-v-', '/', '-']:
            match_key = match_key.replace(char, '_')
    except:
        match_key = match['url'].split('/')[1][:-1]
    results_dict[match_key] = {}
        
    try:
        print(match['url'])
        print(web_driver)
        web_driver.get(match['url'])
        for xpath in ['1', '2']:
            wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody')))
            for i in range(1, 18):
                name_field = driver.find_element_by_xpath('//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody/tr['+ str(i) +']/td[2]/a').get_attribute('innerText').strip()
                first_name = name_field.split(' ')[0].strip().capitalize()
                last_name = name_field.split(' ')[-1].strip().capitalize()
                middle_name = name_field.split(' ')[-2].strip()
                if middle_name.isalpha():
                    last_name = middle_name.capitalize() + ' ' + last_name
                if xpath == '1':
                    team_id = match['home_team_id']
                elif xpath == '2':
                    team_id = match['away_team_id']
                full_name = first_name + '_' + last_name + '_' + str(team_id)
                player_id = find_or_create_player(first_name, last_name, str(team_id))
                    
                player_stat_list = []
                player_stat_list.append(player_id)
                player_stat_list.append(team_id)
                player_stat_list.append(match['id'])
                    
                for column in range(3, 67):
                    if column in [5, 7, 15, 17, 21, 34, 40, 47, 56, 64]:
                        continue
                    else:
                        stat_field = driver.find_element_by_xpath('//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody/tr[' + str(i) + ']/td[' + str(column) + ']')
                        player_stat_list.append(stat_field.get_attribute('innerText').strip())
                print(player_stat_list)
                results_dict[match_key][full_name] = player_stat_list
        return results_dict
    except:
        print('error: ' + match['url'])
        results_dict['errors'].append(match['url'])
        return results_dict

In [8]:
#Set Up WebDriver
driver = webdriver.Chrome(ChromeDriverManager().install())
wait = WebDriverWait(driver, 10)


Checking for mac64 chromedriver:83.0.4103.39 in cache
Driver found in /Users/nickpowers/.wdm/chromedriver/83.0.4103.39/mac64/chromedriver


In [9]:
def save_data_to_csv(year, month, results_dict):
    column_names = ['player_id', 'team_id', 'match_id', 'number', 'position', 'minutes_played', 'points', 'tries',
                'conversions','conversion_attempts', 'penalty_goals', 'conversion_percentage','field_goals',
                'fantasy_points', 'total_runs', 'total_run_metres', 'kick_return_metres', 'post_contact_metres',
                'line_breaks', 'line_break_assists', 'try_assists', 'line_engaged_runs', 'tackle_breaks', 'hit_ups',
                'play_the_ball', 'average_play_the_ball_seconds', 'dummy_half_runs', 'dummy_half_run_metres', 
                'steals', 'offloads', 'dummy_passes', 'passes', 'receipts', 'pass_to_run_ratio', 'tackle_percentage',
                'tackles_made', 'tackles_missed', 'ineffective_tackles', 'intercepts', 'kicks_defused', 'kicks',
                'kicking_metres', 'forced_drop_outs', 'bomb_kicks', 'grubbers', 'fourty_twenty',
                'cross_field_kicks', 'kicked_dead', 'errors', 'handling_errors', 'one_on_ones_lost', 'penalties',
                'on_report', 'sin_bins', 'send_offs', 'stint_one', 'stint_two']
    csv_data = pd.DataFrame(columns = column_names)
    csv_filename = str(year) + '_' + str(month) + '_data.csv'
    try:
        for match in results_dict.keys():
            csv_data = csv_data.append(pd.DataFrame.from_dict(results_dict[match], orient='index'))
        csv_data = csv_data.replace('-', 0).replace({pd.np.nan: 0})
        csv_data.to_csv('./csv_files/' + csv_filename)
        print("success: " + csv_filename)
    except:
        print("error: " + csv_filename)

In [33]:
#test_df = scraping_dict[2019][:1]
#print(test_df)
test_df_errors = scraping_dict[2019].tail(1)
test_df_errors['url'] = 'https://www.nrl.com/error'
print(test_df_errors)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


      id       date                        url  home_team_id  away_team_id
392  461 2019-10-06  https://www.nrl.com/error            15             2


In [38]:
error_total = []

for year in [2019]:
#for year in player_match_stats.keys():
    year_df = scraping_dict[year]
    for month in ['03']:
    #for month in list(year_df['date'].dt.strftime("%m").unique()):
        monthly_results = {}
        monthly_results['errors'] = []
        month_mask = year_df['date'].map(lambda x: x.month) == int(month)
        month_df = year_df[month_mask]
        
        for match in test_df_errors.iterrows():
        #for match in test_df.iterrows():
        #for match in month_df:
            scrape_match(match, monthly_results, driver)
        
        ##UPDATE ALL OF THIS BELOW
        #player_match_stats[year] = year_dict
        print(monthly_results['errors'])
        if monthly_results['errors']:
            if error_total:
                error_total = error_total.extend(monthly_results['errors'])
            else:
                error_total = monthly_results['errors']
            monthly_results = monthly_results.pop('errors')
        save_data_to_csv(year, month, monthly_results)
        print(error_total)

(392, id                                    461
date                  2019-10-06 00:00:00
url             https://www.nrl.com/error
home_team_id                           15
away_team_id                            2
Name: 392, dtype: object)
id                                    461
date                  2019-10-06 00:00:00
url             https://www.nrl.com/error
home_team_id                           15
away_team_id                            2
Name: 392, dtype: object
https://www.nrl.com/error
<selenium.webdriver.chrome.webdriver.WebDriver (session="6ab62a93c26e526e0d84285a2e6a7050")>
error: https://www.nrl.com/error
['https://www.nrl.com/error']
['https://www.nrl.com/error']
[]
error: 2019_03_data.csv


In [39]:
error_total

['https://www.nrl.com/error']

## Manually update error URLs

In [9]:
#errors = player_match_stats[2018]['errors']
##player_match_stats[2018].pop('errors')

In [10]:
#Get match ids of errored URLs for manual updating
find_incorrect_url_matches = 'SELECT id, url FROM Matches WHERE url IN {};'.format(tuple(error_total))
mycursor.execute(find_incorrect_url_matches,)
results = mycursor.fetchall()
print(results)

[(465, 'https://www.nrl.com/draw/nrl-premiership/2018/finals-week-1/broncos-vs-dragons/'), (466, 'https://www.nrl.com/draw/nrl-premiership/2018/finals-week-2/sharks-vs-panthers/'), (467, 'https://www.nrl.com/draw/nrl-premiership/2018/finals-week-2/rabbitohs-vs-dragons/'), (468, 'https://www.nrl.com/draw/nrl-premiership/2018/finals-week-3/storm-vs-sharks/'), (469, 'https://www.nrl.com/draw/nrl-premiership/2018/finals-week-3/roosters-vs-rabbitohs/')]


In [11]:
#Manually update URLs
mycursor.executemany("UPDATE Matches SET url = %s WHERE id = %s",
                    [("https://www.nrl.com/draw/nrl-premiership/2018/finals-week-2/game-1/", "466"),
                    ("https://www.nrl.com/draw/nrl-premiership/2018/finals-week-2/game-12/", "467"),
                    ("https://www.nrl.com/draw/nrl-premiership/2018/finals-week-3/game-1/", "468"),
                    ("https://www.nrl.com/draw/nrl-premiership/2018/finals-week-3/game-12/", "469")])
mydb.commit()

In [None]:
errored_match_query = pd.read_sql_query('SELECT id, date, url, home_team_id, away_team_id FROM Matches WHERE id IN (466, 467, 468, 469);', mydb)
errored_match_df = pd.DataFrame(errored_match_query, columns=['id', 'date', 'url', 'home_team_id', 'away_team_id'])

errored_match_dict = {}
errored_match_dict['errors'] = []
for match in errored_match_df.iterrows():
    scrape_match(match, errored_match_dict, driver)
extra_errors = errored_match_dict['errors']
successful_matches = errored_match_dict.pop('errors')

csv_errored_matches = pd.DataFrame(columns=column_names)
for match in successful_matches.keys():
    csv_errored_matches = csv_errored_matches.append(pd.DataFrame.from_dict(successful_matches[match], orient='index'))
csv_errored_matches = csv_errored_matches.replace('-', 0).replace({pd.np.nan: 0})
csv_errored_matches.to_csv('./csv_files/errored_data.csv')

In [16]:
player_match_stats[2018].keys()
player_match_stats[2018] = player_match_stats[2018].pop('errors')

In [31]:
player_match_stats[2018] = year_dict
player_match_stats[2018]
stats_dict = pd.DataFrame()
for match in player_match_stats[2018].keys():
    match_df = pd.DataFrame.from_dict(player_match_stats[2018][match], orient='index')
    stats_dict = stats_dict.append(match_df)
stats_dict

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
Billy_Slater_7,568,7,462,1,Fullback,80:00,-,-,-,-,...,-,1,1,-,-,-,-,-,80:00,-
Suliasi_Vunivalu_7,185,7,462,2,Winger,80:00,8,2,-,-,...,-,2,1,-,-,-,-,-,80:00,-
Cheyse_Blair_7,623,7,462,3,Centre,80:00,8,2,-,-,...,-,-,-,-,-,-,-,-,80:00,-
Curtis_Scott_7,177,7,462,4,Centre,80:00,4,1,-,-,...,-,-,-,-,1,-,-,-,80:00,-
Josh_Addo-carr_7,179,7,462,5,Winger,80:00,-,-,-,-,...,-,3,3,-,1,-,-,-,80:00,-
Cameron_Munster_7,175,7,462,6,Five-Eighth,80:00,1,-,-,-,...,1,-,-,-,-,-,-,-,80:00,-
Jesse_Bromwich_7,164,7,462,8,Prop,55:00,-,-,-,-,...,-,1,1,-,-,-,-,-,25:49,28:39
Cameron_Smith_7,159,7,462,9,Hooker,80:00,8,-,3,5,...,-,-,-,-,2,-,-,-,80:00,-
Tim_Glasby_7,541,7,462,10,Prop,33:00,-,-,-,-,...,-,-,-,-,-,-,-,-,26:46,06:27
Felise_Kaufusi_7,162,7,462,11,2nd Row,80:00,-,-,-,-,...,-,-,-,-,-,-,-,-,80:00,-


## Run scraping code for updated match URLs

In [32]:
errored_match_query = pd.read_sql_query('SELECT id, date, url, home_team_id, away_team_id FROM Matches WHERE id IN (466, 467, 468, 469);', mydb)
errored_match_df = pd.DataFrame(errored_match_query, columns=['id', 'date', 'url', 'home_team_id', 'away_team_id'])

In [33]:
errored_match_df

Unnamed: 0,id,date,url,home_team_id,away_team_id
0,466,2018-09-14,https://www.nrl.com/draw/nrl-premiership/2018/...,4,12
1,467,2018-09-15,https://www.nrl.com/draw/nrl-premiership/2018/...,13,14
2,468,2018-09-21,https://www.nrl.com/draw/nrl-premiership/2018/...,7,4
3,469,2018-09-22,https://www.nrl.com/draw/nrl-premiership/2018/...,15,13


In [44]:
errored_match_query = pd.read_sql_query('SELECT id, date, url, home_team_id, away_team_id FROM Matches WHERE id = 465;', mydb)
errored_match_df = pd.DataFrame(errored_match_query, columns=['id', 'date', 'url', 'home_team_id', 'away_team_id'])
for match in errored_match_df.iterrows():
        match = match[1]
        match_key = match['url'].split(str(year) + '/')[1][:-1]
        for char in ['-vs-', '-v-', '/', '-']:
            match_key = match_key.replace(char, '_')
        year_dict[match_key] = {}
        
        try:
            print(match['url'])
            driver.get(match['url'])
            #home_xpath_div = '1', away_xpath_div = '2'
            for xpath in ['1', '2']:
                wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody')))
                for i in range(1, 18):
                    name_field = driver.find_element_by_xpath('//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody/tr['+ str(i) +']/td[2]/a').get_attribute('innerText').strip()
                    first_name = name_field.split(' ')[0].strip().capitalize()
                    last_name = name_field.split(' ')[-1].strip().capitalize()
                    middle_name = name_field.split(' ')[-2].strip()
                    if middle_name.isalpha():
                        last_name = middle_name.capitalize() + ' ' + last_name
                    if xpath == '1':
                        team_id = match['home_team_id']
                    elif xpath == '2':
                        team_id = match['away_team_id']
                    full_name = first_name + '_' + last_name + '_' + str(team_id)
                    player_id = find_or_create_player(first_name, last_name, str(team_id))
                    
                    player_stat_list = []
                    player_stat_list.append(player_id)
                    player_stat_list.append(team_id)
                    player_stat_list.append(match['id'])
                    
                    for column in range(3, 67):
                        if column in [5, 7, 15, 17, 21, 34, 40, 47, 56, 64]:
                            continue
                        else:
                            stat_field = driver.find_element_by_xpath('//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody/tr[' + str(i) + ']/td[' + str(column) + ']')
                            player_stat_list.append(stat_field.get_attribute('innerText').strip())
                    print(player_stat_list)
                    year_dict[match_key][full_name] = player_stat_list
        except:
            print('error: ' + match['url'])
            year_dict['errors'].append(match['url'])
        print(year_dict[match_key])

https://www.nrl.com/draw/nrl-premiership/2018/finals-week-1/broncos-vs-dragons/
[22, 1, 465, '1', 'Fullback', '80:00', '4', '1', '-', '-', '-', '-', '-', '41', '12', '107', '40', '16', '1', '-', '-', '-', '6', '-', '4', '3.16s', '2', '23', '-', '-', '1', '20', '27', '1.67', '62.5%', '5', '2', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-']
[20, 1, 465, '2', 'Winger', '80:00', '-', '-', '-', '-', '-', '-', '-', '15', '10', '94', '43', '47', '-', '-', '-', '-', '2', '-', '10', '2.84s', '-', '-', '-', '-', '-', '-', '12', '-', '80%', '4', '-', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '2', '2', '-', '-', '-', '-', '-', '80:00', '-']
[27, 1, 465, '3', 'Centre', '80:00', '-', '-', '-', '-', '-', '-', '-', '46', '8', '103', '-', '15', '-', '-', '-', '-', '8', '-', '7', '3.18s', '-', '-', '-', '-', '-', '1', '8', '0.13', '88.9%', '16', '2', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', 

{'Darius_Boyd_1': [22, 1, 465, '1', 'Fullback', '80:00', '4', '1', '-', '-', '-', '-', '-', '41', '12', '107', '40', '16', '1', '-', '-', '-', '6', '-', '4', '3.16s', '2', '23', '-', '-', '1', '20', '27', '1.67', '62.5%', '5', '2', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-'], 'Corey_Oates_1': [20, 1, 465, '2', 'Winger', '80:00', '-', '-', '-', '-', '-', '-', '-', '15', '10', '94', '43', '47', '-', '-', '-', '-', '2', '-', '10', '2.84s', '-', '-', '-', '-', '-', '-', '12', '-', '80%', '4', '-', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '2', '2', '-', '-', '-', '-', '-', '80:00', '-'], 'James_Roberts_1': [27, 1, 465, '3', 'Centre', '80:00', '-', '-', '-', '-', '-', '-', '-', '46', '8', '103', '-', '15', '-', '-', '-', '-', '8', '-', '7', '3.18s', '-', '-', '-', '-', '-', '1', '8', '0.13', '88.9%', '16', '2', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-'], 'Jor

In [45]:
player_match_stats[2018] = year_dict
player_match_stats[2018]
stats_dict = pd.DataFrame()
for match in player_match_stats[2018].keys():
    match_df = pd.DataFrame.from_dict(player_match_stats[2018][match], orient='index')
    stats_dict = stats_dict.append(match_df)
stats_dict

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
Billy_Slater_7,568,7,462,1,Fullback,80:00,-,-,-,-,...,-,1,1,-,-,-,-,-,80:00,-
Suliasi_Vunivalu_7,185,7,462,2,Winger,80:00,8,2,-,-,...,-,2,1,-,-,-,-,-,80:00,-
Cheyse_Blair_7,623,7,462,3,Centre,80:00,8,2,-,-,...,-,-,-,-,-,-,-,-,80:00,-
Curtis_Scott_7,177,7,462,4,Centre,80:00,4,1,-,-,...,-,-,-,-,1,-,-,-,80:00,-
Josh_Addo-carr_7,179,7,462,5,Winger,80:00,-,-,-,-,...,-,3,3,-,1,-,-,-,80:00,-
Cameron_Munster_7,175,7,462,6,Five-Eighth,80:00,1,-,-,-,...,1,-,-,-,-,-,-,-,80:00,-
Jesse_Bromwich_7,164,7,462,8,Prop,55:00,-,-,-,-,...,-,1,1,-,-,-,-,-,25:49,28:39
Cameron_Smith_7,159,7,462,9,Hooker,80:00,8,-,3,5,...,-,-,-,-,2,-,-,-,80:00,-
Tim_Glasby_7,541,7,462,10,Prop,33:00,-,-,-,-,...,-,-,-,-,-,-,-,-,26:46,06:27
Felise_Kaufusi_7,162,7,462,11,2nd Row,80:00,-,-,-,-,...,-,-,-,-,-,-,-,-,80:00,-


In [47]:
stats_dict[2].value_counts()

470    34
469    34
468    34
467    34
466    34
465    34
464    34
463    34
462    34
Name: 2, dtype: int64

In [49]:
column_names = ['player_id', 'team_id', 'match_id', 'number', 'position', 'minutes_played', 'points', 'tries',
                'conversions','conversion_attempts', 'penalty_goals', 'conversion_percentage','field_goals',
                'fantasy_points', 'total_runs', 'total_run_metres', 'kick_return_metres', 'post_contact_metres',
                'line_breaks', 'line_break_assists', 'try_assists', 'line_engaged_runs', 'tackle_breaks', 'hit_ups',
                'play_the_ball', 'average_play_the_ball_seconds', 'dummy_half_runs', 'dummy_half_run_metres', 
                'steals', 'offloads', 'dummy_passes', 'passes', 'receipts', 'pass_to_run_ratio', 'tackle_percentage',
                'tackles_made', 'tackles_missed', 'ineffective_tackles', 'intercepts', 'kicks_defused', 'kicks',
                'kicking_metres', 'forced_drop_outs', 'bomb_kicks', 'grubbers', 'fourty_twenty',
                'cross_field_kicks', 'kicked_dead', 'errors', 'handling_errors', 'one_on_ones_lost', 'penalties',
                'on_report', 'sin_bins', 'send_offs', 'stint_one', 'stint_two']

player_match_stats[2018] = year_dict
player_match_stats[2018]
year_df = pd.DataFrame()
for match in player_match_stats[2018].keys():
    match_df = pd.DataFrame.from_dict(player_match_stats[2018][match], orient='index', columns=column_names).reset_index()
    year_df = year_df.append(match_df, ignore_index = True)
year_df = year_df.replace('-', 0).replace({pd.np.nan: 0})
#year_df
year_df.to_csv('./csv_files/' + str(year) + '_data.csv')



In [50]:
mycursor.close()

True