In [25]:
#Scraping Imports
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

In [26]:
#SQL Imports
import mysql.connector
#Pandas imports
import pandas as pd

In [27]:
column_names = ['player_id', 'team_id', 'match_id', 'number', 'position', 'minutes_played', 'points', 'tries',
                'conversions','conversion_attempts', 'penalty_goals', 'conversion_percentage','field_goals',
                'fantasy_points', 'total_runs', 'total_run_metres', 'kick_return_metres', 'post_contact_metres',
                'line_breaks', 'line_break_assists', 'try_assists', 'line_engaged_runs', 'tackle_breaks', 'hit_ups',
                'play_the_ball', 'average_play_the_ball_seconds', 'dummy_half_runs', 'dummy_half_run_metres', 
                'steals', 'offloads', 'dummy_passes', 'passes', 'receipts', 'pass_to_run_ratio', 'tackle_percentage',
                'tackles_made', 'tackles_missed', 'ineffective_tackles', 'intercepts', 'kicks_defused', 'kicks',
                'kicking_metres', 'forced_drop_outs', 'bomb_kicks', 'grubbers', 'fourty_twenty',
                'cross_field_kicks', 'kicked_dead', 'errors', 'handling_errors', 'one_on_ones_lost', 'penalties',
                'on_report', 'sin_bins', 'send_offs', 'stint_one', 'stint_two']

In [28]:
#DB Connection
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  passwd="NYg1@nts",
  database="NRL_data"
)
mycursor = mydb.cursor(buffered=True)

In [29]:
#Find team and player ids
def find_team_id(name):
    find_team_query = 'SELECT id FROM Teams WHERE nickname = %s;'
    mycursor.execute(find_team_query, (name,))
    return mycursor.fetchone()[0]

def find_position_id(name):
    find_position_query = 'SELECT id FROM Positions WHERE position_name = %s;'
    mycursor.execute(find_position_query, (name,))
    return mycursor.fetchone()[0]

In [30]:
def find_or_create_player(first_name, last_name, team_id):
    find_player_query = 'SELECT id FROM Players WHERE first_name = %s AND last_name LIKE %s AND current_team = %s LIMIT 1;'
    mycursor.execute(find_player_query, (first_name, '%' + last_name + '%', team_id))
    result = mycursor.fetchone()
    if result is None:
        insert_player_query = 'INSERT INTO Players (first_name, last_name, current_team) VALUES (%s, %s, %s);'
        data = (first_name, last_name, team_id)
        print(data)
        #mycursor.execute(insert_player_query, data)
        #mydb.commit()
        #result = find_or_create_player(first_name, last_name, team_id)
        #return int(result)
    else:
        result = result[0]
        print(result)
        #return int(result)

In [31]:
#1. Get URLs that need to be scraped
###errors = {'url': []}
#Find matches that were already scraped
already_scraped = 'SELECT DISTINCT match_id FROM PlayerMatchStats;'
mycursor.execute(already_scraped,)
results = mycursor.fetchall()
already_scraped_list = list(map(lambda x: x[0], results))

#Find all matches
all_matches_query = pd.read_sql_query('SELECT id, date, url, home_team_id, away_team_id FROM Matches;', mydb)
all_match_df = pd.DataFrame(all_matches_query, columns=['id', 'date', 'url', 'home_team_id', 'away_team_id'])

#Remove matches which were already scraped
not_yet_scraped = set(list(all_match_df['id'])) - set(already_scraped_list)
not_yet_scraped_df = all_match_df[all_match_df['id'].isin(not_yet_scraped)]
#print(not_yet_scraped_df)

In [32]:
#2. Set Up WebDriver
driver = webdriver.Chrome(ChromeDriverManager().install())
wait = WebDriverWait(driver, 10)


Checking for mac64 chromedriver:80.0.3987.106 in cache
Driver found in /Users/nickpowers/.wdm/chromedriver/80.0.3987.106/mac64/chromedriver


In [33]:
not_yet_scraped_df['date'] = pd.to_datetime(not_yet_scraped_df['date'])

scraping_dict = {}
for year in list(not_yet_scraped_df['date'].dt.year.unique()):
    scraping_dict[year] = not_yet_scraped_df[not_yet_scraped_df['date'].dt.year == year]
scraping_dict[2018]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,id,date,url,home_team_id,away_team_id
393,462,2018-09-07,https://www.nrl.com/draw/nrl-premiership/2018/...,7,13
394,463,2018-09-08,https://www.nrl.com/draw/nrl-premiership/2018/...,12,9
395,464,2018-09-08,https://www.nrl.com/draw/nrl-premiership/2018/...,15,4
396,465,2018-09-09,https://www.nrl.com/draw/nrl-premiership/2018/...,1,14
397,466,2018-09-14,https://www.nrl.com/draw/nrl-premiership/2018/...,4,12
398,467,2018-09-15,https://www.nrl.com/draw/nrl-premiership/2018/...,13,14
399,468,2018-09-21,https://www.nrl.com/draw/nrl-premiership/2018/...,7,4
400,469,2018-09-22,https://www.nrl.com/draw/nrl-premiership/2018/...,15,13
401,470,2018-09-30,https://www.nrl.com/draw/nrl-premiership/2018/...,15,7


In [52]:
player_match_stats = dict.fromkeys(scraping_dict.keys(), {})
print(player_match_stats)
for year in player_match_stats.keys():
    print(year)

{2019: {}, 2018: {}, 2017: {}, 2016: {}, 2015: {}, 2014: {}, 2013: {}}
2019
2018
2017
2016
2015
2014
2013


In [55]:
for year in [2018, 2019]:
    #print(player_match_stats[year])
    year_dict = {}
    for match in scraping_dict[year].iterrows():
        match = match[1]
        #print(match)
        #print(year)
        match_key = match['url'].split(str(year) + '/')[1][:-1]
        for char in ['-vs-', '-v-', '/', '-']:
            match_key = match_key.replace(char, '_')
        #print(match_key)
        #print('Add ' + match_key + ' to ' + str(year))
        year_dict[match_key] = {}
    print(year_dict)
    player_match_stats[year] = year_dict
player_match_stats[2019]

{'finals_week_1_storm_rabbitohs': {}, 'finals_week_1_panthers_warriors': {}, 'finals_week_1_roosters_sharks': {}, 'finals_week_1_broncos_dragons': {}, 'finals_week_2_sharks_panthers': {}, 'finals_week_2_rabbitohs_dragons': {}, 'finals_week_3_storm_sharks': {}, 'finals_week_3_roosters_rabbitohs': {}, 'grand_final_roosters_storm': {}}
{'round_9_titans_sharks': {}, 'round_9_wests_tigers_panthers': {}, 'round_9_sea_eagles_broncos': {}, 'round_9_bulldogs_knights': {}, 'round_9_warriors_dragons': {}, 'round_9_storm_eels': {}, 'round_9_roosters_raiders': {}, 'round_9_rabbitohs_cowboys': {}, 'round_10_storm_wests_tigers': {}, 'round_10_panthers_warriors': {}, 'round_10_broncos_roosters': {}, 'round_10_titans_bulldogs': {}, 'round_10_cowboys_eels': {}, 'round_10_raiders_rabbitohs': {}, 'round_10_dragons_knights': {}, 'round_10_sharks_sea_eagles': {}, 'round_11_eels_panthers': {}, 'round_11_sea_eagles_titans': {}, 'round_11_knights_roosters': {}, 'round_11_raiders_cowboys': {}, 'round_11_warrior

{'round_9_titans_sharks': {},
 'round_9_wests_tigers_panthers': {},
 'round_9_sea_eagles_broncos': {},
 'round_9_bulldogs_knights': {},
 'round_9_warriors_dragons': {},
 'round_9_storm_eels': {},
 'round_9_roosters_raiders': {},
 'round_9_rabbitohs_cowboys': {},
 'round_10_storm_wests_tigers': {},
 'round_10_panthers_warriors': {},
 'round_10_broncos_roosters': {},
 'round_10_titans_bulldogs': {},
 'round_10_cowboys_eels': {},
 'round_10_raiders_rabbitohs': {},
 'round_10_dragons_knights': {},
 'round_10_sharks_sea_eagles': {},
 'round_11_eels_panthers': {},
 'round_11_sea_eagles_titans': {},
 'round_11_knights_roosters': {},
 'round_11_raiders_cowboys': {},
 'round_11_warriors_broncos': {},
 'round_11_rabbitohs_wests_tigers': {},
 'round_11_bulldogs_storm': {},
 'round_11_dragons_sharks': {},
 'round_12_panthers_sea_eagles': {},
 'round_12_eels_rabbitohs': {},
 'round_12_bulldogs_raiders': {},
 'round_12_titans_cowboys': {},
 'round_13_rabbitohs_knights': {},
 'round_13_wests_tigers_r

In [None]:
for year in player_match_stats.keys():
    for match in scraping_dict[year].iterrows():
        match = match[1]
        player_match_stats[year][match['url']] = {}
    
#for match in scraping_dict[year].iterrows():
for match in scraping_dict[2017].iterrows():
    player_match_stats = {}
    
    
    match = match[1]
    try:
        print(match['url'])
        driver.get(match['url'])
        
        #home_xpath_div = '1', away_xpath_div = '2'
        for xpath in ['1', '2']:
            wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody')))
            for i in range(1, 18):
                name_field = driver.find_element_by_xpath('//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody/tr['+ str(i) +']/td[2]/a').get_attribute('innerText').strip()
                first_name = name_field.split(' ')[0].strip().capitalize()
                last_name = name_field.split(' ')[-1].strip().capitalize()
                middle_name = name_field.split(' ')[-2].strip()
                print(middle_name)
                if middle_name.isalpha():
                    last_name = middle_name.capitalize() + ' ' + last_name
                if xpath == '1':
                    team_id = match['home_team_id']
                elif xpath == '2':
                    team_id = match['away_team_id']
                full_name = first_name + '_' + last_name + '_' + str(team_id)
                player_id = find_or_create_player(first_name, last_name, str(team+id))
                print(player_id)
                
                player_match_stats[full_name] = []
                    player_match_stats[full_name].append(player_id)
                    player_match_stats[full_name].append(team_id)
                    player_match_stats[full_name].append(match_id)

                    #column = 3
                    while column in range(3,67):
                        if column in [5, 7, 15, 17, 21, 34, 40, 47, 56, 64]:
                            #column += 1
                            continue
                        else:
                            stat_field = driver.find_element_by_xpath('//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody/tr[' + str(i) + ']/td[' + str(column) + ']')
                            player_match_stats[full_name].append(stat_field.get_attribute('innerText').strip())
                            #column += 1
                    print(player_match_stats[full_name])
    except:
        print('error: ' + match['url'])
        
#scraping_dict[2017]


Checking for mac64 chromedriver:80.0.3987.106 in cache
There is no cached driver. Downloading new one...
Trying to download new driver from http://chromedriver.storage.googleapis.com/80.0.3987.106/chromedriver_mac64.zip
Unpack archive /Users/nickpowers/.wdm/chromedriver/80.0.3987.106/mac64/chromedriver.zip
