In [1]:
#Scraping Imports
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
#SQL Imports
import mysql.connector
#Pandas imports
import pandas as pd

In [3]:
column_names = ['player_id', 'team_id', 'match_id', 'number', 'position', 'minutes_played', 'points', 'tries',
                'conversions','conversion_attempts', 'penalty_goals', 'conversion_percentage','field_goals',
                'fantasy_points', 'total_runs', 'total_run_metres', 'kick_return_metres', 'post_contact_metres',
                'line_breaks', 'line_break_assists', 'try_assists', 'line_engaged_runs', 'tackle_breaks', 'hit_ups',
                'play_the_ball', 'average_play_the_ball_seconds', 'dummy_half_runs', 'dummy_half_run_metres', 
                'steals', 'offloads', 'dummy_passes', 'passes', 'receipts', 'pass_to_run_ratio', 'tackle_percentage',
                'tackles_made', 'tackles_missed', 'ineffective_tackles', 'intercepts', 'kicks_defused', 'kicks',
                'kicking_metres', 'forced_drop_outs', 'bomb_kicks', 'grubbers', 'fourty_twenty',
                'cross_field_kicks', 'kicked_dead', 'errors', 'handling_errors', 'one_on_ones_lost', 'penalties',
                'on_report', 'sin_bins', 'send_offs', 'stint_one', 'stint_two']

In [4]:
#DB Connection
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  passwd="NYg1@nts",
  database="NRL_data"
)
mycursor = mydb.cursor(buffered=True)

In [5]:
#Find team and player ids
def find_team_id(name):
    find_team_query = 'SELECT id FROM Teams WHERE nickname = %s;'
    mycursor.execute(find_team_query, (name,))
    return mycursor.fetchone()[0]

def find_position_id(name):
    find_position_query = 'SELECT id FROM Positions WHERE position_name = %s;'
    mycursor.execute(find_position_query, (name,))
    return mycursor.fetchone()[0]

In [6]:
def find_or_create_player(first_name, last_name, team_id):
    find_player_query = 'SELECT id FROM Players WHERE first_name = %s AND last_name LIKE %s AND current_team = %s LIMIT 1;'
    mycursor.execute(find_player_query, (first_name, '%' + last_name + '%', team_id))
    result = mycursor.fetchone()
    if result is None:
        insert_player_query = 'INSERT INTO Players (first_name, last_name, current_team) VALUES (%s, %s, %s);'
        data = (first_name, last_name, team_id)
        print(data)
        #mycursor.execute(insert_player_query, data)
        #mydb.commit()
        #result = find_or_create_player(first_name, last_name, team_id)
        #return int(result)
    else:
        result = result[0]
        print(result)
        #return int(result)

In [7]:
#1. Get URLs that need to be scraped
###errors = {'url': []}
#Find matches that were already scraped
already_scraped = 'SELECT DISTINCT match_id FROM PlayerMatchStats;'
mycursor.execute(already_scraped,)
results = mycursor.fetchall()
already_scraped_list = list(map(lambda x: x[0], results))

#Find all matches
all_matches_query = pd.read_sql_query('SELECT id, date, url, home_team_id, away_team_id FROM Matches;', mydb)
all_match_df = pd.DataFrame(all_matches_query, columns=['id', 'date', 'url', 'home_team_id', 'away_team_id'])

#Remove matches which were already scraped
not_yet_scraped = set(list(all_match_df['id'])) - set(already_scraped_list)
not_yet_scraped_df = all_match_df[all_match_df['id'].isin(not_yet_scraped)]
#print(not_yet_scraped_df)

In [22]:
#2. Set Up WebDriver
driver = webdriver.Chrome(ChromeDriverManager().install())
wait = WebDriverWait(driver, 10)


Checking for mac64 chromedriver:80.0.3987.106 in cache
Driver found in /Users/nickpowers/.wdm/chromedriver/80.0.3987.106/mac64/chromedriver


In [23]:
not_yet_scraped_df['date'] = pd.to_datetime(not_yet_scraped_df['date'])

scraping_dict = {}
for year in list(not_yet_scraped_df['date'].dt.year.unique()):
    scraping_dict[year] = not_yet_scraped_df[not_yet_scraped_df['date'].dt.year == year]
scraping_dict[2017]
    
#for match in scraping_dict[year].iterrows():
for match in scraping_dict[2017].iterrows():
    player_match_stats = {}
    
    
    match = match[1]
    try:
        print(match['url'])
        driver.get(match['url'])
        
        #home_xpath_div = '1', away_xpath_div = '2'
        for xpath in ['1', '2']:
            wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody')))
            for i in range(1, 18):
                name_field = driver.find_element_by_xpath('//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody/tr['+ str(i) +']/td[2]/a').get_attribute('innerText').strip()
                first_name = name_field.split(' ')[0].strip().capitalize()
                last_name = name_field.split(' ')[-1].strip().capitalize()
                middle_name = name_field.split(' ')[-2].strip()
                print(middle_name)
                if middle_name.isalpha():
                    last_name = middle_name.capitalize() + ' ' + last_name
                if xpath == '1':
                    team_id = match['home_team_id']
                elif xpath == '2':
                    team_id = match['away_team_id']
                full_name = first_name + '_' + last_name + '_' + str(team_id)
                player_id = find_or_create_player(first_name, last_name, str(team+id))
                print(player_id)
                
                player_match_stats[full_name] = []
                    player_match_stats[full_name].append(player_id)
                    player_match_stats[full_name].append(team_id)
                    player_match_stats[full_name].append(match_id)

                    #column = 3
                    while column in range(3,67):
                        if column in [5, 7, 15, 17, 21, 34, 40, 47, 56, 64]:
                            #column += 1
                            continue
                        else:
                            stat_field = driver.find_element_by_xpath('//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody/tr[' + str(i) + ']/td[' + str(column) + ']')
                            player_match_stats[full_name].append(stat_field.get_attribute('innerText').strip())
                            #column += 1
                    print(player_match_stats[full_name])
    except:
        print('error: ' + match['url'])
        
#scraping_dict[2017]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


https://www.nrl.com/draw/nrl-premiership/2017/round-1/sharks-vs-broncos/

4
Gerard_Beale_4
('Gerard', 'Beale', '4')
None

4
Sosaia_Feki_4
97
None

4
Jack_Bird_4
('Jack', 'Bird', '4')
None

4
Ricky_Leutele_4
471
None

4
Edrick_Lee_4
('Edrick', 'Lee', '4')
None

4
James_Maloney_4
('James', 'Maloney', '4')
None

4
Chad_Townsend_4
89
None

4
Andrew_Fifita_4
103
None

4
Jayden_Brailey_4
473
None

4
Matt_Prior_4
474
None

4
Luke_Lewis_4
582
None

4
Wade_Graham_4
476
None

4
Paul_Gallen_4
477
None

4
Jayson_Bukuya_4
475
None

4
Chris_Heighington_4
('Chris', 'Heighington', '4')
None

4
Sam_Tagataese_4
('Sam', 'Tagataese', '4')
None

4
Fa'amanu_Brown_4
("Fa'amanu", 'Brown', '4')
None

1
Darius_Boyd_1
22
None

1
Corey_Oates_1
20
None

1
James_Roberts_1
27
None

1
Tautau_Moga_1
('Tautau', 'Moga', '1')
None

1
Jordan_Kahu_1
('Jordan', 'Kahu', '1')
None

1
Anthony_Milford_1
19
None

1
Ben_Hunt_1
('Ben', 'Hunt', '1')
None

1
Korbin_Sims_1
('Korbin', 'Sims', '1')
None

1
Andrew_Mccullough_1
3
None

1

6
Tom_Trbojevic_6
151
None

6
Jorge_Taufua_6
145
None

6
Dylan_Walker_6
143
None

6
Brian_Kelly_6
('Brian', 'Kelly', '6')
None

6
Akuila_Uate_6
508
None

6
Blake_Green_6
('Blake', 'Green', '6')
None

6
Daly_Cherry-evans_6
142
None

6
Brenton_Lawrence_6
('Brenton', 'Lawrence', '6')
None

6
Apisai_Koroisau_6
153
None

6
Martin_Taupau_6
510
None

6
Frank_Winterstein_6
('Frank', 'Winterstein', '6')
None

6
Curtis_Sironen_6
155
None

6
Jake_Trbojevic_6
512
None

6
Lewis_Brown_6
513
None

6
Nate_Myles_6
('Nate', 'Myles', '6')
None

6
Addin_Fonua-blake_6
509
None

6
Lloyd_Perrett_6
571
None

11
Bevan_French_11
279
None

11
Semi_Radradra_11
('Semi', 'Radradra', '11')
None

11
Michael_Jennings_11
285
None

11
Brad_Takairangi_11
293
None

11
Josh_Hoffman_11
284
None

11
Clinton_Gutherson_11
280
None

11
Corey_Norman_11
550
None

11
Suaia_Matagi_11
553
None

11
Kaysa_Pritchard_11
297
None

11
Tim_Mannah_11
306
None

11
Manu_Ma'u_11
298
None

11
Tepai_Moeroa_11
305
None

11
Beau_Scott_11
552
None


https://www.nrl.com/draw/nrl-premiership/2017/round-6/titans-vs-raiders/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-6/titans-vs-raiders/
https://www.nrl.com/draw/nrl-premiership/2017/round-6/cowboys-vs-wests-tigers/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-6/cowboys-vs-wests-tigers/
https://www.nrl.com/draw/nrl-premiership/2017/round-6/warriors-vs-eels/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-6/warriors-vs-eels/
https://www.nrl.com/draw/nrl-premiership/2017/round-6/storm-vs-sharks/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-6/storm-vs-sharks/
https://www.nrl.com/draw/nrl-premiership/2017/round-7/bulldogs-vs-rabbitohs/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-7/bulldogs-vs-rabbitohs/
https://www.nrl.com/draw/nrl-premiership/2017/round-7/knights-vs-roosters/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-7/knights-vs-roosters/
https://www.nrl.com/draw/nrl-premiership/2017/round-7/broncos-vs

https://www.nrl.com/draw/nrl-premiership/2017/round-13/cowboys-vs-titans/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-13/cowboys-vs-titans/
https://www.nrl.com/draw/nrl-premiership/2017/round-13/sea-eagles-vs-raiders/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-13/sea-eagles-vs-raiders/
https://www.nrl.com/draw/nrl-premiership/2017/round-13/bulldogs-vs-panthers/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-13/bulldogs-vs-panthers/
https://www.nrl.com/draw/nrl-premiership/2017/round-14/sharks-vs-storm/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-14/sharks-vs-storm/
https://www.nrl.com/draw/nrl-premiership/2017/round-14/sea-eagles-vs-knights/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-14/sea-eagles-vs-knights/
https://www.nrl.com/draw/nrl-premiership/2017/round-14/broncos-vs-rabbitohs/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-14/broncos-vs-rabbitohs/
https://www.nrl.com/draw/nrl-premiership/2017/

error: https://www.nrl.com/draw/nrl-premiership/2017/round-21/knights-vs-dragons/
https://www.nrl.com/draw/nrl-premiership/2017/round-21/rabbitohs-vs-raiders/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-21/rabbitohs-vs-raiders/
https://www.nrl.com/draw/nrl-premiership/2017/round-21/roosters-vs-cowboys/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-21/roosters-vs-cowboys/
https://www.nrl.com/draw/nrl-premiership/2017/round-21/storm-vs-sea-eagles/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-21/storm-vs-sea-eagles/
https://www.nrl.com/draw/nrl-premiership/2017/round-21/titans-vs-wests-tigers/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-21/titans-vs-wests-tigers/
https://www.nrl.com/draw/nrl-premiership/2017/round-22/bulldogs-vs-eels/
error: https://www.nrl.com/draw/nrl-premiership/2017/round-22/bulldogs-vs-eels/
https://www.nrl.com/draw/nrl-premiership/2017/round-22/cowboys-vs-storm/
error: https://www.nrl.com/draw/nrl-premiership/

error: https://www.nrl.com/draw/nrl-premiership/2017/finals-week-3/storm-vs-broncos/
https://www.nrl.com/draw/nrl-premiership/2017/finals-week-3/roosters-vs-cowboys/
error: https://www.nrl.com/draw/nrl-premiership/2017/finals-week-3/roosters-vs-cowboys/
https://www.nrl.com/draw/nrl-premiership/2017/grand-final/storm-vs-cowboys/
error: https://www.nrl.com/draw/nrl-premiership/2017/grand-final/storm-vs-cowboys/



Checking for mac64 chromedriver:80.0.3987.106 in cache
There is no cached driver. Downloading new one...
Trying to download new driver from http://chromedriver.storage.googleapis.com/80.0.3987.106/chromedriver_mac64.zip
Unpack archive /Users/nickpowers/.wdm/chromedriver/80.0.3987.106/mac64/chromedriver.zip
