In [83]:
#Scraping Imports
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

In [84]:
#SQL Imports
import mysql.connector
#Pandas imports
import pandas as pd

In [85]:
column_names = ['player_id', 'team_id', 'match_id', 'number', 'position', 'minutes_played', 'points', 'tries',
                'conversions','conversion_attempts', 'penalty_goals', 'conversion_percentage','field_goals',
                'fantasy_points', 'total_runs', 'total_run_metres', 'kick_return_metres', 'post_contact_metres',
                'line_breaks', 'line_break_assists', 'try_assists', 'line_engaged_runs', 'tackle_breaks', 'hit_ups',
                'play_the_ball', 'average_play_the_ball_seconds', 'dummy_half_runs', 'dummy_half_run_metres', 
                'steals', 'offloads', 'dummy_passes', 'passes', 'receipts', 'pass_to_run_ratio', 'tackle_percentage',
                'tackles_made', 'tackles_missed', 'ineffective_tackles', 'intercepts', 'kicks_defused', 'kicks',
                'kicking_metres', 'forced_drop_outs', 'bomb_kicks', 'grubbers', 'fourty_twenty',
                'cross_field_kicks', 'kicked_dead', 'errors', 'handling_errors', 'one_on_ones_lost', 'penalties',
                'on_report', 'sin_bins', 'send_offs', 'stint_one', 'stint_two']

In [86]:
#DB Connection
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  passwd="NYg1@nts",
  database="NRL_data"
)
mycursor = mydb.cursor(buffered=True)

In [87]:
#Find team and player ids
def find_team_id(name):
    find_team_query = 'SELECT id FROM Teams WHERE nickname = %s;'
    mycursor.execute(find_team_query, (name,))
    return mycursor.fetchone()[0]

def find_position_id(name):
    find_position_query = 'SELECT id FROM Positions WHERE position_name = %s;'
    mycursor.execute(find_position_query, (name,))
    return mycursor.fetchone()[0]

In [88]:
def find_or_create_player(first_name, last_name, team_id):
    find_player_query = 'SELECT id FROM Players WHERE first_name = %s AND last_name LIKE %s AND current_team = %s LIMIT 1;'
    mycursor.execute(find_player_query, (first_name, '%' + last_name + '%', team_id))
    result = mycursor.fetchone()
    if result is None:
        insert_player_query = 'INSERT INTO Players (first_name, last_name, current_team) VALUES (%s, %s, %s);'
        data = (first_name, last_name, team_id)
        print(data)
        #mycursor.execute(insert_player_query, data)
        #mydb.commit()
        #result = find_or_create_player(first_name, last_name, team_id)
        #return int(result)
    else:
        result = result[0]
        #print('id = ' + str(result))
        return result

In [89]:
#1. Get URLs that need to be scraped
###errors = {'url': []}
#Find matches that were already scraped
already_scraped = 'SELECT DISTINCT match_id FROM PlayerMatchStats;'
mycursor.execute(already_scraped,)
results = mycursor.fetchall()
already_scraped_list = list(map(lambda x: x[0], results))

#Find all matches
all_matches_query = pd.read_sql_query('SELECT id, date, url, home_team_id, away_team_id FROM Matches;', mydb)
all_match_df = pd.DataFrame(all_matches_query, columns=['id', 'date', 'url', 'home_team_id', 'away_team_id'])

#Remove matches which were already scraped
not_yet_scraped = set(list(all_match_df['id'])) - set(already_scraped_list)
not_yet_scraped_df = all_match_df[all_match_df['id'].isin(not_yet_scraped)]
#print(not_yet_scraped_df)

In [106]:
#2. Set Up WebDriver
driver = webdriver.Chrome(ChromeDriverManager().install())
wait = WebDriverWait(driver, 10)


Checking for mac64 chromedriver:80.0.3987.106 in cache
Driver found in /Users/nickpowers/.wdm/chromedriver/80.0.3987.106/mac64/chromedriver


In [107]:
not_yet_scraped_df['date'] = pd.to_datetime(not_yet_scraped_df['date'])

scraping_dict = {}
for year in list(not_yet_scraped_df['date'].dt.year.unique()):
    scraping_dict[year] = not_yet_scraped_df[not_yet_scraped_df['date'].dt.year == year]
scraping_dict[2018]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,id,date,url,home_team_id,away_team_id
393,462,2018-09-07,https://www.nrl.com/draw/nrl-premiership/2018/...,7,13
394,463,2018-09-08,https://www.nrl.com/draw/nrl-premiership/2018/...,12,9
395,464,2018-09-08,https://www.nrl.com/draw/nrl-premiership/2018/...,15,4
396,465,2018-09-09,https://www.nrl.com/draw/nrl-premiership/2018/...,1,14
397,466,2018-09-14,https://www.nrl.com/draw/nrl-premiership/2018/...,4,12
398,467,2018-09-15,https://www.nrl.com/draw/nrl-premiership/2018/...,13,14
399,468,2018-09-21,https://www.nrl.com/draw/nrl-premiership/2018/...,7,4
400,469,2018-09-22,https://www.nrl.com/draw/nrl-premiership/2018/...,15,13
401,470,2018-09-30,https://www.nrl.com/draw/nrl-premiership/2018/...,15,7


In [108]:
player_match_stats = dict.fromkeys(scraping_dict.keys(), {})

In [109]:
for year in [2018]:
#for year in player_match_stats.keys():
    year_dict = {}
    year_dict['errors'] = []
    for match in scraping_dict[year].iterrows():
        match = match[1]
        match_key = match['url'].split(str(year) + '/')[1][:-1]
        for char in ['-vs-', '-v-', '/', '-']:
            match_key = match_key.replace(char, '_')
        year_dict[match_key] = {}
        
        try:
            print(match['url'])
            driver.get(match['url'])
        
            #home_xpath_div = '1', away_xpath_div = '2'
            for xpath in ['1', '2']:
                wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody')))
                for i in range(1, 18):
                    name_field = driver.find_element_by_xpath('//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody/tr['+ str(i) +']/td[2]/a').get_attribute('innerText').strip()
                    first_name = name_field.split(' ')[0].strip().capitalize()
                    last_name = name_field.split(' ')[-1].strip().capitalize()
                    middle_name = name_field.split(' ')[-2].strip()
                    #print(middle_name)
                    if middle_name.isalpha():
                        last_name = middle_name.capitalize() + ' ' + last_name
                    if xpath == '1':
                        team_id = match['home_team_id']
                    elif xpath == '2':
                        team_id = match['away_team_id']
                    full_name = first_name + '_' + last_name + '_' + str(team_id)
                    player_id = find_or_create_player(first_name, last_name, str(team_id))
                    
                    player_stat_list = []
                    player_stat_list.append(player_id)
                    player_stat_list.append(team_id)
                    player_stat_list.append(match['id'])
                    
                    for column in range(3, 67):
                        if column in [5, 7, 15, 17, 21, 34, 40, 47, 56, 64]:
                            continue
                        else:
                            stat_field = driver.find_element_by_xpath('//*[@id="player-stats"]/div[' + xpath + ']/div/div[3]/div/table/tbody/tr[' + str(i) + ']/td[' + str(column) + ']')
                            player_stat_list.append(stat_field.get_attribute('innerText').strip())
                    print(player_stat_list)
                    year_dict[match_key][full_name] = player_stat_list
        except:
            print('error: ' + match['url'])
            year_dict['errors'].append(match['url'])
        print(year_dict[match_key])
    
    csv_data = pd.DataFrame.from_dict(year_dict, orient='index').replace('-', 0)
    #csv_data = pd.DataFrame.from_dict(year_dict, orient='index', columns=column_names).replace('-', 0)
    csv_data = csv_data.replace({pd.np.nan: 0})
#         #clean csv_data
#     for column in ['conversion_percentage', 'tackle_percentage']:
#         csv_data[column] = csv_data[column].str.replace('%', '').astype('float') / 100
#         csv_data[column] = csv_data[column].round(3)

#     for column in ['minutes_played', 'stint_one', 'stint_two']:
#         csv_data[column] = csv_data[column].str.replace(':', '.').astype('float')

#     csv_data['average_play_the_ball_seconds'] = csv_data['average_play_the_ball_seconds'].str.replace('s', '').astype('float')
#             #csv_data['minutes_played'] = csv_data['minutes_played'].str.replace(':00', '').astype(int)
#     csv_data['position_id'] = csv_data['position'].apply(find_position_id)

    print(csv_data)
    print(year_dict)
    player_match_stats[year] = year_dict

https://www.nrl.com/draw/nrl-premiership/2018/finals-week-1/storm-vs-rabbitohs/
[568, 7, 462, '1', 'Fullback', '80:00', '-', '-', '-', '-', '-', '-', '-', '28', '13', '105', '50', '38', '1', '1', '1', '-', '2', '-', '11', '3.19s', '3', '15', '-', '1', '-', '21', '36', '1.62', '33.3%', '2', '2', '2', '-', '3', '-', '-', '-', '-', '-', '-', '-', '-', '1', '-', '-', '-', '-', '-', '-', '80:00', '-']
[185, 7, 462, '2', 'Winger', '80:00', '8', '2', '-', '-', '-', '-', '-', '26', '8', '89', '25', '29', '1', '-', '-', '-', '-', '-', '5', '4.39s', '-', '-', '-', '2', '-', '1', '13', '0.13', '57.1%', '4', '3', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '2', '-', '-', '-', '-', '-', '-', '80:00', '-']
[623, 7, 462, '3', 'Centre', '80:00', '8', '2', '-', '-', '-', '-', '-', '51', '14', '118', '-', '27', '1', '1', '-', '-', '4', '1', '11', '3.03s', '-', '-', '-', '-', '-', '1', '16', '0.07', '83.3%', '10', '2', '-', '-', '-', '1', '17', '-', '-', '1', '-', '-', '-', '-', '-', '-', '-',

[361, 13, 462, '9', 'Hooker', '80:00', '-', '-', '-', '-', '-', '-', '-', '42', '8', '66', '-', '21', '-', '-', '-', '-', '2', '1', '6', '3.73s', '7', '57', '-', '-', '3', '98', '104', '12.25', '89.7%', '35', '3', '1', '-', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-']
[363, 13, 462, '10', 'Prop', '43:00', '-', '-', '-', '-', '-', '-', '-', '23', '9', '92', '50', '24', '-', '-', '-', '-', '1', '5', '9', '3.59s', '-', '-', '-', '-', '-', '-', '9', '-', '89.7%', '26', '2', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '3', '-', '1', '-', '17:18', '31:12']
[364, 13, 462, '11', '2nd Row', '80:00', '-', '-', '-', '-', '-', '-', '-', '35', '16', '154', '-', '71', '-', '-', '-', '-', '2', '15', '14', '3.14s', '-', '-', '-', '1', '1', '8', '23', '0.5', '75.9%', '22', '4', '3', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '1', '1', '-', '-', '-', '-', '-', '80:00', '-']
[529, 13, 462, '12', '2nd Row', '70:00', '-', '-'

{'Billy_Slater_7': [568, 7, 462, '1', 'Fullback', '80:00', '-', '-', '-', '-', '-', '-', '-', '28', '13', '105', '50', '38', '1', '1', '1', '-', '2', '-', '11', '3.19s', '3', '15', '-', '1', '-', '21', '36', '1.62', '33.3%', '2', '2', '2', '-', '3', '-', '-', '-', '-', '-', '-', '-', '-', '1', '-', '-', '-', '-', '-', '-', '80:00', '-'], 'Suliasi_Vunivalu_7': [185, 7, 462, '2', 'Winger', '80:00', '8', '2', '-', '-', '-', '-', '-', '26', '8', '89', '25', '29', '1', '-', '-', '-', '-', '-', '5', '4.39s', '-', '-', '-', '2', '-', '1', '13', '0.13', '57.1%', '4', '3', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '2', '-', '-', '-', '-', '-', '-', '80:00', '-'], 'Cheyse_Blair_7': [623, 7, 462, '3', 'Centre', '80:00', '8', '2', '-', '-', '-', '-', '-', '51', '14', '118', '-', '27', '1', '1', '-', '-', '4', '1', '11', '3.03s', '-', '-', '-', '-', '-', '1', '16', '0.07', '83.3%', '10', '2', '-', '-', '-', '1', '17', '-', '-', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80

https://www.nrl.com/draw/nrl-premiership/2018/finals-week-1/panthers-vs-warriors/
[316, 12, 463, '1', 'Fullback', '80:00', '-', '-', '-', '-', '-', '-', '-', '55', '20', '189', '77', '44', '1', '-', '1', '-', '7', '-', '15', '3.62s', '-', '-', '-', '1', '-', '12', '29', '0.6', '66.7%', '4', '1', '1', '-', '3', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-']
[323, 12, 463, '2', 'Winger', '80:00', '-', '-', '-', '-', '-', '-', '-', '32', '21', '180', '37', '70', '-', '-', '-', '-', '2', '1', '18', '3.34s', '3', '21', '-', '-', '-', '6', '24', '0.29', '80%', '4', '-', '1', '-', '2', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-']
[326, 12, 463, '3', 'Centre', '80:00', '-', '-', '-', '-', '-', '-', '-', '10', '5', '41', '-', '5', '-', '-', '-', '-', '3', '-', '4', '2.93s', '2', '22', '-', '-', '-', '5', '8', '1', '52.9%', '9', '6', '2', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '1', '-'

[225, 9, 463, '9', 'Hooker', '80:00', '4', '1', '-', '-', '-', '-', '-', '54', '3', '31', '-', '6', '1', '-', '-', '-', '4', '-', '2', '4.56s', '3', '31', '-', '-', '-', '77', '80', '25.67', '85%', '34', '5', '1', '-', '1', '2', '55', '1', '-', '1', '-', '-', '-', '-', '-', '-', '1', '-', '-', '-', '80:00', '-']
[221, 9, 463, '10', 'Prop', '48:00', '-', '-', '-', '-', '-', '-', '-', '41', '9', '109', '53', '41', '-', '-', '-', '-', '4', '6', '7', '3.88s', '-', '-', '-', '2', '1', '-', '9', '-', '90.5%', '19', '1', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '1', '-', '-', '-', '29:40', '18:05']
[220, 9, 463, '11', '2nd Row', '59:00', '-', '-', '-', '-', '-', '-', '-', '34', '6', '49', '-', '19', '-', '-', '-', '-', '-', '6', '4', '2.98s', '-', '-', '-', '1', '-', '4', '10', '0.67', '97%', '32', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '1', '1', '-', '-', '-', '-', '-', '58:44', '-']
[234, 9, 463, '12', '2nd Row', '80:00', '-', '-', '-', '-',

{'Dallin_Watene-zelezniak_12': [316, 12, 463, '1', 'Fullback', '80:00', '-', '-', '-', '-', '-', '-', '-', '55', '20', '189', '77', '44', '1', '-', '1', '-', '7', '-', '15', '3.62s', '-', '-', '-', '1', '-', '12', '29', '0.6', '66.7%', '4', '1', '1', '-', '3', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-'], 'Josh_Mansour_12': [323, 12, 463, '2', 'Winger', '80:00', '-', '-', '-', '-', '-', '-', '-', '32', '21', '180', '37', '70', '-', '-', '-', '-', '2', '1', '18', '3.34s', '3', '21', '-', '-', '-', '6', '24', '0.29', '80%', '4', '-', '1', '-', '2', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-'], 'Waqa_Blake_12': [326, 12, 463, '3', 'Centre', '80:00', '-', '-', '-', '-', '-', '-', '-', '10', '5', '41', '-', '5', '-', '-', '-', '-', '3', '-', '4', '2.93s', '2', '22', '-', '-', '-', '5', '8', '1', '52.9%', '9', '6', '2', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '1', '-', '-', '-', '

https://www.nrl.com/draw/nrl-premiership/2018/finals-week-1/roosters-vs-sharks/
[432, 15, 464, '1', 'Fullback', '80:00', '-', '-', '-', '-', '-', '-', '-', '67', '23', '240', '76', '76', '-', '3', '2', '-', '8', '1', '22', '3.59s', '5', '47', '-', '-', '-', '8', '32', '0.35', '-', '-', '1', '-', '-', '5', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-']
[431, 15, 464, '2', 'Winger', '80:00', '4', '1', '-', '-', '-', '-', '-', '44', '21', '191', '38', '78', '1', '-', '1', '-', '3', '1', '21', '2.7s', '1', '11', '-', '-', '-', '6', '30', '0.29', '100%', '1', '-', '-', '-', '2', '-', '-', '-', '-', '-', '-', '-', '-', '2', '-', '-', '-', '-', '-', '-', '80:00', '-']
[486, 15, 464, '3', 'Centre', '80:00', '8', '1', '1', '4', '1', '40%', '-', '50', '10', '77', '-', '33', '-', '-', '-', '-', '6', '-', '7', '3.33s', '1', '5', '-', '-', '-', '1', '12', '0.1', '92.3%', '12', '-', '1', '-', '-', '1', '48', '-', '-', '-', '-', '-', '-', '-', '-', '-', '3', 

[473, 4, 464, '9', 'Hooker', '31:00', '-', '-', '-', '-', '-', '-', '-', '21', '1', '9', '-', '4', '-', '-', '-', '-', '-', '-', '1', '2.84s', '1', '9', '-', '-', '1', '35', '37', '35', '100%', '21', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '30:37', '-']
[474, 4, 464, '10', 'Prop', '33:00', '-', '-', '-', '-', '-', '-', '-', '27', '2', '17', '-', '8', '-', '-', '-', '-', '-', '2', '2', '3.12s', '-', '-', '-', '-', '-', '-', '2', '-', '96.6%', '28', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '20:17', '13:08']
[582, 4, 464, '11', '2nd Row', '80:00', '-', '-', '-', '-', '-', '-', '-', '37', '15', '151', '-', '64', '-', '-', '-', '-', '1', '11', '12', '3.41s', '4', '45', '-', '1', '-', '6', '20', '0.4', '87.9%', '29', '4', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '1', '1', '-', '1', '-', '-', '-', '80:00', '-']
[476, 4, 464, '12', '2nd Row', '13:00', '-', '-', '-', '-', '

{'James_Tedesco_15': [432, 15, 464, '1', 'Fullback', '80:00', '-', '-', '-', '-', '-', '-', '-', '67', '23', '240', '76', '76', '-', '3', '2', '-', '8', '1', '22', '3.59s', '5', '47', '-', '-', '-', '8', '32', '0.35', '-', '-', '1', '-', '-', '5', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-'], 'Daniel_Tupou_15': [431, 15, 464, '2', 'Winger', '80:00', '4', '1', '-', '-', '-', '-', '-', '44', '21', '191', '38', '78', '1', '-', '1', '-', '3', '1', '21', '2.7s', '1', '11', '-', '-', '-', '6', '30', '0.29', '100%', '1', '-', '-', '-', '2', '-', '-', '-', '-', '-', '-', '-', '-', '2', '-', '-', '-', '-', '-', '-', '80:00', '-'], 'Latrell_Mitchell_15': [486, 15, 464, '3', 'Centre', '80:00', '8', '1', '1', '4', '1', '40%', '-', '50', '10', '77', '-', '33', '-', '-', '-', '-', '6', '-', '7', '3.33s', '1', '5', '-', '-', '-', '1', '12', '0.1', '92.3%', '12', '-', '1', '-', '-', '1', '48', '-', '-', '-', '-', '-', '-', '-', '-', '-', '3', '-', '-', '-', 

https://www.nrl.com/draw/nrl-premiership/2018/finals-week-1/broncos-vs-dragons/
[22, 1, 465, '1', 'Fullback', '80:00', '4', '1', '-', '-', '-', '-', '-', '41', '12', '107', '40', '16', '1', '-', '-', '-', '6', '-', '4', '3.16s', '2', '23', '-', '-', '1', '20', '27', '1.67', '62.5%', '5', '2', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-']
[20, 1, 465, '2', 'Winger', '80:00', '-', '-', '-', '-', '-', '-', '-', '15', '10', '94', '43', '47', '-', '-', '-', '-', '2', '-', '10', '2.84s', '-', '-', '-', '-', '-', '-', '12', '-', '80%', '4', '-', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '2', '1', '-', '-', '-', '-', '-', '80:00', '-']
[27, 1, 465, '3', 'Centre', '80:00', '-', '-', '-', '-', '-', '-', '-', '46', '8', '103', '-', '15', '-', '-', '-', '-', '8', '-', '7', '3.18s', '-', '-', '-', '-', '-', '1', '8', '0.13', '88.9%', '16', '2', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', 

[391, 14, 465, '9', 'Hooker', '80:00', '-', '-', '-', '-', '-', '-', '-', '48', '5', '37', '-', '13', '-', '2', '3', '-', '-', '3', '4', '3.67s', '1', '11', '-', '-', '-', '99', '105', '19.8', '92.3%', '36', '3', '-', '-', '-', '1', '7', '-', '-', '1', '-', '-', '-', '1', '1', '-', '1', '-', '-', '-', '80:00', '-']
('Leeson', 'Ah Mau', '14')
[None, 14, 465, '10', 'Prop', '63:00', '4', '1', '-', '-', '-', '-', '-', '63', '14', '147', '39', '50', '1', '-', '-', '-', '2', '11', '12', '3.56s', '-', '-', '-', '-', '-', '-', '14', '-', '100%', '31', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '31:16', '32:14']
[407, 14, 465, '11', '2nd Row', '68:00', '-', '-', '-', '-', '-', '-', '-', '55', '11', '135', '-', '48', '1', '-', '-', '-', '3', '11', '9', '2.75s', '-', '-', '-', '1', '-', '1', '13', '0.09', '100%', '29', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '1', '1', '-', '-', '-', '-', '-', '68:11', '-']
[406, 14, 465, '12'

{'Darius_Boyd_1': [22, 1, 465, '1', 'Fullback', '80:00', '4', '1', '-', '-', '-', '-', '-', '41', '12', '107', '40', '16', '1', '-', '-', '-', '6', '-', '4', '3.16s', '2', '23', '-', '-', '1', '20', '27', '1.67', '62.5%', '5', '2', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-'], 'Corey_Oates_1': [20, 1, 465, '2', 'Winger', '80:00', '-', '-', '-', '-', '-', '-', '-', '15', '10', '94', '43', '47', '-', '-', '-', '-', '2', '-', '10', '2.84s', '-', '-', '-', '-', '-', '-', '12', '-', '80%', '4', '-', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '2', '1', '-', '-', '-', '-', '-', '80:00', '-'], 'James_Roberts_1': [27, 1, 465, '3', 'Centre', '80:00', '-', '-', '-', '-', '-', '-', '-', '46', '8', '103', '-', '15', '-', '-', '-', '-', '8', '-', '7', '3.18s', '-', '-', '-', '-', '-', '1', '8', '0.13', '88.9%', '16', '2', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-'], 'Jor

https://www.nrl.com/draw/nrl-premiership/2018/finals-week-2/sharks-vs-panthers/
error: https://www.nrl.com/draw/nrl-premiership/2018/finals-week-2/sharks-vs-panthers/
{}
https://www.nrl.com/draw/nrl-premiership/2018/finals-week-2/rabbitohs-vs-dragons/
error: https://www.nrl.com/draw/nrl-premiership/2018/finals-week-2/rabbitohs-vs-dragons/
{}
https://www.nrl.com/draw/nrl-premiership/2018/finals-week-3/storm-vs-sharks/
error: https://www.nrl.com/draw/nrl-premiership/2018/finals-week-3/storm-vs-sharks/
{}
https://www.nrl.com/draw/nrl-premiership/2018/finals-week-3/roosters-vs-rabbitohs/
error: https://www.nrl.com/draw/nrl-premiership/2018/finals-week-3/roosters-vs-rabbitohs/
{}
https://www.nrl.com/draw/nrl-premiership/2018/grand-final/roosters-vs-storm/
[432, 15, 470, '1', 'Fullback', '80:00', '-', '-', '-', '-', '-', '-', '-', '37', '17', '214', '99', '53', '-', '1', '1', '-', '1', '-', '18', '3.45s', '1', '5', '-', '-', '2', '6', '25', '0.35', '100%', '7', '-', '-', '-', '1', '-', '-', 

[174, 7, 470, '7', 'Halfback', '80:00', '-', '-', '-', '-', '-', '-', '-', '46', '6', '42', '-', '14', '-', '-', '-', '-', '3', '-', '3', '3.03s', '-', '-', '-', '1', '3', '24', '36', '4', '94.4%', '17', '1', '-', '-', '-', '8', '242', '2', '-', '3', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-']
[164, 7, 470, '8', 'Prop', '39:00', '-', '-', '-', '-', '-', '-', '-', '39', '8', '74', '-', '22', '-', '-', '-', '-', '1', '8', '7', '2.89s', '-', '-', '-', '1', '1', '2', '10', '0.25', '96.7%', '29', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '20:46', '18:17']
[159, 7, 470, '9', 'Hooker', '80:00', '2', '-', '1', '1', '-', '100%', '-', '59', '1', '7', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '1', '7', '-', '-', '5', '90', '85', '90', '94.6%', '53', '1', '2', '-', '-', '3', '163', '-', '-', '-', '-', '-', '1', '-', '-', '-', '1', '-', '-', '-', '80:00', '-']
('Tim', 'Glasby', '7')
[None, 7, 470, '10', 'Prop', '30:00'

{'James_Tedesco_15': [432, 15, 470, '1', 'Fullback', '80:00', '-', '-', '-', '-', '-', '-', '-', '37', '17', '214', '99', '53', '-', '1', '1', '-', '1', '-', '18', '3.45s', '1', '5', '-', '-', '2', '6', '25', '0.35', '100%', '7', '-', '-', '-', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '1', '-', '-', '-', '80:00', '-'], 'Daniel_Tupou_15': [431, 15, 470, '2', 'Winger', '80:00', '4', '1', '-', '-', '-', '-', '-', '44', '19', '184', '38', '37', '1', '-', '-', '-', '4', '-', '17', '3.2s', '1', '9', '-', '-', '-', '-', '20', '-', '75%', '3', '1', '-', '-', '1', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '80:00', '-'], 'Latrell_Mitchell_15': [486, 15, 470, '3', 'Centre', '80:00', '12', '1', '1', '3', '3', '66.67%', '-', '46', '10', '59', '-', '10', '1', '-', '-', '-', '4', '-', '6', '3.79s', '-', '-', '-', '1', '-', '7', '18', '0.7', '84.6%', '11', '2', '-', '-', '2', '1', '12', '-', '-', '-', '-', '-', '-', '-', '-', '-', '1', '-', '-', '-',

                                                                                 0   \
errors                            https://www.nrl.com/draw/nrl-premiership/2018/...   
finals_week_1_storm_rabbitohs                                        Billy_Slater_7   
finals_week_1_panthers_warriors                          Dallin_Watene-zelezniak_12   
finals_week_1_roosters_sharks                                      James_Tedesco_15   
finals_week_1_broncos_dragons                                         Darius_Boyd_1   
finals_week_2_sharks_panthers                                                     0   
finals_week_2_rabbitohs_dragons                                                   0   
finals_week_3_storm_sharks                                                        0   
finals_week_3_roosters_rabbitohs                                                  0   
grand_final_roosters_storm                                         James_Tedesco_15   

                                          

[10 rows x 34 columns]


{'errors': ['https://www.nrl.com/draw/nrl-premiership/2018/finals-week-2/sharks-vs-panthers/', 'https://www.nrl.com/draw/nrl-premiership/2018/finals-week-2/rabbitohs-vs-dragons/', 'https://www.nrl.com/draw/nrl-premiership/2018/finals-week-3/storm-vs-sharks/', 'https://www.nrl.com/draw/nrl-premiership/2018/finals-week-3/roosters-vs-rabbitohs/'], 'finals_week_1_storm_rabbitohs': {'Billy_Slater_7': [568, 7, 462, '1', 'Fullback', '80:00', '-', '-', '-', '-', '-', '-', '-', '28', '13', '105', '50', '38', '1', '1', '1', '-', '2', '-', '11', '3.19s', '3', '15', '-', '1', '-', '21', '36', '1.62', '33.3%', '2', '2', '2', '-', '3', '-', '-', '-', '-', '-', '-', '-', '-', '1', '-', '-', '-', '-', '-', '-', '80:00', '-'], 'Suliasi_Vunivalu_7': [185, 7, 462, '2', 'Winger', '80:00', '8', '2', '-', '-', '-', '-', '-', '26', '8', '89', '25', '29', '1', '-', '-', '-', '-', '-', '5', '4.39s', '-', '-', '-', '2', '-', '1', '13', '0.13', '57.1%', '4', '3', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',

In [111]:
year_dict.keys()
errors = year_dict['errors']
year_dict.pop('errors')

['https://www.nrl.com/draw/nrl-premiership/2018/finals-week-2/sharks-vs-panthers/',
 'https://www.nrl.com/draw/nrl-premiership/2018/finals-week-2/rabbitohs-vs-dragons/',
 'https://www.nrl.com/draw/nrl-premiership/2018/finals-week-3/storm-vs-sharks/',
 'https://www.nrl.com/draw/nrl-premiership/2018/finals-week-3/roosters-vs-rabbitohs/']

In [128]:
year_df = pd.DataFrame()
for match in year_dict.keys():
    match_df = pd.DataFrame.from_dict(year_dict[match], orient='index', columns=column_names).reset_index()
    year_df = year_df.append(match_df, ignore_index = True)
year_df = year_df.replace('-', 0).replace({pd.np.nan: 0})
year_df['average_play_the_ball_seconds'] = year_df['average_play_the_ball_seconds'].str.replace('s', '').astype('float')

# for column in ['minutes_played', 'stint_one', 'stint_two']:
#     year_df[column] = year_df[column].str.replace(':', '.').astype('float')
year_df

# for column in ['conversion_percentage', 'tackle_percentage']:
#     year_df[column] = year_df[column].str.replace('%', '').astype('float') / 100
#     year_df[column] = year_df[column].round(3)
#     for column in ['minutes_played', 'stint_one', 'stint_two']:
#         year_df[column] = year_df[column].str.replace(':', '.').astype('float')

# year_df['average_play_the_ball_seconds'] = year_df['average_play_the_ball_seconds'].str.replace('s', '').astype('float')
# year_df['minutes_played'] = year_df['minutes_played'].str.replace(':00', '').astype(int)
# year_df['position_id'] = year_df['position'].apply(find_position_id)
year_df

Unnamed: 0,index,player_id,team_id,match_id,number,position,minutes_played,points,tries,conversions,...,kicked_dead,errors,handling_errors,one_on_ones_lost,penalties,on_report,sin_bins,send_offs,stint_one,stint_two
0,Billy_Slater_7,568.0,7,462,1,Fullback,80.0,0,0,0,...,0,1,0,0,0,0,0,0,80.00,
1,Suliasi_Vunivalu_7,185.0,7,462,2,Winger,80.0,8,2,0,...,0,2,0,0,0,0,0,0,80.00,
2,Cheyse_Blair_7,623.0,7,462,3,Centre,80.0,8,2,0,...,0,0,0,0,0,0,0,0,80.00,
3,Curtis_Scott_7,177.0,7,462,4,Centre,80.0,4,1,0,...,0,0,0,0,1,0,0,0,80.00,
4,Josh_Addo-carr_7,179.0,7,462,5,Winger,80.0,0,0,0,...,0,3,2,0,1,0,0,0,80.00,
5,Cameron_Munster_7,175.0,7,462,6,Five-Eighth,80.0,1,0,0,...,1,0,0,0,0,0,0,0,80.00,
6,Brodie_Croft_7,174.0,7,462,19,Halfback,80.0,0,0,0,...,0,0,0,0,0,0,0,0,80.00,
7,Jesse_Bromwich_7,164.0,7,462,8,Prop,55.0,0,0,0,...,0,1,1,0,0,0,0,0,25.49,28.38
8,Cameron_Smith_7,159.0,7,462,9,Hooker,80.0,8,0,3,...,0,0,0,0,2,0,0,0,80.00,
9,Tim_Glasby_7,0.0,7,462,10,Prop,33.0,0,0,0,...,0,0,0,0,0,0,0,0,26.46,6.27


In [None]:
    csv_data = pd.DataFrame.from_dict(player_match_stats, orient='index').replace('-', 0)

#     csv_data = csv_data.replace({pd.np.nan: 0})
#         #clean csv_data
#     for column in ['conversion_percentage', 'tackle_percentage']:
#         csv_data[column] = csv_data[column].str.replace('%', '').astype('float') / 100
#         csv_data[column] = csv_data[column].round(3)

#     for column in ['minutes_played', 'stint_one', 'stint_two']:
#         csv_data[column] = csv_data[column].str.replace(':', '.').astype('float')

#     csv_data['average_play_the_ball_seconds'] = csv_data['average_play_the_ball_seconds'].str.replace('s', '').astype('float')
#             #csv_data['minutes_played'] = csv_data['minutes_played'].str.replace(':00', '').astype(int)
#     csv_data['position_id'] = csv_data['position'].apply(find_position_id)

    print(csv_data)