In [1]:
#import packages required
import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
import numpy as np
from functools import reduce
import urllib
import urllib.request as ur


In [2]:
#categories of stats available 

categories = {"Off the tee":"ROTT","Approach the green":"RAPP","Around the green":"RARG","Putting":"RPUT",
              'Scoring':"RSCR","Streaks":"RSTR","Money/Finishes":"RMNY","Points/Rankings":"RPTS"}
print ("Breakdown of statistics categories:")
for key in categories:
    print (key)

Breakdown of statistics categories:
Off the tee
Approach the green
Around the green
Putting
Scoring
Streaks
Money/Finishes
Points/Rankings


In [3]:
def category_links(category):
    
    pga_link = "http://www.pgatour.com/stats/categories."+categories[category]+"_INQ.html"
    pga_link = ur.urlopen(pga_link)
    soup = BeautifulSoup(pga_link,'lxml')
    links = []
    clean_links = {}

    for element in soup.find_all('div',class_="table-content clearfix"):
        for link in element.findAll('a'):
            test = link.get('href')
            clean_links[link.find(text=True)] = test.replace("/content/pgatour", "http://www.pgatour.com")

    return clean_links;


In [4]:
#input your category and will return stats available with corresponding link
#enter catregory in inverted commas, eg "Scoring"

category_links("Approach the green")

{'Approach 100-125 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.028.html',
 'Approach 125-150 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.473.html',
 'Approach 150-175 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.478.html',
 'Approach 175-200 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.479.html',
 'Approach 200-225 yards (RTP)': 'http://www.pgatour.com/stats/stat.02376.html',
 'Approach 225-250 yards (RTP)': 'http://www.pgatour.com/stats/stat.02377.html',
 'Approach 250-275 yards (RTP)': 'http://www.pgatour.com/stats/stat.02378.html',
 'Approach 50-75 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.030.html',
 'Approach 75-100 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.029.html',
 'Approach < 125 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.472.html',
 'Approach > 200 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.480.html',
 'Approach > 275 yards (RTP)': 'http://www.pgatour.com/stats/stat.02379.html',
 'App

In [5]:
#put all urls for the stats in here
urls = [ 
    "https://www.pgatour.com/stats/stat.02567.html", 
    "https://www.pgatour.com/stats/stat.02674.html",
    "https://www.pgatour.com/stats/stat.142.html",
    "http://www.pgatour.com/stats/stat.143.html",
    "http://www.pgatour.com/stats/stat.158.html",
    "https://www.pgatour.com/stats/stat.102.html",
    "https://www.pgatour.com/stats/stat.02428.html",
    "https://www.pgatour.com/stats/stat.130.html",
    "https://www.pgatour.com/stats/stat.02569.html",
    "https://www.pgatour.com/stats/stat.437.html",
    "https://www.pgatour.com/stats/stat.360.html"]

#these are the names of the stats, just copy and paste these into columnNames below
stats = [ 'SGoffTheTee','SGteeToGreen',"Par3 Scoring_Avergage",
         "Par4 Scoring_Avergage","Ball Striking","Driving Accuracy",
         "Total Putting","Scrambling", "SG:AroundTheGreen",
         "Rough Proximity","Birdie/Better_125-150"]

columnNames = [ "Player",'SGoffTheTee',  'SGteeToGreen',
         "Par3 Scoring_Avergage","Par4 Scoring_Avergage","Ball Striking",
        "Driving Accuracy","Total Putting","Scrambling",
        "SG:AroundTheGreen","Rough Proximity","Birdie/Better_125-150"]
#weights need to be in the correct order for the stats above
weights = [1,1,1,1,3,3,2,2,1,1,1]

stats


['SGoffTheTee',
 'SGteeToGreen',
 'Par3 Scoring_Avergage',
 'Par4 Scoring_Avergage',
 'Ball Striking',
 'Driving Accuracy',
 'Total Putting',
 'Scrambling',
 'SG:AroundTheGreen',
 'Rough Proximity',
 'Birdie/Better_125-150']

In [6]:
#function to call the players name and associated rank for each stat
def scrape_money_leaders(url):
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'lxml')
  money = []
  for item in soup.findAll("td", "player-name"):
      money.append(item.text)
  return money

In [7]:
#empty array dfs which will store a dataframe for each stat
dfs = []

for i in range(0,len(urls)):
    stats[i] = scrape_money_leaders(urls[i])
    columns = ['Player']
    stats[i] = pd.DataFrame(stats[i],columns=columns)
    stats[i].insert(1, 'Rank', range(1, 1 + len(stats[i])))
    dfs.append(stats[i])

#then merge each of the dataframes on the equality of the player name
#this means each player will have their name, then their associcated stats in one dataframe
df_final = reduce(lambda left,right: pd.merge(left,right,on='Player'), dfs)
df_final.columns = columnNames

#this loops throguh each feature and multiplies by by the relevant weight
for i in range(1, len(columnNames)-1):
    df_final[columnNames[i]] = df_final[columnNames[i]]*weights[i-1]

#add the contents of each column to give a total for each player
df_final['Total'] = df_final.sum(axis=1)

#sorts these totals so the lowest totals are at the top
df_stats = df_final.sort_values(by='Total', ascending=True)
df_stats = df_stats.replace(to_replace=r'\n', value='', regex=True)

In [8]:
#temporary scrape of field list - will look to generalise this
field_link = "http://www.deananddelucainvitational.com/players/committed-players"
field_page = ur.urlopen(field_link)
field_soup = BeautifulSoup(field_page,'lxml')

player_list = field_soup.find_all("span", class_="field-content")

field =[]

for player in  player_list:
    field.append(player.text)
    
field = pd.DataFrame(data=field)
field.columns = ["Player"]

field = field.replace(to_replace=r'  ', value=' ', regex=True)

In [9]:
#stats of players playing in the event
df_predictions = pd.merge(field,df_stats,on="Player")
df_predictions = df_predictions.sort_values(by='Total', ascending=True)
df_predictions

Unnamed: 0,Player,SGoffTheTee,SGteeToGreen,Par3 Scoring_Avergage,Par4 Scoring_Avergage,Ball Striking,Driving Accuracy,Total Putting,Scrambling,SG:AroundTheGreen,Rough Proximity,Birdie/Better_125-150,Total
75,Chez Reavie,48,40,4,11,129,9,68,26,130,56,21,542
27,Emiliano Grillo,25,24,8,17,21,51,74,100,129,200,1,650
94,Kevin Streelman,22,15,108,16,18,60,330,20,43,4,124,760
90,Jordan Spieth,18,2,52,8,51,255,394,6,13,13,4,816
77,Justin Rose,21,30,92,7,36,249,162,52,54,67,53,823
22,Rickie Fowler,63,28,58,5,39,165,300,12,26,180,12,888
39,Zach Johnson,89,23,3,20,276,228,148,84,29,51,7,958
13,Austin Cook,77,87,6,74,243,129,20,86,44,31,169,966
32,Brian Harman,60,81,132,19,111,63,184,178,112,5,37,982
16,Bryson DeChambeau,14,12,113,30,30,240,172,200,106,60,59,1036


In [18]:
df_predictions.isnull().any().any()

False

In [10]:
#players in the field we dont have stats for
field_players = set(field["Player"])
stats_players = set(df_stats["Player"])

print(field_players - stats_players)

{'David     Hearn', 'John Senden', 'Keith Clearwater', 'Brooks Koepka', 'Maverick McNealy', 'Joaquin Niemann', 'Shubhankar Sharma', 'Satoshi Kodaira', 'J. J. Henry', 'Jim Furyk', 'Steve Stricker', 'Tim Herron', 'Ted Potter Jr.'}


In [11]:
#search for player within field
df_predictions.loc[df_predictions['Player'] == "Brooks Koepka"]

Unnamed: 0,Player,SGoffTheTee,SGteeToGreen,Par3 Scoring_Avergage,Par4 Scoring_Avergage,Ball Striking,Driving Accuracy,Total Putting,Scrambling,SG:AroundTheGreen,Rough Proximity,Birdie/Better_125-150,Total


In [12]:
#search for player outside of field
df_stats.loc[df_stats['Player'] == "Ted Potter, Jr."]

Unnamed: 0,Player,SGoffTheTee,SGteeToGreen,Par3 Scoring_Avergage,Par4 Scoring_Avergage,Ball Striking,Driving Accuracy,Total Putting,Scrambling,SG:AroundTheGreen,Rough Proximity,Birdie/Better_125-150,Total
123,"Ted Potter, Jr.",124,129,190,165,507,117,186,164,41,40,79,1742


In [13]:
#save as csv
df_predictions.to_csv( "predictions.csv")


In [16]:
df_predictions.reset_index(drop=True)

Unnamed: 0,Player,SGoffTheTee,SGteeToGreen,Par3 Scoring_Avergage,Par4 Scoring_Avergage,Ball Striking,Driving Accuracy,Total Putting,Scrambling,SG:AroundTheGreen,Rough Proximity,Birdie/Better_125-150,Total
0,Chez Reavie,48,40,4,11,129,9,68,26,130,56,21,542
1,Emiliano Grillo,25,24,8,17,21,51,74,100,129,200,1,650
2,Kevin Streelman,22,15,108,16,18,60,330,20,43,4,124,760
3,Jordan Spieth,18,2,52,8,51,255,394,6,13,13,4,816
4,Justin Rose,21,30,92,7,36,249,162,52,54,67,53,823
5,Rickie Fowler,63,28,58,5,39,165,300,12,26,180,12,888
6,Zach Johnson,89,23,3,20,276,228,148,84,29,51,7,958
7,Austin Cook,77,87,6,74,243,129,20,86,44,31,169,966
8,Brian Harman,60,81,132,19,111,63,184,178,112,5,37,982
9,Bryson DeChambeau,14,12,113,30,30,240,172,200,106,60,59,1036


In [31]:
df_predictions.loc[df_predictions['Player'] == "Kevin Na"]

Unnamed: 0,Player,SGoffTheTee,SGteeToGreen,Par3 Scoring_Avergage,Par4 Scoring_Avergage,Ball Striking,Driving Accuracy,Total Putting,Scrambling,SG:AroundTheGreen,Rough Proximity,Birdie/Better_125-150,Total
62,Kevin Na,189,63,25,100,447,153,158,18,1,175,132,1461


In [22]:
#field strength
def scrape_owgr(url):
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'lxml')
  ranks = []
  for item in soup.findAll("td", "name"):
      ranks.append(item.text)
  return ranks

In [74]:
owgr = scrape_owgr("http://www.owgr.com/ranking")
columns = ['Player']
df_owgr = pd.DataFrame(owgr,columns=columns)

df_owgr.insert(1, 'Rank', range(1, 1 + len(df_owgr)))
df_owgr_field = pd.concat([df_owgr.set_index('Player'),field.set_index('Player')], axis=1, join='inner').reset_index()
#add all the ranks
Total = df_owgr_field['Rank'].sum()
avgRank = Total / len(df_owgr_field.index)
#print the avg rank for the field
print(avgRank.round())
#shows some level of strength of field, could be good to see whether players perform if other good players playing
#only considers top 100 ranked players though

49.0
