In [20]:
#import packages required
import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
import numpy as np
from functools import reduce
import urllib
import urllib.request as ur


In [21]:
#categories of stats available 

categories = {"Off the tee":"ROTT","Approach the green":"RAPP","Around the green":"RARG","Putting":"RPUT",
              'Scoring':"RSCR","Streaks":"RSTR","Money/Finishes":"RMNY","Points/Rankings":"RPTS"}
print ("Breakdown of statistics categories:")
for key in categories:
    print (key)

Breakdown of statistics categories:
Off the tee
Approach the green
Around the green
Putting
Scoring
Streaks
Money/Finishes
Points/Rankings


In [22]:
def category_links(category):
    
    pga_link = "http://www.pgatour.com/stats/categories."+categories[category]+"_INQ.html"
    pga_link = ur.urlopen(pga_link)
    soup = BeautifulSoup(pga_link,'lxml')
    links = []
    clean_links = {}

    for element in soup.find_all('div',class_="table-content clearfix"):
        for link in element.findAll('a'):
            test = link.get('href')
            clean_links[link.find(text=True)] = test.replace("/content/pgatour", "http://www.pgatour.com")

    return clean_links;


In [23]:
#input your category and will return stats available with corresponding link
#enter catregory in inverted commas, eg "Scoring"

category_links("Approach the green")

{'Approach 100-125 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.028.html',
 'Approach 125-150 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.473.html',
 'Approach 150-175 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.478.html',
 'Approach 175-200 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.479.html',
 'Approach 200-225 yards (RTP)': 'http://www.pgatour.com/stats/stat.02376.html',
 'Approach 225-250 yards (RTP)': 'http://www.pgatour.com/stats/stat.02377.html',
 'Approach 250-275 yards (RTP)': 'http://www.pgatour.com/stats/stat.02378.html',
 'Approach 50-75 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.030.html',
 'Approach 75-100 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.029.html',
 'Approach < 125 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.472.html',
 'Approach > 200 yards (RTP Score)': 'http://www.pgatour.com/stats/stat.480.html',
 'Approach > 275 yards (RTP)': 'http://www.pgatour.com/stats/stat.02379.html',
 'App

In [24]:
#put all urls for the stats in here
urls = [ 
    "https://www.pgatour.com/stats/stat.02567.html", 
    "https://www.pgatour.com/stats/stat.02674.html",
    "https://www.pgatour.com/stats/stat.142.html",
    "http://www.pgatour.com/stats/stat.143.html",
    "http://www.pgatour.com/stats/stat.158.html",
    "https://www.pgatour.com/stats/stat.102.html",
    "https://www.pgatour.com/stats/stat.02428.html",
    "https://www.pgatour.com/stats/stat.130.html",
    "https://www.pgatour.com/stats/stat.02569.html",
    "https://www.pgatour.com/stats/stat.437.html",
    "https://www.pgatour.com/stats/stat.360.html"]

#these are the names of the stats, just copy and paste these into columnNames below
stats = [ 'SGoffTheTee','SGteeToGreen',"Par3 Scoring_Avergage",
         "Par4 Scoring_Avergage","Ball Striking","Driving Accuracy",
         "Total Putting","Scrambling", "SG:AroundTheGreen",
         "Rough Proximity","Birdie/Better_125-150"]

columnNames = [ "Player",'SGoffTheTee',  'SGteeToGreen',
         "Par3 Scoring_Avergage","Par4 Scoring_Avergage","Ball Striking",
        "Driving Accuracy","Total Putting","Scrambling",
        "SG:AroundTheGreen","Rough Proximity","Birdie/Better_125-150"]
#weights need to be in the correct order for the stats above
weights = [1,1,1,1,3,3,2,2,1,1,1]

stats


['SGoffTheTee',
 'SGteeToGreen',
 'Par3 Scoring_Avergage',
 'Par4 Scoring_Avergage',
 'Ball Striking',
 'Driving Accuracy',
 'Total Putting',
 'Scrambling',
 'SG:AroundTheGreen',
 'Rough Proximity',
 'Birdie/Better_125-150']

In [25]:
#function to call the players name and associated rank for each stat
def scrape_money_leaders(url):
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'lxml')
  money = []
  for item in soup.findAll("td", "player-name"):
      money.append(item.text)
  return money

In [26]:
#empty array dfs which will store a dataframe for each stat
dfs = []

for i in range(0,len(urls)):
    stats[i] = scrape_money_leaders(urls[i])
    columns = ['Player']
    stats[i] = pd.DataFrame(stats[i],columns=columns)
    stats[i].insert(1, 'Rank', range(1, 1 + len(stats[i])))
    dfs.append(stats[i])

#then merge each of the dataframes on the equality of the player name
#this means each player will have their name, then their associcated stats in one dataframe
df_final = reduce(lambda left,right: pd.merge(left,right,on='Player'), dfs)
df_final.columns = columnNames

#this loops throguh each feature and multiplies by by the relevant weight
for i in range(1, len(columnNames)-1):
    df_final[columnNames[i]] = df_final[columnNames[i]]*weights[i-1]

#add the contents of each column to give a total for each player
df_final['Total'] = df_final.sum(axis=1)

#sorts these totals so the lowest totals are at the top
df_stats = df_final.sort_values(by='Total', ascending=True)
df_stats = df_stats.replace(to_replace=r'\n', value='', regex=True)

In [27]:
#temporary scrape of field list - will look to generalise this
field_link = "http://www.deananddelucainvitational.com/players/committed-players"
field_page = ur.urlopen(field_link)
field_soup = BeautifulSoup(field_page,'lxml')

player_list = field_soup.find_all("span", class_="field-content")

field =[]

for player in  player_list:
    field.append(player.text)
    
field = pd.DataFrame(data=field)
field.columns = ["Player"]

field = field.replace(to_replace=r'  ', value=' ', regex=True)

In [28]:
#stats of players playing in the event
df_predictions = pd.merge(field,df_stats,on="Player")
df_predictions = df_predictions.sort_values(by='Total', ascending=True)
df_predictions

Unnamed: 0,Player,SGoffTheTee,SGteeToGreen,Par3 Scoring_Avergage,Par4 Scoring_Avergage,Ball Striking,Driving Accuracy,Total Putting,Scrambling,SG:AroundTheGreen,Rough Proximity,Birdie/Better_125-150,Total
28,Emiliano Grillo,28,26,2,12,33,51,28,78,121,196,1,576
77,Chez Reavie,45,39,20,11,135,9,100,50,141,72,8,630
79,Justin Rose,15,7,90,2,21,231,150,46,47,26,29,664
96,Kevin Streelman,25,23,115,18,12,54,326,28,67,3,115,786
92,Jordan Spieth,18,2,38,8,60,300,374,12,8,8,5,833
22,Rickie Fowler,69,38,52,7,81,189,180,14,35,160,23,848
45,Chris Kirk,94,29,26,45,207,174,204,20,38,62,63,962
13,Austin Cook,75,84,5,94,243,126,26,96,43,28,166,986
33,Brian Harman,50,78,137,14,126,63,134,222,128,5,30,987
16,Bryson DeChambeau,12,11,105,21,27,243,222,162,95,44,66,1008


In [29]:
#add odds to the table
def scrape_money_leaders(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    return soup

oddsScrape = scrape_money_leaders("https://m.skybet.com/golf/memorial/event/22317959")

#players
players = oddsScrape.findAll("b", "cell-text__line")

playerNames = []

for player in players:
    playerNames.append(player.text)

df_players = pd.DataFrame(data=playerNames)
df_players.columns = ["Player"]

#odds
odds = oddsScrape.findAll("span", "js-oc-price")

playerOdds = []

for odd in odds:
    playerOdds.append(odd.text)

df_odds = pd.DataFrame(data=playerOdds)
df_odds.columns = ["Odds"]
df_odds = df_odds.replace(to_replace=r'\n            ', value='', regex=True)

#odds and players
odd_players = pd.concat([df_players, df_odds], axis=1, join='inner')

df_predictions1 = pd.merge(df_predictions, odd_players, on="Player")
df_predictions1

Unnamed: 0,Player,SGoffTheTee,SGteeToGreen,Par3 Scoring_Avergage,Par4 Scoring_Avergage,Ball Striking,Driving Accuracy,Total Putting,Scrambling,SG:AroundTheGreen,Rough Proximity,Birdie/Better_125-150,Total,Odds
0,Emiliano Grillo,28,26,2,12,33,51,28,78,121,196,1,576,45/1
1,Justin Rose,15,7,90,2,21,231,150,46,47,26,29,664,14/1
2,Kevin Streelman,25,23,115,18,12,54,326,28,67,3,115,786,125/1
3,Jordan Spieth,18,2,38,8,60,300,374,12,8,8,5,833,18/1
4,Rickie Fowler,69,38,52,7,81,189,180,14,35,160,23,848,20/1
5,Chris Kirk,94,29,26,45,207,174,204,20,38,62,63,962,125/1
6,Austin Cook,75,84,5,94,243,126,26,96,43,28,166,986,200/1
7,Russell Knox,73,44,121,33,75,111,342,128,45,71,45,1088,125/1
8,Zach Johnson,103,31,15,32,339,279,160,88,20,47,18,1132,80/1
9,Adam Hadwin,90,33,70,30,162,159,156,238,42,37,117,1134,100/1


In [30]:
df_predictions.isnull().any().any()

False

In [31]:
#players in the field we dont have stats for
field_players = set(field["Player"])
stats_players = set(df_stats["Player"])

print(field_players - stats_players)

{'Steve Stricker', 'Brooks Koepka', 'Tim Herron', 'Shubhankar Sharma', 'Ted Potter Jr.', 'Joaquin Niemann', 'John Senden', 'David     Hearn', 'Keith Clearwater', 'Maverick McNealy', 'J. J. Henry'}


In [32]:
#search for player within field
df_predictions.loc[df_predictions['Player'] == "Brooks Koepka"]

Unnamed: 0,Player,SGoffTheTee,SGteeToGreen,Par3 Scoring_Avergage,Par4 Scoring_Avergage,Ball Striking,Driving Accuracy,Total Putting,Scrambling,SG:AroundTheGreen,Rough Proximity,Birdie/Better_125-150,Total


In [33]:
#search for player outside of field
df_stats.loc[df_stats['Player'] == "Ted Potter, Jr."]

Unnamed: 0,Player,SGoffTheTee,SGteeToGreen,Par3 Scoring_Avergage,Par4 Scoring_Avergage,Ball Striking,Driving Accuracy,Total Putting,Scrambling,SG:AroundTheGreen,Rough Proximity,Birdie/Better_125-150,Total
115,"Ted Potter, Jr.",116,103,177,148,471,117,242,138,24,36,70,1642


In [34]:
#save as csv
df_predictions.to_csv( "predictions.csv")


In [35]:
df_predictions.reset_index(drop=True)

Unnamed: 0,Player,SGoffTheTee,SGteeToGreen,Par3 Scoring_Avergage,Par4 Scoring_Avergage,Ball Striking,Driving Accuracy,Total Putting,Scrambling,SG:AroundTheGreen,Rough Proximity,Birdie/Better_125-150,Total
0,Emiliano Grillo,28,26,2,12,33,51,28,78,121,196,1,576
1,Chez Reavie,45,39,20,11,135,9,100,50,141,72,8,630
2,Justin Rose,15,7,90,2,21,231,150,46,47,26,29,664
3,Kevin Streelman,25,23,115,18,12,54,326,28,67,3,115,786
4,Jordan Spieth,18,2,38,8,60,300,374,12,8,8,5,833
5,Rickie Fowler,69,38,52,7,81,189,180,14,35,160,23,848
6,Chris Kirk,94,29,26,45,207,174,204,20,38,62,63,962
7,Austin Cook,75,84,5,94,243,126,26,96,43,28,166,986
8,Brian Harman,50,78,137,14,126,63,134,222,128,5,30,987
9,Bryson DeChambeau,12,11,105,21,27,243,222,162,95,44,66,1008


In [36]:
df_predictions.loc[df_predictions['Player'] == "Kevin Na"]

Unnamed: 0,Player,SGoffTheTee,SGteeToGreen,Par3 Scoring_Avergage,Par4 Scoring_Avergage,Ball Striking,Driving Accuracy,Total Putting,Scrambling,SG:AroundTheGreen,Rough Proximity,Birdie/Better_125-150,Total
64,Kevin Na,183,40,9,66,408,177,176,18,1,178,98,1354


In [37]:
#field strength
def scrape_owgr(url):
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'lxml')
  ranks = []
  for item in soup.findAll("td", "name"):
      ranks.append(item.text)
  return ranks

In [38]:
owgr = scrape_owgr("http://www.owgr.com/ranking")
columns = ['Player']
df_owgr = pd.DataFrame(owgr,columns=columns)

df_owgr.insert(1, 'Rank', range(1, 1 + len(df_owgr)))
df_owgr_field = pd.concat([df_owgr.set_index('Player'),field.set_index('Player')], axis=1, join='inner').reset_index()
#add all the ranks
Total = df_owgr_field['Rank'].sum()
avgRank = Total / len(df_owgr_field.index)
#print the avg rank for the field
print(avgRank.round())
#shows some level of strength of field, could be good to see whether players perform if other good players playing
#only considers top 100 ranked players though

49.0
