In [1]:
#import packages required
import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
import numpy as np
from functools import reduce

In [2]:
#put all urls for the stats in here
urls = [
    "https://www.pgatour.com/stats/stat.101.html", 
    "https://www.pgatour.com/stats/stat.02567.html", 
    "https://www.pgatour.com/stats/stat.102.html",
    "https://www.pgatour.com/stats/stat.02674.html"
]

#these are the names of the stats, just copy and paste these into columnNames below
stats = ['driving_dist', 'SGoffTheTee', 'drivingAccuracy', 'SGteeToGreen']
#weights need to be in the correct order for the stats above
weights = [10,20,30,40]

columnNames = ['Player','driving_dist', 'SGoffTheTee', 'drivingAccuracy', 'SGteeToGreen']

In [3]:
#function to call the players name and associated rank for each stat
def scrape_money_leaders(url):
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'lxml')
  money = []
  for item in soup.findAll("td", "player-name"):
      money.append(item.text)
  return money

In [4]:
#empty array dfs which will store a dataframe for each stat
dfs = []

for i in range(0,len(urls)):
    stats[i] = scrape_money_leaders(urls[i])
    columns = ['Player']
    stats[i] = pd.DataFrame(stats[i],columns=columns)
    stats[i].insert(1, 'Rank', range(1, 1 + len(stats[i])))
    dfs.append(stats[i])

In [5]:
#then merge each of the dataframes on the equality of the player name
#this means each player will have their name, then their associcated stats in one dataframe
df_final = reduce(lambda left,right: pd.merge(left,right,on='Player'), dfs)
df_final.columns = columnNames
df_final.head()

Unnamed: 0,Player,driving_dist,SGoffTheTee,drivingAccuracy,SGteeToGreen
0,\nTrey Mullinax\n,1,11,178,114
1,\nTony Finau\n,2,36,196,19
2,\nLuke List\n,3,3,177,4
3,\nRory McIlroy\n,4,29,166,39
4,\nBubba Watson\n,5,6,120,47


In [6]:
#this loops throguh each feature and multiplies by by the relevant weight
for i in range(1, len(columnNames)-1):
    df_final[columnNames[i]] = df_final[columnNames[i]]*weights[i-1]
df_final.head()

Unnamed: 0,Player,driving_dist,SGoffTheTee,drivingAccuracy,SGteeToGreen
0,\nTrey Mullinax\n,10,220,5340,114
1,\nTony Finau\n,20,720,5880,19
2,\nLuke List\n,30,60,5310,4
3,\nRory McIlroy\n,40,580,4980,39
4,\nBubba Watson\n,50,120,3600,47


In [7]:
#add the contents of each column to give a total for each player
df_final['Total'] = df_final.sum(axis=1)
df_final.head()

Unnamed: 0,Player,driving_dist,SGoffTheTee,drivingAccuracy,SGteeToGreen,Total
0,\nTrey Mullinax\n,10,220,5340,114,5684
1,\nTony Finau\n,20,720,5880,19,6639
2,\nLuke List\n,30,60,5310,4,5404
3,\nRory McIlroy\n,40,580,4980,39,5639
4,\nBubba Watson\n,50,120,3600,47,3817


In [8]:
#sorts these totals so the lowest totals are at the top
df_predictions = df_final.sort_values(by='Total', ascending=True)
df_predictions.head()

Unnamed: 0,Player,driving_dist,SGoffTheTee,drivingAccuracy,SGteeToGreen,Total
33,\nTommy Fleetwood\n,340,200,750,8,1298
113,\nKyle Stanley\n,1140,520,120,71,1851
99,\nEmiliano Grillo\n,1000,500,390,25,1915
102,\nKevin Streelman\n,1030,440,480,15,1965
44,\nKeegan Bradley\n,450,1080,600,9,2139
