-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_player_stats.py
37 lines (29 loc) · 1.17 KB
/
scrape_player_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import sys
def get_player_stats(year):
    """Scrape one season of NBA player totals from Basketball Reference.

    Downloads the ``NBA_<year>_totals.html`` league page, takes the column
    names from the first ``<tr>`` element of the document and builds one
    record per following ``<tr>`` from the text of its ``<td>`` cells.

    Args:
        year: Season identifier substituted into the page URL. The script
            entry point passes the raw command-line string.

    Returns:
        A ``pandas.DataFrame`` whose columns are the header texts without
        the first one, and whose rows hold the scraped cell texts as
        strings. Rows that contain no ``<td>`` cells are skipped.

    Raises:
        IndexError: If the downloaded page contains no ``<tr>`` element.
        urllib.error.URLError: If the page cannot be fetched.
    """
    # URL of the page we will be scraping.
    url = "https://www.basketball-reference.com/leagues/NBA_{}_totals.html".format(year)

    # Parse inside the ``with`` block so the HTTP response is always closed,
    # even when parsing fails.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, features="html.parser")

    # Search the document for rows once; the first row carries the headers.
    table_rows = soup.find_all('tr')
    headers = [th.get_text() for th in table_rows[0].find_all('th')]
    # Exclude the first column: the ranking order from Basketball Reference
    # is not needed for the analysis (data rows are built from <td> only).
    headers = headers[1:]

    # A row without any <td> cell (for example a header row repeated inside
    # the table) would otherwise turn into an all-empty DataFrame row.
    player_stats = []
    for row in table_rows[1:]:
        cells = [td.get_text() for td in row.find_all('td')]
        if cells:
            player_stats.append(cells)

    return pd.DataFrame(player_stats, columns=headers)
# Script entry: expects exactly one command-line argument, the season year.
# Anything else is rejected up front; otherwise the scraped table is written
# to ./player_stats_<year>.csv without the DataFrame index column.
if len(sys.argv) != 2:
    raise Exception('Year is not in arguments')
get_player_stats(sys.argv[1]).to_csv(
    './player_stats_{}.csv'.format(sys.argv[1]),
    index=False,
)