 # TODO 3: May need to do some data cleaning - reference tutorial
 # TODO 4: Relook at algorithm runtime

# Setup

In [79]:
# Required libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Mount Google Drive at /drive so the export cell at the end of the notebook
# can write the CSV under "/drive/My Drive/"
from google.colab import drive
drive.mount("/drive")

# Configuration of script - Change these
# GOOGLE_FOLDER_PATH: folder inside "My Drive" that the CSV is written to
# CSV_FILE_NAME: output file name without the ".csv" extension (added on export)
# SEASON_YEAR: year used in the schedule URLs and to pick the list of months to scrape
GOOGLE_FOLDER_PATH = "temp_data"
CSV_FILE_NAME = "nba_data_2018"
SEASON_YEAR = 2018 # e.g. 2018 will give schedule for 2017-2018

Mounted at /drive


# Define constants

In [76]:
# Maps the full team name read from a schedule row to the team abbreviation
# that is interpolated into the box score div class names ("box-{abbr}-{timeframe}").
NBA_NAMES = {
    "Atlanta Hawks": "ATL",
    "Boston Celtics": "BOS",
    "Cleveland Cavaliers": "CLE",
    "Chicago Bulls": "CHI",
    "Golden State Warriors": "GSW",
    "Los Angeles Clippers": "LAC",
    "Charlotte Hornets": "CHO", # Basketball Reference lists Charlotte as CHO (not CHA)
    "Dallas Mavericks": "DAL",
    "Denver Nuggets": "DEN",
    "Detroit Pistons": "DET",
    "Houston Rockets": "HOU",
    "Los Angeles Lakers": "LAL",
    "Indiana Pacers": "IND",
    "Minnesota Timberwolves": "MIN",
    "Milwaukee Bucks": "MIL",
    "Memphis Grizzlies": "MEM",
    "Miami Heat": "MIA",
    "New York Knicks": "NYK",
    "New Orleans Pelicans": "NOP",
    "Philadelphia 76ers": "PHI",
    "Oklahoma City Thunder": "OKC",
    "Orlando Magic": "ORL",
    "Portland Trail Blazers": "POR",
    "Sacramento Kings": "SAC",
    "Toronto Raptors": "TOR",
    "San Antonio Spurs": "SAS",
    "Brooklyn Nets": "BRK", # Basketball Reference tag; another inconsistency with the usual abbreviation
    "Utah Jazz": "UTA",
    "Phoenix Suns": "PHO", # Alternative is PHX
    "Washington Wizards": "WAS" # Alternative is WSH
}

# Month slugs used in the schedule URL (".../NBA_{SEASON_YEAR}_games-{month}.html")
# for a season with the usual October-June calendar.
MONTHS = ["october", "november", "december", "january", "february", "march", "april", "may", "june"]

# 2019-2020 had two Octobers, so those pages carry a year suffix; there are no
# April-June pages and the season continues from July onwards.
MONTHS_2019_2020 = ["october-2019"] + MONTHS[1:6] + ["july", "august", "september", "october-2020"]

# 2020-2021 started in December and its Finals were played in July 2021,
# so "july" has to be scraped as well or the last games are silently missed.
MONTHS_2020_2021 = MONTHS[2:] + ["july"]

# Seasons whose calendar differs from the default; every other season uses MONTHS.
_MONTHS_BY_SEASON = {
  2020: MONTHS_2019_2020,
  2021: MONTHS_2020_2021,
}

months_played = _MONTHS_BY_SEASON.get(SEASON_YEAR, MONTHS)

# Column names of the exported dataframe: one label column followed by the
# box score stat columns, in the order they are scraped.
GAME_BOXSCORE_HEADERS = [
  'Team Name/Date played/Timeframe',
  'MP',
  'FG', 'FGA', 'FG%',
  '3P', '3PA', '3P%',
  'FT', 'FTA', 'FT%',
  'ORB', 'DRB', 'TRB',
  'AST', 'STL', 'BLK', 'TOV', 'PF',
  'PTS',
  '+/-',
]

# Timeframe suffixes scraped for every game: four quarters, then two halves.
TIMEFRAMES = [f"q{quarter}" for quarter in range(1, 5)] + [f"h{half}" for half in range(1, 3)]

# Helper functions

In [81]:
# @return: List of 1-D numpy string arrays, one per entry of team_names (in the same order).
#          Each array is a "<team> <date> <TIMEFRAME>" label followed by the text of
#          every cell in that team's totals (tfoot) row for the specified timeframe
# @param{tree}: The html of the game page
# @param{team_names}: Team abbreviations as they appear in the box score div classes
# @param{date_played}: String of date played in human readable format
# @param{timeframe}: String of the timeframe to extract row data from (e.g. "q1", "h2")
# @raises{IndexError}: If the page has no box score table for a team/timeframe

def retrieve_data_for_game(tree, team_names, date_played, timeframe):
  game_data_row = []
  for name in team_names:
    # The div is matched on its complete class string, so the order of the classes matters
    box_class = f"section_wrapper toggleable box-{name} box-{name}-{timeframe}"
    team_tables = tree.find_all("div", {"class": box_class})
    if not team_tables:
      # Same exception type as indexing an empty result, but says what was missing
      raise IndexError(f"No box score table with class '{box_class}' found for game on {date_played}")

    team_total_cells = team_tables[0].find_all("tfoot")[0].find_all("td")
    row_label = f"{name} {date_played} {timeframe.upper()}"
    team_timeframe_data = [row_label] + [td.get_text() for td in team_total_cells]

    game_data_row.append(np.array(team_timeframe_data))

  return game_data_row

In [22]:
# @return: Dict with the string pieces ("year", "month", "day", "home_tag") needed to build the game url
# @param{game_soup}: The html object of the game row
def extract_values_for_game_url(game_soup):
  # The "csk" attribute of the row's first header cell holds the date digits
  # followed by one extra character and then the home team tag
  # (presumably shaped like "201710180DET" - confirm against a schedule page)
  sort_key = game_soup.findAll("th")[0]["csk"]

  url_parts = {}
  url_parts["year"] = sort_key[:4]
  url_parts["month"] = sort_key[4:6]
  url_parts["day"] = sort_key[6:8]
  # Index 8 is skipped on purpose; the tag starts at index 9
  url_parts["home_tag"] = sort_key[9:]
  return url_parts

In [23]:
# @return: Dict with the readable "date" of the game plus the "away_name" and
#          "home_name" abbreviations looked up in NBA_NAMES
# @param{game_soup}: The html object of the game row
def extract_values_for_game_scrape(game_soup):
  date_cell = game_soup.findAll("th")[0]
  data_cells = game_soup.findAll("td")

  # The second data cell is read as the away team and the fourth as the home team
  away_full_name = data_cells[1].getText()
  home_full_name = data_cells[3].getText()

  scrape_values = {"date": date_cell.getText()}
  scrape_values["away_name"] = NBA_NAMES[away_full_name]
  scrape_values["home_name"] = NBA_NAMES[home_full_name]
  return scrape_values

# Scraping basketball-reference

### First, we extract all html rows for games that have been played into a list season_games.

In [37]:
# Collect every row of the first schedule table of each month page
all_schedule_rows = []

for month in months_played:
  schedule_url = f"https://www.basketball-reference.com/leagues/NBA_{SEASON_YEAR}_games-{month}.html"
  # Close the HTTP response as soon as the page has been parsed
  with urlopen(schedule_url) as html_schedule:
    soup_schedule = BeautifulSoup(html_schedule, features="lxml")

  all_schedule_rows += soup_schedule.findAll("tbody")[0]("tr")

# Drop the "Playoffs" divider row(s). A new list is built instead of deleting
# while indexing, so every row (including the last one) is checked and any
# number of divider rows can be removed safely.
season_games = []
for schedule_row in all_schedule_rows:
  row_title = schedule_row.find("th").getText()
  if row_title == "Playoffs":
    print(row_title + " removed")
  else:
    season_games.append(schedule_row)

print(f"{len(season_games)} games retrieved.")

Playoffs removed


### Main logic below. For each table row representing a game, we construct the URL of that game's box score page and then scrape it for team totals. Each game yields 12 rows of data: 6 timeframes for each of the two teams.

Each row will be appended to the dataframe.

In [None]:
results = []

# Loop through all games played in season
# Runtime: (# of games)(6 timeframes)(2 teams), with one page download per game
# NOTE(review): this issues one request per game back to back; confirm the site
# tolerates that request rate before running a full season.
for game_html_row in season_games:
  # Extract values for dynamic URL construction
  game_url_metadata = extract_values_for_game_url(game_html_row)

  year = game_url_metadata["year"]
  month = game_url_metadata["month"]
  day = game_url_metadata["day"]
  home_tag = game_url_metadata["home_tag"]

  # Setup game url and parse the page; the response is closed once parsed
  game_url = f"https://www.basketball-reference.com/boxscores/{year}{month}{day}0{home_tag}.html"
  with urlopen(game_url) as html_game:
    soup_game = BeautifulSoup(html_game, features="lxml")

  # Extract date and team names from game row
  game_scrape_metadata = extract_values_for_game_scrape(game_html_row)
  team_names = [game_scrape_metadata["away_name"], game_scrape_metadata["home_name"]]
  date_played = game_scrape_metadata["date"]

  # Two rows (away, home) are added per timeframe
  for timeframe in TIMEFRAMES:
    rows_timeframe = retrieve_data_for_game(soup_game, team_names, date_played, timeframe)
    results += rows_timeframe

# f-string instead of int + str, which raised TypeError after the whole scrape
print(f"{len(results)} rows generated")

# Finally, export the results as a dataframe to a CSV file.

In [80]:
# Build the dataframe from the scraped rows, one column per box score header
data_export = pd.DataFrame(results, columns = GAME_BOXSCORE_HEADERS)
data_export.head()

# Resolve the destination inside the mounted drive from the script configuration
export_path_template = r"/drive/My Drive/{google_folder_path}/{csv_file_name}.csv"
export_path = export_path_template.format(
  google_folder_path=GOOGLE_FOLDER_PATH,
  csv_file_name=CSV_FILE_NAME,
)
data_export.to_csv(export_path, index=False, encoding="utf-8")

# Ignore - Cell below is for manual debugging of pages. Uncomment if needed.

In [68]:
# # Open URL
# hardcode_page = "https://www.basketball-reference.com/boxscores/201710180DET.html"
# html = urlopen(hardcode_page)
# soup = BeautifulSoup(html, features="lxml")

# # Extract team names and date
# game_metadata = soup.findAll("h1")[0].getText().split(" at ")
# team1_name = NBA_NAMES[game_metadata[0]]
# team2_name = NBA_NAMES[game_metadata[1].split(" Box")[0]]
# date = ', '.join(game_metadata[1].split(", ")[1:])

# # Extract headers
# headers = [th.getText() for th in soup.findAll("tr", limit=2)[1]("th")]
# headers[0] = "Team Name, Date played"

# def retrieve_data_for_game_temp(tree, team_names, date_played):
#   final_rows = []
#   for name in team_names:
#     team_table = tree.findAll("div", {"class": f"section_wrapper toggleable box-{name} box-{name}-q1"}) 
#     team_total_row = team_table[0]("tfoot")[0]("td")
#     team_q1_data = [name + " " + date_played] + [td.getText() for td in team_total_row]

#     final_rows.append(np.array(team_q1_data)[np.newaxis][0])
  
#   return final_rows

# # team_table = soup.findAll("div", {"class": f"section_wrapper toggleable box-CHA box-CHA-q1"})
# # print(team_table) 
# data_for_one_game = retrieve_data_for_game_temp(soup, [team1_name, team2_name], date) # TODO: Temporary output  
# print(data_for_one_game)

[array(['CHO October 18, 2017', '60', '7', '16', '.438', '2', '6', '.333',
       '11', '12', '.917', '1', '11', '12', '4', '0', '1', '4', '3', '27',
       ''], dtype='<U20'), array(['DET October 18, 2017', '60', '13', '25', '.520', '2', '5', '.400',
       '1', '2', '.500', '0', '8', '8', '7', '4', '0', '1', '7', '29', ''],
      dtype='<U20')]
