# nba-rating

In [1]:
import random
import time
import warnings
from urllib.parse import urljoin, urlsplit
from urllib.robotparser import RobotFileParser

import bs4
import numpy as np
import pandas as pd
import requests

In [2]:
def allowed_by_robots_txt(url):
    """
    Returns a boolean value representing if a url is allowed
    to be scraped, according to the site's robots.txt
    ---
    url: string representing url to scrape

    The decision is made for the wildcard user agent ('*').

    Raises ValueError when url has no scheme or host, and whatever
    requests raises when robots.txt cannot be downloaded (including
    an HTTPError for a non-success status).
    """
    # Build "<scheme>://<host>/robots.txt" from the parsed url instead of
    # splitting on "/", which breaks on urls that lack a scheme.
    parts = urlsplit(url)
    if not parts.scheme or not parts.netloc:
        raise ValueError(f"Cannot locate robots.txt for malformed url: {url!r}")
    robots_txt_url = f"{parts.scheme}://{parts.netloc}/robots.txt"

    # A timeout keeps an unresponsive server from hanging the caller forever.
    response = requests.get(robots_txt_url, timeout=10)
    response.raise_for_status()

    # Let the standard-library parser apply the rules: it matches on the
    # url path, treats "Disallow: /" as "everything is blocked", treats an
    # empty "Disallow:" as "nothing is blocked", and keeps each User-agent
    # group separate.
    parser = RobotFileParser()
    parser.parse(response.text.splitlines())
    return parser.can_fetch('*', url)

In [3]:
def get_data(url):
    """
    Returns a DataFrame with one row per game found in the schedule
    table of the page at url
    ---
    url: string representing url of the schedule page to scrape

    Columns: Date, Start Time (ET), Visitor, Visitor Points, Home,
    Home Points, Box Score, Overtime, Attendance, Arena, Notes.
    All values are the raw cell text; 'Box Score' is an absolute url,
    or None when the row has no box-score link.

    Table rows that lack a header cell or have fewer than ten data
    cells are skipped rather than failing the whole page.

    Raises Exception when the response status is not 200 and
    ValueError when the schedule table cannot be located.
    """
    base_url = 'https://www.basketball-reference.com/'
    columns = ['Date',
               'Start Time (ET)',
               'Visitor',
               'Visitor Points',
               'Home',
               'Home Points',
               'Box Score',
               'Overtime',
               'Attendance',
               'Arena',
               'Notes']

    # Make request to site; the timeout avoids hanging on a stalled server
    response = requests.get(url, timeout=30)

    # Check to see if response was successful
    if response.status_code != 200:
        raise Exception(f"Error: Unable to fetch content. Status code: {response.status_code}.")

    # Walk down to the schedule table body one container at a time so a
    # missing element is reported by name instead of as a NoneType error.
    node = bs4.BeautifulSoup(response.text, features='lxml')
    path = (('body', {'class': 'bbr'}),
            ('div', {'id': 'wrap'}),
            ('div', {'id': 'content'}),
            ('div', {'id': 'all_schedule'}),
            ('div', {'id': 'div_schedule'}),
            ('tbody', {}))
    for tag, attrs in path:
        node = node.find(tag, attrs)
        if node is None:
            raise ValueError(f"Schedule table not found at {url}: no <{tag}> matching {attrs}")

    games = []
    for row in node.find_all('tr'):
        # Look the cells up once per row instead of once per column.
        header = row.find('th')
        cells = row.find_all('td')
        if header is None or len(cells) < 10:
            continue

        # urljoin copes with hrefs that do or do not start with "/",
        # so the result never contains a doubled slash after the host.
        link = cells[5].find('a')
        href = link.get('href') if link is not None else None
        box_score = urljoin(base_url, href) if href else None

        games.append([header.text,
                      cells[0].text,
                      cells[1].text,
                      cells[2].text,
                      cells[3].text,
                      cells[4].text,
                      box_score,
                      cells[6].text,
                      cells[7].text,
                      cells[8].text,
                      cells[9].text])

    # Passing columns explicitly keeps the headers even when no rows matched.
    return pd.DataFrame(games, columns=columns)

In [4]:
def nba_season(year):
    """
    Returns a DataFrame holding every monthly schedule page that could
    be scraped for one season (october through june)
    ---
    year: value substituted into the 'NBA_{year}_games-{month}' page name

    A month whose page cannot be fetched or parsed is skipped and a
    warning naming the url and the error is issued. When no month
    succeeds an empty DataFrame is returned. Row labels of the monthly
    frames are kept unchanged, so the index can contain repeated labels.
    """
    months = ['october', 'november', 'december', 'january', 'february', 'march', 'april', 'may', 'june']

    frames = []
    for month in months:
        url = f'https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html'
        try:
            frames.append(get_data(url))
        except Exception as err:
            # Stay best-effort per month, but say what went wrong instead
            # of silently returning less data; KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            warnings.warn(f"Skipping {url}: {err!r}")

    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end rather than re-copying on every month.
    return pd.concat(frames)

In [9]:
# Scrape the NBA_2023 monthly schedule pages and display the result
# (the year is passed as a string here; it is only formatted into the url).
nba_season('2023')

Empty DataFrame
Columns: []
Index: []


Considerations:
* blowout games
* hidden MMR


In [15]:
# Accumulate the seasons 2013 through 2023 into a single frame.
df = pd.DataFrame()
for i in range(2013, 2024):
    # Wait before each season's batch of requests
    # (presumably to limit the request rate -- confirm the required delay).
    time.sleep(25)
    temp_df = nba_season(i)
    # An empty season means nothing could be scraped; stop instead of
    # continuing to send requests.
    if temp_df.empty:
        break
    # Reuse the frame fetched above rather than scraping the season again.
    df = pd.concat([df, temp_df])

In [16]:
# Display the frame accumulated by the season loop in the previous cell.
df