# Fantasy Basketball Point Predictions

In this project we will web-scrape data from basketball-reference and use it to build an ML model that predicts stats for basketball players.
We will try a model that uses information from previous years, as well as where each player is playing now, to make more accurate predictions of what his performance in the coming year will be.

In [22]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

def source_per_year(years, url_tail, timeout=30):
    """
    years list of years
    url_tail string appended to the per-season base url, e.g. "_per_game.html"
    timeout seconds to wait for the server before a request is abandoned

    This function will access basketball-reference website and get back the response for every year in our list of years.
    A timeout is always passed because requests waits indefinitely by default, so a single stalled connection would otherwise hang the whole scrape.

    returns dictionary with years as keys and the requests response of that year as data (the html is in its .text attribute).
    """
    base_url = "https://www.basketball-reference.com/leagues/NBA_"
    return {
        year: requests.get(base_url + str(year) + url_tail, timeout=timeout)
        for year in years
    }

"""
For Player Data
"""
# We use separate functions for player and team data because the web pages holding this information are quite different.

def generate_table(year_dict):
    """
    year_dict dictionary with years as keys and responses (html in their .text attribute) as data.

    This function will use the data gathered by source_per_year to create one pandas dataframe per year.
    Every <tr class="full_table"> row becomes one dataframe row, and every <td> cell of that row is stored under the column named by its "data-stat" attribute.

    The rows are collected first and the dataframe is built once per year, because DataFrame.append was removed in pandas 2.0 and appending row by row copies the whole frame each time.
    A year whose page has no matching rows gives an empty dataframe.

    returns dictionary with years as keys and the pandas dataframe of that season as data.
    """
    tables = {}
    for year, response in year_dict.items():
        soup = BeautifulSoup(response.text, "html.parser")
        records = [
            {cell["data-stat"]: cell.text for cell in row.find_all("td")}
            for row in soup.find_all("tr", class_="full_table")
        ]
        # columns come out in the order the data-stat names are first seen
        tables[year] = pd.DataFrame(records)
    return tables

def webscrap_to_csv(years, stats_url_tails):
    """
    years list of years we want to search
    stats_url_tails dictionary where keys are stats we want e.g. player per game, per 36 minutes, etc. and items are url tails on the website

    This function combines the previous functions to webscrap the data we want given the stats and years we are interested
    and saves every table as NBA_data/NBA_<year>_<stat>.csv. The NBA_data folder is created if it does not exist yet.

    returns tables a nested dictionary that has as keys the stat we are interested on and then has dictionary having years and the table of data of that given year and stat.
    """
    import os

    # created up front so a missing folder fails before any scraping is done
    os.makedirs("NBA_data", exist_ok=True)

    tables = {}
    for stat, url_tail in stats_url_tails.items():
        tables[stat] = generate_table(source_per_year(years, url_tail))

    for stat, tables_by_year in tables.items():
        for year, table in tables_by_year.items():
            table.to_csv("NBA_data/NBA_"+str(year)+"_"+stat+".csv")
    return tables

"""
For Team Data
"""

def generate_table_teams(year_dict):
    """
    year_dict dictionary with years as keys and responses (html in their .text attribute) as data.

    This function will use the data gathered by source_per_year to create pandas dataframes out of every
    <div class="overthrow table_container"> found on the page of each year.
    Every <tr> of such a div becomes one dataframe row, and every <td> cell that has a "data-stat" attribute is stored under the column of that name. Cells without the attribute are skipped, and a row without any <td> cell gives an empty row.

    Each year gets its own dictionary of tables. A table is stored under the text of its <caption>, or under a counter (1, 2, ...) that restarts every year when it has no caption.

    returns dictionary with years as keys and, for each year, a dictionary with table names as keys and pandas dataframes as data.
    """
    from bs4 import Comment

    dict_of_tables = {}
    for year, response in year_dict.items():
        soup = BeautifulSoup(response.text, "html.parser")
        # parse the content of every html comment and put it back in its place,
        # so that markup inside comments is included in the search below
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.replace_with(BeautifulSoup(comment, "html.parser"))

        # fresh per year: a dictionary shared by all years would make every year point at the same tables
        tables = {}
        unnamed_tables = 0
        for html_table in soup.find_all("div", class_="overthrow table_container"):
            records = [
                {
                    cell["data-stat"]: cell.text
                    for cell in row.find_all("td")
                    if cell.has_attr("data-stat")
                }
                for row in html_table.find_all("tr")
            ]
            # built once per table, DataFrame.append was removed in pandas 2.0
            table = pd.DataFrame(records)

            caption = html_table.find("caption")
            if caption is not None:
                tables[caption.text] = table
            else:
                unnamed_tables += 1
                tables[unnamed_tables] = table
        dict_of_tables[year] = tables
    return dict_of_tables

def webscrap_to_csv_teams(years, stats_url_tail, stats_filename):
    """
    years list of years we want to search
    stats_url_tail url tail of the page of each year on the website
    stats_filename dictionary where keys are the names of the tables we want to save, as returned by generate_table_teams, and items are the names to use for them in the csv filenames

    This function combines the previous functions to webscrap the team tables of the years we are interested
    and saves every table named in stats_filename as NBA_data/NBA_<year>_<filename>.csv. The NBA_data folder is created if it does not exist yet.

    returns tables a nested dictionary that has years as keys and then has dictionary having table names and the table of data of that given year. Tables that were not saved are returned too.
    """
    import os

    # created up front so a missing folder fails before any scraping is done
    os.makedirs("NBA_data", exist_ok=True)

    tables = generate_table_teams(source_per_year(years, stats_url_tail))
    for year, tables_by_name in tables.items():
        for name, table in tables_by_name.items():
            if name in stats_filename:
                table.to_csv("NBA_data/NBA_"+str(year)+"_"+stats_filename[name]+".csv")
    return tables


In [24]:
# Run the scraping functions defined above for the selected years: each call
# returns the pandas dataframes and also saves them in csv format.

# Players
years = range(2000, 2022)
stats_url_tails = {
    "player_per_game": "_per_game.html",
    "player_per_36_min": "_per_minute.html",
    "per_100_possesions": "_per_poss.html",
}
tables = webscrap_to_csv(years, stats_url_tails)

# Teams
stats_filename = {
    "Team Per Game Stats Table": "team_per_game",
    "Team Per 100 Poss Stats Table": "team_per_100_possesions",
}
stats_url_tail = ".html#team-stats-per_game::none"
tables_teams = webscrap_to_csv_teams(years, stats_url_tail, stats_filename)

In [14]:
tables.keys()

dict_keys(['player_per_game', 'player_per_36_min', 'per_100_possesions'])

In [3]:
tables["player_per_game"][2020].head()

Unnamed: 0,player,pos,age,team_id,g,gs,mp_per_g,fg_per_g,fga_per_g,fg_pct,...,ft_pct,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g
0,Steven Adams,C,26,OKC,63,63,26.7,4.5,7.6,0.592,...,0.582,3.3,6.0,9.3,2.3,0.8,1.1,1.5,1.9,10.9
1,Bam Adebayo,PF,22,MIA,72,72,33.6,6.1,11.0,0.557,...,0.691,2.4,7.8,10.2,5.1,1.1,1.3,2.8,2.5,15.9
2,LaMarcus Aldridge,C,34,SAS,53,53,33.1,7.4,15.0,0.493,...,0.827,1.9,5.5,7.4,2.4,0.7,1.6,1.4,2.4,18.9
3,Kyle Alexander,C,23,MIA,2,0,6.5,0.5,1.0,0.5,...,,1.0,0.5,1.5,0.0,0.0,0.0,0.5,0.5,1.0
4,Nickeil Alexander-Walker,SG,21,NOP,47,1,12.6,2.1,5.7,0.368,...,0.676,0.2,1.6,1.8,1.9,0.4,0.2,1.1,1.2,5.7


In [4]:
tables_teams[2020]["Team Per Game Stats Table"].head()

KeyError: 'Team Per Game Stats Table'

## Machine Learning Tests

We will next run some tests and models on the data we obtained. One hypothesis I think would be useful to test is whether team averages are easier or more accurate to predict than player averages. Another is whether it is easier to predict based on averages per game, per minute or per possession.

The idea would then be to use models to predict these stats and then use these together with other models to predict any "missing" information for our task that would be fantasy point predictions.

We will do this in another notebook that uses the files we already downloaded; we do not need to keep web-scraping because the data does not change.