In [100]:
import datetime
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests as rqs
from bs4 import BeautifulSoup
from scipy import integrate
from scipy.special import erf
from sklearn.linear_model import LinearRegression

In [4]:
def single_team(df):
    """Reduce one player's rows to a single row.

    A frame with exactly one row is returned unchanged (the same object).
    Otherwise the row(s) whose 'Tm' is 'TOT' are copied and their 'Tm' is
    overwritten with the 'Tm' value of the last row of ``df``.
    """
    if len(df) != 1:
        # Several rows: keep the 'TOT' line, relabelled with the team of the last row
        last_team = str(df['Tm'].iloc[-1])
        totals = df.loc[df['Tm'] == 'TOT'].copy()
        totals['Tm'] = [last_team]
        return totals

    # Only one row for this player: nothing to collapse
    return df

def _stat_to_int(value):
    """Return ``value`` as an int, or None when it is not a finite number."""
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        # Status text ('Inactive', ...), None or NaN/inf: not a usable stat
        return None


def SoRareScore(df):
    """Compute one score per row of a game log.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'PTS', 'TRB', 'AST', 'BLK', 'STL', 'TOV'
        and '3P'. Values may be numbers or numeric strings.

    Returns
    -------
    list
        One entry per row, in row order: the numeric score, or the string
        'dnp' when any of the seven stats is not numeric (status text such
        as 'Inactive' or 'Did Not Play', or a missing value).

    Notes
    -----
    Score = PTS + 1.2*TRB + 1.5*AST + 3*BLK + 3*STL + 3P - 2*TOV, plus a bonus
    of 1 when exactly two of PTS/TRB/AST/BLK/STL are >= 10 and a bonus of 2
    when three or more are.
    """
    stat_columns = ['PTS', 'TRB', 'AST', 'BLK', 'STL', 'TOV', '3P']

    scores = []
    for raw_stats in zip(*(df[column] for column in stat_columns)):
        stats = [_stat_to_int(value) for value in raw_stats]

        # Any non-numeric stat marks the row as "did not play". This replaces a
        # hard-coded list of status strings, so unlisted statuses and NaN no
        # longer crash the int() conversion.
        if any(stat is None for stat in stats):
            scores.append('dnp')
            continue

        pts, reb, ast, blk, stl, tov, fg_3 = stats
        score = 1 * pts + 1.2 * reb + 1.5 * ast + 3 * blk + 3 * stl + 1 * fg_3 - 2 * tov

        # Count double-digit stats
        doubles = sum(stat >= 10 for stat in (pts, reb, ast, blk, stl))
        if doubles == 2:  # Double-double
            score += 1
        elif doubles >= 3:  # Triple-double
            score += 2

        scores.append(score)
    return scores

def mins_played(str) :
    """Convert a 'minutes:seconds' string into minutes as a float.

    The argument must contain exactly one ':'; the part before it is taken as
    minutes and the part after it as seconds. Note that the parameter name
    shadows the builtin ``str`` inside this function.
    """
    minutes, seconds = str.split(':')
    whole_minutes = float(minutes)
    fraction_of_minute = float(seconds) / 60
    return whole_minutes + fraction_of_minute

In [5]:
# Build `data_live`: one row per player (name, team, points per game) together
# with the player's basketball-reference href, sorted by points per game.
url = 'https://www.basketball-reference.com/leagues/NBA_2023_per_game.html'
page = rqs.get(url)
page.raise_for_status()  # stop on HTTP errors instead of parsing an error page
soup = BeautifulSoup(page.content, 'html')

# Remove every <tr class="thead"> row so it does not end up as a data row
for header_row in soup.find_all('tr', class_ = 'thead'):
    header_row.decompose()

table = soup.find('table')
# read_html expects a file-like object; passing literal HTML is deprecated
data_live = pd.read_html(StringIO(str(table)))[0][['Player', 'Tm', 'PTS']]

# Player links in document order, without their last five characters
# (the '.html' suffix, judging by the hrefs shown in the cell output)
hrefs = [
    link['href'][:-5]
    for link in table.find_all('a', href = True)
    if link['href'].startswith('/players/')
]
data_live['href'] = hrefs  # raises if the link count does not match the row count

data_live = data_live.groupby('Player').apply(single_team)
data_live = data_live.sort_values('PTS', ascending = False).reset_index(drop = True)

data_live.head(10)

Unnamed: 0,Player,Tm,PTS,href
0,Joel Embiid,PHI,33.3,/players/e/embiijo01
1,Luka Dončić,DAL,33.1,/players/d/doncilu01
2,Shai Gilgeous-Alexander,OKC,31.2,/players/g/gilgesh01
3,Giannis Antetokounmpo,MIL,31.1,/players/a/antetgi01
4,Jayson Tatum,BOS,30.2,/players/t/tatumja01
5,Kevin Durant,BRK,30.0,/players/d/duranke01
6,Stephen Curry,GSW,29.6,/players/c/curryst01
7,Donovan Mitchell,CLE,29.0,/players/m/mitchdo01
8,Damian Lillard,POR,28.3,/players/l/lillada01
9,Anthony Davis,LAL,28.1,/players/d/davisan02


In [6]:
months = ['october', 'november', 'december', 'january', 'february', 'march', 'april']

# Scrape one schedule page per month and stack them into `schedule`
monthly_schedule_list = []
for month in months:
    url = f'https://www.basketball-reference.com/leagues/NBA_2023_games-{month}.html'
    page = rqs.get(url)
    page.raise_for_status()  # stop on HTTP errors instead of parsing an error page
    soup = BeautifulSoup(page.content, 'html')
    for header_row in soup.find_all('tr', class_ = 'thead'):
        header_row.decompose()

    table = soup.find('table')
    # read_html expects a file-like object; passing literal HTML is deprecated
    monthly_schedule = pd.read_html(StringIO(str(table)))[0]

    # Three-letter team codes in document order. Pair them by position among
    # the *team* links only: the first team link of a game goes to `away`, the
    # second to `home`. Using the parity of the index over all anchors (date,
    # team and box-score links) swaps the labels on alternate rows whenever a
    # row has no box-score link, i.e. for games not yet played.
    team_codes = [
        link['href'][7:10]
        for link in table.find_all('a', href = True)
        if link['href'].startswith('/teams/')
    ]
    away_tm, home_tm = team_codes[0::2], team_codes[1::2]

    monthly_schedule = monthly_schedule.assign(home = home_tm, away = away_tm)[['Date', 'home', 'away']]
    monthly_schedule['Date'] = pd.to_datetime(monthly_schedule['Date'], format = '%a, %b %d, %Y')
    monthly_schedule_list.append(monthly_schedule)

schedule = pd.concat(monthly_schedule_list).reset_index(drop = True)

# Calendar mapping every day from 2022-10-21 up to (not including) 2023-05-10
# to a gameweek number. The counter advances after each Thursday and Sunday
# (datetime.weekday(): Monday = 0, so 3 = Thursday and 6 = Sunday).
gw = 1
dates, gws = [], []
date = datetime.datetime(2022, 10, 21)
while date < datetime.datetime(2023, 5, 10):
    dates.append(date)
    gws.append(gw)
    if date.weekday() in [3, 6]:
        gw = gw + 1

    date = date + datetime.timedelta(days = 1)

gameweeks = pd.DataFrame(zip(dates, gws), columns = ['Date', 'Gameweek'])
# Inner merge: games dated outside the calendar above are dropped
schedule = schedule.merge(gameweeks, on = 'Date')

schedule.head()

Unnamed: 0,Date,home,away,Gameweek
0,2022-10-21,CHO,NOP,1
1,2022-10-21,IND,SAS,1
2,2022-10-21,WAS,CHI,1
3,2022-10-21,ATL,ORL,1
4,2022-10-21,BRK,TOR,1


In [171]:
features = ['form_1', 'form_2', 'form_5', 'form_10', 'mean_score', 'Rest']

def my_teams(my_players, gameweek, data_train) :
    """Build form features for the given players and predict their scores.

    For every player of ``my_players`` found in the global ``data_live`` the
    2023 game log is downloaded from basketball-reference, scored with
    ``SoRareScore`` and joined onto the schedule of the team of the player's
    most recent game-log row. A linear regression fitted on ``data_train``
    then predicts a raw score, which is scaled by ``multiplier``.

    Relies on names defined in other cells: ``data_live``, ``schedule``,
    ``SoRareScore`` and ``multiplier`` (all must exist before this is called).

    Parameters
    ----------
    my_players : iterable of str
        Player names, matched against ``data_live['Player']``.
    gameweek : int
        Only schedule rows with ``Gameweek <= gameweek`` are considered.
    data_train : pandas.DataFrame
        Training data containing the ``features`` columns and 'SORARE'.

    Returns
    -------
    pandas.DataFrame
        One row per player with the columns 'Player', 'Cost', 'Std.',
        'Gameweek', 'Against', 'No. of games', the ``features`` columns,
        'pred_raw', 'multiplier' and 'pred'.

    Raises
    ------
    ValueError
        If none of ``my_players`` is present in ``data_live``.
    requests.HTTPError
        If a game-log page cannot be downloaded.
    """
    my_data = data_live[data_live['Player'].isin(my_players)]

    records = []
    for player, href in zip(my_data['Player'], my_data['href']) :

        url = f'https://www.basketball-reference.com{href}/gamelog/2023'

        page = rqs.get(url)
        page.raise_for_status()  # stop on HTTP errors instead of parsing an error page
        soup = BeautifulSoup(page.content, 'html')
        for header_row in soup.find_all('tr', class_ = 'thead'):
            header_row.decompose()

        table = soup.find('table', id = 'pgl_basic')
        # read_html expects a file-like object; passing literal HTML is deprecated
        df = pd.read_html(StringIO(str(table)))[0][['Date', 'Tm', 'G', 'MP', 'PTS', 'TRB', 'AST', 'BLK', 'STL', 'TOV', '3P']]
        df.insert(0, 'Player', player)
        # SoRareScore returns 'dnp' for games not played; coerce those to NaN
        df['SORARE'] = pd.to_numeric(pd.Series(SoRareScore(df), index = df.index), errors = 'coerce')
        df['Date'] = pd.to_datetime(df['Date'], format = '%Y-%m-%d')
        df = df.sort_values('Date')

        # Schedule of the team from the latest game-log row, with the opponent in 'opp'
        tm = df['Tm'].iloc[-1]
        team_games = schedule[(schedule['home'] == tm) | (schedule['away'] == tm)].copy()
        team_games['opp'] = np.where(team_games['home'] == tm, team_games['away'], team_games['home'])
        team_games['Tm'] = tm
        team_games = team_games[['Date', 'Gameweek', 'Tm', 'opp']]

        # Left merge: scheduled games without a game-log row keep NaN in 'Player'
        df = team_games.merge(df, on = ['Date', 'Tm'], how = 'left').sort_values('Date')

        # Days since the team's previous scheduled game (NaN for the first one)
        df['Rest'] = df['Date'].diff().dt.days
        df = df[df['Gameweek'] <= gameweek]
        df_past = df[df['Player'].notna()]
        df_pred = df[df['Player'].isna()]

        past_scores = df_past['SORARE']
        form_1 = past_scores.tail(1).mean()
        form_2 = past_scores.tail(2).mean()
        form_5 = past_scores.tail(5).mean()
        form_10 = past_scores.tail(10).mean()
        # round(nan) raises, so keep NaN when there is no scored game yet
        cost = round(form_10) if pd.notna(form_10) else np.nan
        # NOTE(review): this value is stored in the 'mean_score' column but is
        # the mean of the last 25 scores, while 'Std.' covers all past scores.
        # Confirm against how 'mean_score' was built in the training data.
        form_25 = past_scores.tail(25).mean()
        rest = df_pred['Rest'].mean()
        stdev = past_scores.std()

        records.append([player, cost, stdev, gameweek, list(df_pred['opp']), len(df_pred),
                        form_1, form_2, form_5, form_10, form_25, rest])

    if not records:
        raise ValueError('None of the requested players were found in data_live')

    # Build the frame once instead of concatenating one-row frames
    my_data = pd.DataFrame(records, columns = ['Player', 'Cost', 'Std.', 'Gameweek', 'Against', 'No. of games'] + features)

    model = LinearRegression()
    model.fit(data_train[features], data_train['SORARE'])
    my_data = my_data.assign(pred_raw = model.predict(my_data[features]))
    my_data['multiplier'] = [multiplier(n, mean, std) for n, mean, std in zip(my_data['No. of games'], my_data['mean_score'], my_data['Std.'])]
    my_data['pred'] = my_data['multiplier'] * my_data['pred_raw']

    return my_data

In [172]:
data_train = pd.read_csv('training_data.csv')

# Player names contain non-ASCII characters (e.g. 'Luka Dončić'), so decode the
# roster explicitly instead of relying on the platform default encoding.
# Assumes the file is saved as UTF-8 — TODO confirm.
with open('my_players.txt', 'r', encoding = 'utf-8') as file:
    # One player per line; surrounding whitespace and blank lines are ignored
    my_players = [name.strip() for name in file.read().splitlines() if name.strip()]

gameweek = 17

df_past = my_teams(my_players, gameweek, data_train)

In [173]:
# Show the frame returned by my_teams: per-player features, raw prediction,
# multiplier and final prediction.
df_past

Unnamed: 0,Player,Cost,Std.,Gameweek,Against,No. of games,form_1,form_2,form_5,form_10,mean_score,Rest,pred_raw,multiplier,pred
0,Luka Dončić,59,11.111659,17,"[CLE, POR, CLE]",3,59.2,59.2,58.525,59.277778,60.047826,1.666667,57.378586,1.156595,66.363771
1,Kevin Durant,51,9.00557,17,"[TOR, DET]",2,52.8,52.8,48.075,50.588889,48.95,3.0,47.897634,1.1038,52.869398
2,Jaylen Brown,48,10.996034,17,"[ORL, ORL]",2,55.5,43.65,44.82,47.588889,42.095652,2.5,42.844885,1.147363,49.158642
3,Nikola Jokić,55,11.483556,17,"[WAS, LAL, CHO]",3,78.4,64.95,57.24,54.93,50.640909,2.666667,52.216926,1.191902,62.237458
4,Jerami Grant,40,11.640697,17,"[SAS, DAL, HOU]",3,32.6,29.5,33.82,40.06,35.883333,1.666667,35.414725,1.274547,45.137736
5,Lauri Markkanen,39,12.190827,17,"[NOP, MIL]",2,38.7,38.7,37.95,38.828571,38.081818,2.0,37.660299,1.180616,44.462348
6,Jaylen Nowell,23,11.078564,17,"[LAC, OKC, CHI]",3,22.6,15.8,22.0,23.46,19.028,2.0,20.900486,1.492748,31.199148
7,Larry Nance Jr.,23,12.617861,17,"[UTA, PHO]",2,17.9,33.15,29.7,22.888889,23.682609,2.0,25.045978,1.300575,32.574163
8,Jarred Vanderbilt,28,8.739777,17,"[NOP, MIL]",2,48.8,35.9,30.9,27.69,24.256522,2.0,27.656342,1.203264,33.277881
9,Naz Reid,20,9.981206,17,"[LAC, OKC, CHI]",3,15.3,21.45,24.14,19.866667,16.95,2.0,19.403397,1.498348,29.073042


In [164]:
def multiplier(n, mean, std) :
    """Return E[max(X1, ..., Xn)] / mean for i.i.d. Xi ~ N(mean, std^2).

    The expectation is computed in standardised form,
    E[max] = mean + |std| * E[max of n standard normals], so the numerical
    integration is always carried out on a density centred at zero with unit
    width. Integrating the unstandardised density over (-inf, inf) can miss a
    narrow peak that lies far from zero, and it divides by zero when std == 0.

    Parameters
    ----------
    n : int
        Number of draws.
    mean, std : float
        Mean and standard deviation of a single draw.

    Returns
    -------
    float
        0.0 when ``n <= 0`` (no draw, nothing to scale); NaN when ``mean`` is
        zero or either ``mean`` or ``std`` is not finite; otherwise the
        expected maximum, rounded to 3 decimals, divided by ``mean``.
    """
    if n <= 0:
        return 0.0
    if mean == 0 or not (np.isfinite(mean) and np.isfinite(std)):
        return np.nan

    # PDF of Z = max{Z1, ..., Zn} with Zi ~ N(0, 1): n * phi(z) * Phi(z)^(n - 1)
    def std_max_pdf(z) :
        pdf = np.exp(-0.5 * z * z) / np.sqrt(2 * np.pi)
        cdf = 0.5 + 0.5 * erf(z / np.sqrt(2))
        return n * pdf * cdf ** (n - 1)

    # Mean of the maximum of n standard normal draws
    e_std_max = integrate.quad(lambda z: z * std_max_pdf(z), -np.inf, np.inf)[0]
    # Shift and scale back to N(mean, std^2); the sign of std is irrelevant
    E_max = round(mean + abs(std) * e_std_max, 3)
    # Value of the score multiplier
    return E_max / mean


In [30]:
# Scrape the injury report: every <table> on the page becomes one frame and
# the frames are stacked into `injuries`.
url = 'https://www.espn.co.uk/nba/injuries'
page = rqs.get(url)
page.raise_for_status()  # stop on HTTP errors instead of parsing an error page
soup = BeautifulSoup(page.content, 'html')

tables = soup.find_all('table')
# read_html expects a file-like object; passing literal HTML is deprecated
injuries = pd.concat(pd.read_html(StringIO(str(tables)))).reset_index(drop = True)
injuries.head()

Unnamed: 0,NAME,POS,DATE,STATUS,COMMENT
0,Trent Forrest,G,4 Dec,Out,Forrest (concussion) has been ruled out for Mo...
1,De'Andre Hunter,SF,2 Dec,Out,Hunter will miss at least a week due to a righ...
2,John Collins,PF,2 Dec,Out,Collins will miss at least two weeks due to a ...
3,Al Horford,C,6 Dec,Day-To-Day,
4,Malcolm Brogdon,PG,6 Dec,Day-To-Day,


In [34]:
# Print every anchor that carries an href, table by table, to inspect the links
for table in tables:
    anchors = table.find_all('a', href = True)
    for x in anchors:
        print(x)

<a class="AnchorLink" href="https://www.espn.co.uk/nba/player/_/id/4065656/trent-forrest" tabindex="0">Trent Forrest</a>
<a class="AnchorLink" href="https://www.espn.co.uk/nba/player/_/id/4065732/deandre-hunter" tabindex="0">De'Andre Hunter</a>
<a class="AnchorLink" href="https://www.espn.co.uk/nba/player/_/id/3908845/john-collins" tabindex="0">John Collins</a>
<a class="AnchorLink" href="https://www.espn.co.uk/nba/player/_/id/3213/al-horford" tabindex="0">Al Horford</a>
<a class="AnchorLink" href="https://www.espn.co.uk/nba/player/_/id/2566769/malcolm-brogdon" tabindex="0">Malcolm Brogdon</a>
<a class="AnchorLink" href="https://www.espn.co.uk/nba/player/_/id/4066211/robert-williams-iii" tabindex="0">Robert Williams III</a>
<a class="AnchorLink" href="https://www.espn.co.uk/nba/player/_/id/3428/danilo-gallinari" tabindex="0">Danilo Gallinari</a>
<a class="AnchorLink" href="https://www.espn.co.uk/nba/player/_/id/3136485/edmond-sumner" tabindex="0">Edmond Sumner</a>
<a class="AnchorLink"

In [41]:
# Outer-join the live player table with the injury report on the player name
# and show the first 25 rows
(
    data_live
    .merge(
        injuries.rename(columns = {'NAME': 'Player'}),
        on = 'Player',
        how = 'outer',
    )
    .head(25)
)

Unnamed: 0,Player,Tm,PTS,href,POS,DATE,STATUS,COMMENT
0,Luka Dončić,DAL,33.4,/players/d/doncilu01,,,,
1,Giannis Antetokounmpo,MIL,31.9,/players/a/antetgi01,,,,
2,Joel Embiid,PHI,31.9,/players/e/embiijo01,,,,
3,Shai Gilgeous-Alexander,OKC,31.3,/players/g/gilgesh01,,,,
4,Jayson Tatum,BOS,30.8,/players/t/tatumja01,,,,
5,Stephen Curry,GSW,30.0,/players/c/curryst01,,,,
6,Kevin Durant,BRK,29.9,/players/d/duranke01,,,,
7,Anthony Davis,LAL,28.6,/players/d/davisan02,PF,5 Dec,Day-To-Day,Davis (back) is probable for Tuesday's game ag...
8,Ja Morant,MEM,28.5,/players/m/moranja01,PG,6 Dec,Day-To-Day,
9,Devin Booker,PHO,28.4,/players/b/bookede01,,,,
