In [1]:
import datetime
from io import StringIO
from time import sleep

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pulp
import requests as rqs
from bs4 import BeautifulSoup
from IPython.display import clear_output
from scipy import integrate
from scipy.special import erf
from sklearn.linear_model import LinearRegression
from unidecode import unidecode

In [2]:
def single_team(df) :
    """Collapse the season rows of one player into a single row.

    Intended to be called once per player, e.g. through
    ``groupby('Player').apply(single_team)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to one player name; must contain a 'Tm' column.

    Returns
    -------
    pandas.DataFrame
        * one input row: the input, untouched;
        * several rows including a 'TOT' (season total) row: the 'TOT' row(s)
          with 'Tm' replaced by the team found on the last input row;
        * several rows and no 'TOT' row: the input, untouched.
    """
    if len(df) == 1 : # Player played for a single team, nothing to do
        return df

    totals = df[df['Tm'] == 'TOT']
    if totals.empty :
        # Several rows but no aggregate one (e.g. two players sharing a name).
        # The original assigned a one-element list to an empty frame here,
        # which raises ValueError; keep the rows as they are instead.
        return df

    # Player has moved: keep the total stats, labelled with the last team.
    # A scalar is broadcast, so this also works if several 'TOT' rows match.
    return totals.assign(Tm = str(df['Tm'].iloc[-1]))

def multiplier(n, mean, std) :
    """Score multiplier for a player who has `n` games in the gameweek.

    The multiplier is E[max(X1, ..., Xn)] / mean with Xi ~ N(mean, std^2),
    i.e. how much larger the expected best score out of `n` games is than
    the expected score of a single game.

    Parameters
    ----------
    n : int
        Number of games. 0 gives 0 (no score), 1 gives 1 (no adjustment).
    mean : float
        Mean of the single-game score distribution.
    std : float
        Standard deviation of the single-game score distribution. Only its
        magnitude matters. A zero or NaN value is treated as "no spread", so
        the expected maximum equals `mean`.

    Returns
    -------
    float or int
        The multiplier. When `mean` is 0 the expected maximum itself is
        returned (division by 1), as in the original implementation.
    """
    if n == 0 : # If there's no game : 0 score prediction
        return 0
    if n == 1 : # If there's 1 game : no multiplier needed
        return 1

    # More than 1 game : multiplier required.
    if np.isnan(std) or std == 0 :
        # Degenerate distribution: every draw equals the mean. The original
        # divided by std here (ZeroDivisionError) or propagated NaN.
        spread = 0.0
    else :
        # PDF of Z = max{Z1, ..., Zn} with Zi ~ N(0, 1). Since
        # max{Xi} = mean + |std| * max{Zi}, integrating the standardised
        # density keeps the integrand on the same scale for any mean/std,
        # which is kinder to quad on an infinite interval.
        def std_max_pdf(z) :
            density = np.exp(-0.5 * z * z) / np.sqrt(2 * np.pi)
            cdf = 0.5 + 0.5 * erf(z / np.sqrt(2))
            return n * density * cdf ** (n - 1)

        expected_std_max = integrate.quad(lambda z: z * std_max_pdf(z), -np.inf, np.inf)[0]
        spread = abs(std) * expected_std_max

    # Mean of the distribution of the max, rounded as in the original.
    E_max = round(mean + spread, 3)

    # Avoid dividing by zero for players whose mean score is 0.
    if mean == 0 :
        mean = 1

    return E_max / mean

def SoRareScore(df):
    """Compute the per-game score for every row of a game log.

    Parameters
    ----------
    df : pandas.DataFrame
        Game log with the columns 'PTS', 'TRB', 'AST', 'BLK', 'STL', 'TOV'
        and '3P'. Values may be numbers or numeric strings; rows of games
        the player missed carry one of the status labels below in 'PTS'.

    Returns
    -------
    list
        One entry per row, in row order: the float score for a played game,
        or the string 'dnp' when 'PTS' holds a "did not play" status label.

    Scoring
    -------
    1 * PTS + 1.2 * TRB + 1.5 * AST + 3 * BLK + 3 * STL + 1 * 3P - 2 * TOV,
    plus 1 for a double-double or 2 for a triple-double (or better).
    """
    not_played = {'Inactive', 'Did Not Play', 'Did Not Dress', 'Not With Team', 'Player Suspended'}
    stat_columns = ['PTS', 'TRB', 'AST', 'BLK', 'STL', 'TOV', '3P']

    scores = []
    for raw_stats in zip(*(df[column] for column in stat_columns)) :

        if raw_stats[0] in not_played :
            scores.append('dnp')
            continue

        # Cast value by value rather than converting whole columns with the
        # deprecated pd.to_numeric(errors = 'ignore'); going through float
        # also accepts strings such as '12.0'.
        pts, reb, ast, blk, stl, tov, fg_3 = (int(float(value)) for value in raw_stats)

        score = 1 * pts + 1.2 * reb + 1.5 * ast + 3 * blk + 3 * stl + 1 * fg_3 - 2 * tov

        # Count double-digit stats
        doubles = sum(stat >= 10 for stat in (pts, reb, ast, blk, stl))

        if doubles == 2 : # Double-double
            score += 1
        elif doubles >= 3 : # Triple-double
            score += 2

        scores.append(score)
    return scores

In [3]:
# Build the 2022-23 schedule (one row per game) and tag every game with its gameweek.
months = ['october', 'november', 'december', 'january', 'february', 'march', 'april']

monthly_schedule_list = []
for month in months:
    url = f'https://www.basketball-reference.com/leagues/NBA_2023_games-{month}.html'
    page = rqs.get(url, timeout = 30)
    page.raise_for_status() # fail here, not later on a missing table
    soup = BeautifulSoup(page.content, 'html')
    # Remove the header rows repeated inside the table body so they do not end up as data rows
    for header_row in soup.find_all('tr', class_ = 'thead'):
        header_row.decompose()

    table = soup.find('table')
    # read_html wants a file-like object: passing literal HTML is deprecated since pandas 2.1
    monthly_schedule = pd.read_html(StringIO(str(table)))[0]

    # Team codes come from the '/teams/XXX/...' links, in document order.
    # The original picked home/away from the parity of the index over *all*
    # links of the table, which only works while every row carries an even
    # number of non-team links. Pairing the team links on their own keeps the
    # same mapping (first team link of a game -> away, second -> home)
    # whatever else a row links to.
    # NOTE(review): assumes each game row lists the visitor link before the home link.
    team_codes = [a['href'][7:10] for a in table.find_all('a', href = True) if a['href'].startswith('/teams/')]
    away_tm, home_tm = team_codes[0::2], team_codes[1::2]

    monthly_schedule = monthly_schedule.assign(home = home_tm, away = away_tm)[['Date', 'home', 'away']]
    monthly_schedule['Date'] = pd.to_datetime(monthly_schedule['Date'], format = '%a, %b %d, %Y')
    monthly_schedule_list.append(monthly_schedule)

schedule = pd.concat(monthly_schedule_list).reset_index(drop = True)

# Calendar of gameweeks: numbering starts at 1 on 2022-10-21 and the counter is
# bumped after every Thursday (weekday 3) and Sunday (weekday 6).
gw = 1
dates, gws = [], []
date = datetime.datetime(2022, 10, 21)
while date < datetime.datetime(2023, 5, 10):
    dates.append(date)
    gws.append(gw)
    if date.weekday() in [3, 6]:
        gw = gw + 1

    date = date + datetime.timedelta(days = 1)

gameweeks = pd.DataFrame(zip(dates, gws), columns = ['Date', 'Gameweek'])
# Inner merge: games outside the calendar above (before 2022-10-21) are dropped
schedule = schedule.merge(gameweeks, on = 'Date')

schedule.head()

Unnamed: 0,Date,home,away,Gameweek
0,2022-10-21,CHO,NOP,1
1,2022-10-21,IND,SAS,1
2,2022-10-21,WAS,CHI,1
3,2022-10-21,ATL,ORL,1
4,2022-10-21,BRK,TOR,1


In [5]:
# Tokens dropped from a player name before matching it against the injury list
_NAME_SUFFIXES = {'jr', 'sr', 'ii', 'iii', 'iv'}


def _fetch_soup(url, strip_header_rows = True) :
    """Download `url` and return it parsed, optionally without the repeated 'thead' table rows."""
    page = rqs.get(url, timeout = 30)
    page.raise_for_status() # fail here, not later on a missing table
    soup = BeautifulSoup(page.content, 'html')
    if strip_header_rows :
        for header_row in soup.find_all('tr', class_ = 'thead') :
            header_row.decompose()
    return soup


def _merge_key(name) :
    """Normalise a player name for matching: ASCII, lower case, no dots, no generational suffix."""
    tokens = unidecode(str(name)).lower().replace('.', '').split()
    return ' '.join(token for token in tokens if token not in _NAME_SUFFIXES)


# THE MIGHTY FUNCTION
def my_teams(gameweek, competitions, limited = False) :
    """Predict the scores of my players for `gameweek` and pick a line-up per competition.

    Parameters
    ----------
    gameweek : int
        Gameweek to predict, as numbered in the global `schedule` frame.
    competitions : list of dict
        Competitions in order of importance, each shaped like
        ``{'NAME': str, 'CAP': number, 'MVP': bool}``. Line-ups are filled in
        this order and a player is used at most once. With 'MVP' set, the
        cost of one picked player is left out of the cap constraint.
    limited : bool, default False
        Currently has no effect (see the NOTE(review) below).

    Returns
    -------
    pandas.DataFrame
        One row per non-injured player of mine, with the features, the raw
        prediction ('pred_raw'), the multiplier and the final 'pred'.

    Side effects
    ------------
    Reads 'training_data.csv' and 'my_players.txt', reads the global
    `schedule`, downloads pages from basketball-reference.com and
    espn.co.uk, sleeps 2 s after each game log, clears the cell output
    and prints the line-ups and the injury report.
    """
    # Column names shared by the training data and the live feature table.
    # Defined up front so they exist even if the player loop never runs.
    features = ['form_1', 'form_2', 'form_5', 'form_10', 'mean_score', 'Rest']

    ### LOAD TRAINING DATA AND MY PLAYERS ###
    data_train = pd.read_csv('training_data.csv')
    data_train['SORARE'] = data_train['SORARE'].astype(float)
    # NOTE(review): the original had an `if limited` / `else` whose two branches
    # both read 'my_players.txt'; presumably `limited` was meant to select
    # another file - confirm. Behaviour is unchanged: one file either way.
    with open('my_players.txt', 'r') as file:
        my_players = [line.strip() for line in file.read().splitlines() if line.strip()]
    #############################

    selected_players = []

    ### COLLECT LIVE PLAYER DATA ###
    print('Collecting live player data ...')

    soup = _fetch_soup('https://www.basketball-reference.com/leagues/NBA_2023_per_game.html')
    table = soup.find('table')
    # read_html wants a file-like object: passing literal HTML is deprecated since pandas 2.1
    data_live = pd.read_html(StringIO(str(table)))[0][['Player', 'Tm']].copy()
    # Player page paths without the trailing '.html', in table order
    data_live['href'] = [a['href'][:-5] for a in table.find_all('a', href = True) if a['href'].startswith('/players/')]
    data_live = data_live.groupby('Player').apply(single_team)
    data_live = data_live.reset_index(drop = True)
    #############################

    ### CREATE FEATURES FOR MY PLAYERS ###
    clear_output()
    print('Fetching my players game logs ...')

    roster = data_live[data_live['Player'].isin(my_players)]
    # Remember the names that did not match: the original printed a warning
    # here, but the later clear_output() erased it before it could be read.
    known_names = set(roster['Player'])
    unrecognized = [name for name in my_players if name not in known_names]
    if roster.empty :
        raise ValueError('None of the names in my_players.txt was found in the live player table')

    records = []
    for i, (player, href) in enumerate(zip(roster['Player'], roster['href'])) :

        print(f'     ... {round(100*(i+1)/len(roster), 1)}% {player}                 ', end = '')
        print('\r', end = '')

        ## SCRAPE PLAYER GAME LOGS ##
        soup = _fetch_soup(f'https://www.basketball-reference.com{href}/gamelog/2023')
        table = soup.find('table', id = 'pgl_basic')
        df = pd.read_html(StringIO(str(table)))[0][['Date', 'Tm', 'G', 'MP', 'PTS', 'TRB', 'AST', 'BLK', 'STL', 'TOV', '3P']].copy()
        df.insert(0, 'Player', len(df) * [player])
        # 'dnp' entries become NaN, everything else a float
        df['SORARE'] = pd.to_numeric(pd.Series(SoRareScore(df), index = df.index), errors = 'coerce')
        df['Date'] = pd.to_datetime(df['Date'], format = '%Y-%m-%d')
        df = df.sort_values('Date')
        #############################

        ## MERGE PLAYER WITH ITS TEAM'S SCHEDULE ##
        # Only the games of the player's latest team are kept: log rows
        # played for a previous team do not match on 'Tm'.
        tm = df['Tm'].iloc[-1]
        player_schedule = schedule[(schedule['home'] == tm) | (schedule['away'] == tm)]
        away_games = player_schedule.rename(columns = {'away' : 'Tm'}).merge(df, on = ['Date', 'Tm'], how = 'left')
        home_games = player_schedule.rename(columns = {'home' : 'Tm'}).merge(df, on = ['Date', 'Tm'], how = 'left')
        away_games = away_games[away_games['Tm'] == tm].rename(columns = {'home' : 'opp'})
        home_games = home_games[home_games['Tm'] == tm].rename(columns = {'away' : 'opp'})
        df = pd.concat([home_games, away_games]).sort_values('Date')
        # Days since the team's previous game (NaN for its first game)
        df['Rest'] = df['Date'].diff().dt.days
        df = df[df['Gameweek'] <= gameweek]
        df_past = df[df['Player'].notna()]
        # Games to predict: no log row yet AND in the requested gameweek. Without
        # the gameweek test, earlier team games missing from the log (e.g. played
        # before the player joined this team) were counted as upcoming games.
        df_pred = df[df['Player'].isna() & (df['Gameweek'] == gameweek)]
        #############################

        ## CREATE MACHINE LEARNING FEATURES ##
        played = df_past['SORARE'].dropna()
        # NaN when the player has no scored game yet; filled from 'mean_score' below
        form_1 = played.iloc[-1] if len(played) else np.nan
        form_2 = df_past['SORARE'].tail(2).mean()
        form_5 = df_past['SORARE'].tail(5).mean()
        form_10 = df_past['SORARE'].tail(10).mean()
        form_25 = df_past['SORARE'].tail(25).mean()
        cost = df_past[(df_past['SORARE'].notna()) & (df_past['Gameweek'] < gameweek-1)].tail(10).SORARE.mean()
        # NOTE(review): a player without any scored game before gameweek-1 has no
        # cost; 0 is used so that round() does not fail on NaN - confirm the
        # value the game actually applies in that case.
        cost = 0 if np.isnan(cost) else round(cost)

        records.append({'Player' : player,
                        'Cost' : cost,
                        'Std.' : df_past['SORARE'].std(),
                        'Gameweek' : gameweek,
                        'Against' : list(df_pred['opp']),
                        'No. of games' : len(df_pred),
                        'form_1' : form_1,
                        'form_2' : form_2,
                        'form_5' : form_5,
                        'form_10' : form_10,
                        'mean_score' : form_25, # mean over the last 25 logged games
                        'Rest' : df_pred['Rest'].mean()})
        sleep(2) # pause between game-log requests

    my_data = pd.DataFrame(records)
    my_data['mean_score'] = my_data['mean_score'].fillna(0)
    my_data['Rest'] = my_data['Rest'].fillna(1.0)
    for w in [1, 2, 5, 10] :
        my_data[f'form_{w}'] = my_data[f'form_{w}'].fillna(my_data['mean_score'])
    #############################

    ### INJURIES ###
    injury_report = 'INJURY REPORT :'
    soup = _fetch_soup('https://www.espn.co.uk/nba/injuries', strip_header_rows = False)
    tables = soup.find_all('table')
    injuries = pd.concat(pd.read_html(StringIO(str(tables)))).reset_index(drop = True)

    # The chained str.replace of the original removed 'jr', 'sr', 'ii' anywhere
    # in the name, left 'i' behind for 'iii' and kept a trailing space, so a
    # suffixed name never matched its unsuffixed spelling.
    my_data['Name_merge'] = my_data['Player'].apply(_merge_key)
    injuries['Name_merge'] = injuries['NAME'].apply(_merge_key)
    my_data = my_data.merge(injuries, on = 'Name_merge', how = 'left')
    my_injuries = my_data[my_data['NAME'].notna()]
    my_data = my_data[my_data['NAME'].isna()] # injured players are not eligible
    for player, status in zip(my_injuries['NAME'], my_injuries['STATUS']) :
        injury_report = injury_report + f' {player} ({str(status).upper()});'
    #############################


    ### COMPUTE MY PLAYERS PROJECTED SCORES ###
    clear_output()
    print('Predicting my players scores ... ')

    model = LinearRegression()
    model.fit(data_train[features], data_train['SORARE'])
    my_data = my_data.assign(pred_raw = model.predict(my_data[features]))
    my_data['multiplier'] = [multiplier(n, mean, std) for n, mean, std in zip(my_data['No. of games'], my_data['mean_score'], my_data['Std.'])]
    my_data['pred'] = my_data['multiplier'] * my_data['pred_raw']
    my_data = my_data.sort_values('Cost', ascending = False)
    sleep(2)
    #############################

    ### OPTIMISE TOTAL PROJECTED SCORE FOR ALL INPUT COMPETITIONS ###
    clear_output()
    selection = my_data.copy()
    print('Optimising competition lineups ... ')
    for competition in competitions:

        # Initiate maximization problem with PuLP
        prob = pulp.LpProblem('prob', pulp.LpMaximize)
        # Create decision variables : Is the player chosen ?
        x = pulp.LpVariable.dicts('x', my_data['Player'], lowBound = 0, upBound = 1, cat = pulp.LpInteger)
        var_x = list(x.values())

        # Objective function : Maximize the total predicted score
        prob += pulp.lpSum([pick * pred for pick, pred in zip(var_x, my_data['pred'])])
        # Exactly five players per line-up
        prob += pulp.lpSum(var_x) == 5

        if competition.get('MVP'):
            # Add MVP decision variables
            y = pulp.LpVariable.dicts('mvp', my_data['Player'], lowBound = 0, upBound = 1, cat = pulp.LpInteger)
            var_y = list(y.values())
            # Budget constraint : the MVP's cost does not count against the cap
            prob += pulp.lpSum([(pick - mvp) * cost for pick, mvp, cost in zip(var_x, var_y, my_data['Cost'])]) <= competition.get('CAP')
            # Exactly one MVP, who must be one of the picked players
            prob += pulp.lpSum(var_y) == 1
            for vx, vy in zip(var_x, var_y) :
                prob += vx - vy >= 0

        else:
            # Budget constraint
            prob += pulp.lpSum([pick * cost for pick, cost in zip(var_x, my_data['Cost'])]) <= competition.get('CAP')

        # Solve optimisation problem
        status = prob.solve(pulp.PULP_CBC_CMD(msg = False))

        # Save selected players list
        sel_players = []
        if status == pulp.LpStatusOptimal :
            # Compare against 0.5 rather than 1.0 exactly: the solver reports floats
            sel_players = [name for name, var in x.items() if var.varValue is not None and var.varValue > 0.5]
        # Otherwise (e.g. fewer than five players left, cap too low) no line-up is recorded

        my_data = my_data[~my_data['Player'].isin(sel_players)] # Get rid of already selected players
        selected_players.append(sel_players)
        #############################

    for competition, players in zip(competitions, selected_players) :

        if not players :
            print(f"\nCompetition : {competition.get('NAME')}, no feasible lineup found")
            continue

        comp_df = selection[selection['Player'].isin(players)]
        print(f"\nCompetition : {competition.get('NAME')}, Projected points : {round(comp_df.pred.sum(), 1)}")
        for player, cost, pred in zip(comp_df['Player'], comp_df['Cost'], comp_df['pred']) :
            print(f'     {player.upper()} ({cost}) : {round(pred, 1)} pts')

    print('\n' + injury_report)
    if unrecognized :
        print('\nPlayer name not recognized : ' + ', '.join(unrecognized))

    return selection

In [7]:
# Competition definitions.
#   NAME : label printed with the line-up
#   CAP  : upper bound on the summed cost of the picked players
#   MVP  : when True, my_teams leaves one picked player's cost out of the cap
common_contender = dict(NAME = 'COMMON CONTENDER', CAP = 106, MVP = False)
common_champion = dict(NAME = 'COMMON CHAMPION', CAP = 120, MVP = True)
offense = dict(NAME = 'LIMITED CONTENDER', CAP = 120, MVP = True)

# Order matters: my_teams fills the line-ups one after the other and removes
# the players it has already used before solving the next competition.
competitions = [common_champion, common_contender, offense]

gameweek = 21

selection = my_teams(gameweek = gameweek, competitions = competitions, limited = False)

Optimising competition lineups ... 

Competition : COMMON CHAMPION, Projected points : 233.2
     JOEL EMBIID (57) : 65.0 pts
     JA MORANT (42) : 51.7 pts
     DEMAR DEROZAN (41) : 53.5 pts
     JONAS VALANČIŪNAS (26) : 38.1 pts
     ANDRE DRUMMOND (11) : 25.0 pts

Competition : COMMON CONTENDER, Projected points : 156.9
     LAURI MARKKANEN (41) : 49.1 pts
     BROOK LOPEZ (27) : 37.5 pts
     NAZ REID (25) : 33.8 pts
     JAXSON HAYES (8) : 23.7 pts
     KZ OKPALA (5) : 12.7 pts

Competition : LIMITED CONTENDER, Projected points : 214.9
     NIKOLA JOKIĆ (64) : 62.9 pts
     LUKA DONČIĆ (55) : 62.7 pts
     JERAMI GRANT (31) : 38.6 pts
     IMMANUEL QUICKLEY (21) : 29.6 pts
     WILLY HERNANGÓMEZ (13) : 21.2 pts

INJURY REPORT : Bruce Brown (DAY-TO-DAY); De'Andre Hunter (DAY-TO-DAY); Devin Booker (OUT); Fred VanVleet (DAY-TO-DAY); Jimmy Butler (DAY-TO-DAY); Josh Green (OUT); Khris Middleton (OUT); Larry Nance Jr. (DAY-TO-DAY); Maxi Kleber (OUT); Ousmane Dieng (OUT);


In [23]:
# Show the frame returned by my_teams, restricted to the columns needed to sanity-check a prediction
selection.loc[:, ['Player', 'Cost', 'Against', 'No. of games', 'form_2', 'mean_score', 'pred']]

Unnamed: 0,Player,Cost,Against,No. of games,form_2,mean_score,pred
5,Keldon Johnson,34,"[UTA, OKC, NYK]",3,28.8,29.938095,38.907152
9,Norman Powell,27,"[DET, TOR, BOS]",3,23.6,26.113333,34.614273
10,Rui Hachimura,20,"[PHI, PHO]",2,25.95,21.911111,29.149547
2,Isaiah Hartenstein,16,"[DAL, SAS]",2,9.3,15.084,19.115017
7,Keon Johnson,11,[CHO],1,10.35,9.311111,11.549302
8,Mark Williams,9,"[POR, GSW, OKC]",3,0.0,0.0,10.482695
1,Greg Brown III,6,[CHO],1,6.366667,6.366667,7.385064
4,Josh Christopher,5,"[CHI, BOS, DAL]",3,1.5,4.638462,11.296334
0,Dalen Terry,3,"[HOU, MIL]",2,1.2,3.45,7.977112
3,Isaiah Mobley,3,"[BRK, IND]",2,3.0,3.0,7.352695


In [24]:
# Load the training set at notebook level (my_teams reads the same file into a local variable of its own)
data_train = pd.read_csv('training_data.csv')