In [226]:
import pandas as pd
import numpy as np
import datetime
from bs4 import BeautifulSoup
import requests as rqs
from sklearn.linear_model import LinearRegression

In [87]:
def single_team(df) :
    if len(df) == 1: # If player played for a single team, do nothing
        return df
    else: # If player has moved, return total stats with last team
        row = df[df['Tm'] == 'TOT'].copy()
        row['Tm'] = [str(df['Tm'].iloc[-1])]
        return row

def SoRareScore(df):

    df = df.apply(pd.to_numeric, errors = 'ignore')
    scores = []
    for pts, reb, ast, blk, stl, tov, fg_3 in zip(df['PTS'], df['TRB'], df['AST'], df['BLK'], df['STL'], df['TOV'], df['3P']) :
        
        score = 0
        if pts not in ['Inactive', 'Did Not Play', 'Did Not Dress', 'Not With Team', 'Player Suspended']:
            
            pts, reb, ast, blk, stl, tov, fg_3 = int(pts), int(reb), int(ast), int(blk), int(stl), int(tov), int(fg_3)

            score = 1 * pts + 1.2 * reb + 1.5 * ast + 3 * blk + 3 * stl + 1 * fg_3 - 2 * tov

            doubles = 0 # Count double-digits stats
            if pts >= 10 :
                doubles += 1
            if reb >= 10 :
                doubles += 1
            if ast >= 10 :
                doubles += 1
            if blk >= 10 :
                doubles += 1
            if stl >= 10 :
                doubles += 1

            if doubles == 2 : # Double-double
                score += 1

            if doubles >= 3 : # Tripe-double
                score += 2
        
        else :
            score = 'dnp'

        scores.append(score)
    return scores

def mins_played(str) :
    min, sec = str.split(':')
    return float(min) + float(sec) / 60

In [88]:
# Get list of player hrefs
url = f'https://www.basketball-reference.com/leagues/NBA_2023_per_game.html'
page = rqs.get(url)
soup = BeautifulSoup(page.content, 'html')
while soup.find('tr', class_ = 'thead') is not None :
    soup.find('tr', class_ = 'thead').decompose()

table = soup.find('table')
data_live = pd.read_html(str(table))[0][['Player', 'Tm', 'PTS']]
hrefs = []
for x in table.find_all('a', href = True):
    if x['href'].split('/')[1] == 'players' :
        hrefs.append(x['href'][:-5])
data_live['href'] = hrefs
data_live = data_live.groupby('Player').apply(single_team)
data_live = data_live.sort_values('PTS', ascending = False).reset_index(drop = True)

data_live.head(10)

Unnamed: 0,Player,Tm,PTS,href
0,Luka Dončić,DAL,33.5,/players/d/doncilu01
1,Jayson Tatum,BOS,31.6,/players/t/tatumja01
2,Stephen Curry,GSW,31.4,/players/c/curryst01
3,Giannis Antetokounmpo,MIL,31.3,/players/a/antetgi01
4,Shai Gilgeous-Alexander,OKC,31.1,/players/g/gilgesh01
5,Joel Embiid,PHI,31.1,/players/e/embiijo01
6,Kevin Durant,BRK,30.4,/players/d/duranke01
7,Devin Booker,PHO,29.0,/players/b/bookede01
8,Donovan Mitchell,CLE,28.4,/players/m/mitchdo01
9,Ja Morant,MEM,28.2,/players/m/moranja01


In [101]:
months = ['october', 'november', 'december', 'january', 'february', 'march', 'april']

monthly_schedule_list = []
for month in months:
    url = f'https://www.basketball-reference.com/leagues/NBA_2023_games-{month}.html'
    page = rqs.get(url)
    soup = BeautifulSoup(page.content, 'html')
    while soup.find('tr', class_ = 'thead') is not None :
        soup.find('tr', class_ = 'thead').decompose()

    table = soup.find('table')
    monthly_schedule = pd.read_html(str(table))[0]

    home_tm, away_tm = [], []
    for i, x in enumerate(table.find_all('a', href = True), start = 1):
        if x['href'].split('/')[1] == 'teams' :
            if i % 2 == 1 :
                home_tm.append((x['href'][7:10]))
            else :
                away_tm.append((x['href'][7:10]))

    monthly_schedule = monthly_schedule.assign(home = home_tm, away = away_tm)[['Date', 'home', 'away']]
    monthly_schedule['Date'] = monthly_schedule['Date'].apply(lambda x: datetime.datetime.strptime(x, '%a, %b %d, %Y'))
    monthly_schedule_list.append(monthly_schedule)

schedule = pd.concat(monthly_schedule_list).reset_index(drop = True)

gw = 1
dates, gws = [], []
date = datetime.datetime(2022, 10, 21)
while date < datetime.datetime(2023, 5, 10):
    dates.append(date)
    gws.append(gw)
    if date.weekday() in [3, 6]:
        gw = gw + 1

    date = date + datetime.timedelta(days = 1)

gameweeks = pd.DataFrame(zip(dates, gws), columns = ['Date', 'Gameweek'])
schedule = schedule.merge(gameweeks, on = 'Date')

schedule.head()

Unnamed: 0,Date,home,away,Gameweek
0,2022-10-21,CHO,NOP,1
1,2022-10-21,IND,SAS,1
2,2022-10-21,WAS,CHI,1
3,2022-10-21,ATL,ORL,1
4,2022-10-21,BRK,TOR,1


In [286]:
features = ['form_1', 'form_2', 'form_5', 'form_10', 'mean_score', 'Rest']

def my_teams(my_players, gameweek, data_train) :

    my_data = data_live[data_live['Player'].isin(my_players)]

    my_players_info = []
    for player, href in zip(my_data['Player'], my_data['href']) :

        url = f'https://www.basketball-reference.com{href}/gamelog/2023'

        page = rqs.get(url)
        soup = BeautifulSoup(page.content, 'html')
        while soup.find('tr', class_ = 'thead') is not None :
            soup.find('tr', class_ = 'thead').decompose()

        table = soup.find('table', id = 'pgl_basic')
        df = pd.read_html(str(table))[0][['Date', 'Tm', 'G', 'MP', 'PTS', 'TRB', 'AST', 'BLK', 'STL', 'TOV', '3P']]
        df.insert(0, 'Player', len(df) * [player])
        df = df.apply(pd.to_numeric, errors = 'ignore')
        df['SORARE'] = SoRareScore(df)
        df['Date'] = df['Date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
        df = df.sort_values('Date')
        df['SORARE'] = df['SORARE'].replace('dnp', np.nan)
        #df['Rest'] = [np.nan] + [(t1 - t0).days for t1, t0 in zip(df.Date[1:], df.Date[:-1])]

        tm = df['Tm'].iloc[-1]
        player_schedule = schedule[(schedule['home'] == tm) | (schedule['away'] == tm)]

        away_games = player_schedule.rename(columns = {'away' : 'Tm'}).merge(df, on = ['Date', 'Tm'], how = 'left')
        home_games = player_schedule.rename(columns = {'home' : 'Tm'}).merge(df, on = ['Date', 'Tm'], how = 'left')

        away_games = away_games[away_games['Tm'] == tm].rename(columns = {'home' : 'opp'})
        home_games = home_games[home_games['Tm'] == tm].rename(columns = {'away' : 'opp'})

        
        df = pd.concat([home_games, away_games]).sort_values('Date')

        df['Rest'] = [np.nan] + [(t1 - t0).days for t1, t0 in zip(df.Date[1:], df.Date[:-1])]
        df = df[df['Gameweek'] <= gameweek]
        df_past = df[df['Player'].notna()]
        df_pred = df[df['Player'].isna()]

        form_1 = df_past['SORARE'].tail(1).mean()
        form_2 = df_past['SORARE'].tail(2).mean()
        form_5 = df_past['SORARE'].tail(5).mean()
        form_10 = df_past['SORARE'].tail(10).mean()
        df_past['cost'] = 
        form_25 = df_past['SORARE'].tail(25).mean()
        rest = df_pred.Rest.mean()
        
        player_form = pd.DataFrame([[player, cost, gameweek, list(df_pred['opp']), len(df_pred), form_1, form_2, form_5, form_10, form_25, rest]],
                                                columns = ['Player', 'Cost', 'Gameweek', 'Against', 'No. of games'] + features)

        my_players_info.append(player_form)
        
    my_data = pd.concat(my_players_info).reset_index(drop = True)


    model = LinearRegression()
    model.fit(data_train[features], data_train['SORARE'])

    my_data = my_data.assign(pred_raw = model.predict(my_data[features]))

    multiplier = {1: 1.0, 2: 1.15, 3: 1.25, 4: 1.325}
    my_data['pred'] = [pred * multiplier.get(no_games) for pred, no_games in zip(my_data['pred_raw'], my_data['No. of games'])]

    return my_data

SyntaxError: invalid syntax (4147138103.py, line 48)

In [287]:
data_train = pd.read_csv('training_data.csv')
with open('my_players.txt', 'r') as file:
    my_players = file.read().splitlines()

gameweek = 13

my_teams(my_players, gameweek, data_train)

         Date   Tm  opp  Gameweek        Player     G             MP  \
0  2022-10-21  BOS  MIA         1  Jaylen Brown   2.0          34:01   
1  2022-10-22  BOS  ORL         1  Jaylen Brown   3.0          37:30   
2  2022-10-24  BOS  CHI         2  Jaylen Brown   4.0          35:23   
3  2022-10-28  BOS  CLE         3  Jaylen Brown   5.0          42:48   
4  2022-10-30  BOS  WAS         3  Jaylen Brown   6.0          29:58   
5  2022-11-02  BOS  CLE         4  Jaylen Brown   7.0          44:36   
6  2022-11-04  BOS  CHI         5  Jaylen Brown   8.0          33:50   
7  2022-11-05  BOS  NYK         5  Jaylen Brown   9.0          37:19   
8  2022-11-07  BOS  MEM         6  Jaylen Brown  10.0          38:16   
9  2022-11-09  BOS  DET         6  Jaylen Brown  11.0          30:01   
10 2022-11-11  BOS  DEN         7  Jaylen Brown  12.0          34:47   
11 2022-11-12  BOS  DET         7  Jaylen Brown   NaN  Did Not Dress   
12 2022-11-14  BOS  OKC         8  Jaylen Brown  13.0          4

ValueError: cannot convert float NaN to integer

In [222]:
r_12, r_13, r_14 = [], [], []
for i in range(10 ** 7) :

    score_1 = np.random.normal(30, 10)
    score_2 = np.max([np.random.normal(30, 10), np.random.normal(30, 10)])
    score_3 = np.max([np.random.normal(30, 10), np.random.normal(30, 10), np.random.normal(30, 10)])
    score_4 = np.max([np.random.normal(30, 10), np.random.normal(30, 10), np.random.normal(30, 10), np.random.normal(30, 10)])

    r_12.append(score_2 / score_1)
    r_13.append(score_3 / score_1)
    r_14.append(score_4 / score_1)

print(f'On average : score(2) / score(1) = {np.median(r_12)}')
print(f'On average : score(3) / score(1) = {np.median(r_13)}')
print(f'On average : score(4) / score(1) = {np.median(r_14)}')

On average : score(2) / score(1) = 1.1845246517399017
On average : score(3) / score(1) = 1.277892824033831
On average : score(4) / score(1) = 1.339239768467373
