In [2]:
pwd

'c:\\Users\\nicol\\Documents\\GitHub\\basketball-analysis'

In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
from IPython.display import display

In [4]:
def display_full(df):
    """Display ``df`` with pandas' row and column display limits lifted.

    The two options are overridden only inside the ``option_context`` block,
    so the previous display settings are back in effect after the call.
    """
    unlimited = ('display.max_rows', None, 'display.max_columns', None)
    with pd.option_context(*unlimited):
        display(df)

In [None]:
def drop_box_score(df):
    """Return a new frame without the ``'Unnamed: 6'`` column.

    Going by the function name, that unnamed column is presumably the
    box-score column of the raw export (confirm against the raw file).
    Raises ``KeyError`` if the column is not present.
    """
    return df.drop(labels=['Unnamed: 6'], axis='columns')

def rename_ot(df):
    """Rename ``'Unnamed: 7'`` to ``'OT'`` and store it as an integer count.

    Raw values are mapped as follows: missing -> 0, ``'OT'`` -> 1,
    ``'<n>OT'`` -> n. The result is always an integer column, so frames from
    months with and without overtime games concatenate to a single dtype
    (previously the column mixed the strings ``'1'``/``'2'`` with a numeric 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``'Unnamed: 7'`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A renamed copy whose ``'OT'`` column has an integer dtype.

    Raises
    ------
    ValueError
        If a value is neither missing nor reducible to a number, so that
        malformed input is surfaced instead of silently kept as text.
    """
    # rename() returns a new frame, so the assignment below never touches the caller's data.
    df = df.rename(columns={'Unnamed: 7': 'OT'})
    # A bare 'OT' means a single overtime period; afterwards strip the 'OT' suffix.
    periods = (df['OT'].replace('OT', '1OT')
                       .replace(r'(\d)OT', r'\1', regex=True))
    # to_numeric turns the digit strings into numbers and missing entries into NaN.
    df['OT'] = pd.to_numeric(periods).fillna(0).astype(int)
    return df

def transform_dates(df):
    """Return a copy of ``df`` whose ``'Date'`` column is parsed to datetimes.

    The input frame is left untouched: the parsed column is attached with
    ``assign`` rather than written into the caller's frame, so calling the
    function on a raw frame does not change that frame as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Date'`` column that ``pandas.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order.
    """
    return df.assign(Date=pd.to_datetime(df['Date']))

def attendance_to_int(df):
    """Rename ``'Attend.'`` to ``'Attendance'`` and cast that column to ``int``.

    Returns a new frame; the input is not modified. The cast raises if the
    column holds values that cannot be converted to ``int`` (e.g. missing
    values).
    """
    renamed = df.rename(columns={'Attend.': 'Attendance'})
    return renamed.astype({'Attendance': int})

def log_to_minutes(df):
    """Convert the ``'LOG'`` column from ``'H:MM'`` text to total minutes.

    Differences from a naive character-slice conversion:

    * the hour part may have any number of digits (``'10:30'`` -> 630);
    * entries that are missing or do not look like ``H:MM`` (including the
      text ``'nan'``) become ``<NA>`` instead of raising;
    * the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'LOG'`` column.

    Returns
    -------
    pandas.DataFrame
        New frame whose ``'LOG'`` column is a nullable ``Int64`` number of
        minutes.
    """
    # Anchored pattern: group 0 is the hour count, group 1 the two-digit minutes.
    parts = df['LOG'].astype(str).str.extract(r'^\s*(\d+):(\d{2})\s*$')
    hours = pd.to_numeric(parts[0], errors='coerce')
    minutes = pd.to_numeric(parts[1], errors='coerce')
    # Nullable Int64 keeps whole-minute values while still allowing missing entries.
    total_minutes = (hours * 60 + minutes).astype('Int64')
    return df.assign(LOG=total_minutes)
 
def preprocess(df):
    """Run the standard cleaning steps on one raw monthly schedule frame.

    Applies, in order, ``drop_box_score``, ``rename_ot``, ``transform_dates``
    and ``attendance_to_int``, feeding each step's result into the next, and
    returns the final frame.
    """
    # log_to_minutes is deliberately not applied: LOG column contains nan values
    cleaning_steps = (drop_box_score, rename_ot, transform_dates, attendance_to_int)
    cleaned = df
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned

In [65]:
# Load October and bring it in line with the format the rest of the notebook uses:
# slash-style team column names and colon-separated times ('730p' -> '7:30p', '204' -> '2:04').
october = (
    pd.read_csv('2024-reg-season/october-schedule.txt')
    .rename(columns={'VisitorNeutral': 'Visitor/Neutral', 'HomeNeutral': 'Home/Neutral'})
    .assign(**{
        'Start (ET)': lambda raw: raw['Start (ET)'].str.replace(
            r'(\d{1,2})(\d{2})(.)', r'\1:\2\3', regex=True
        ),
        # LOG is cast to text first so the regex can insert the colon.
        'LOG': lambda raw: raw['LOG'].astype(str).str.replace(
            r'(\d)(\d{2})', r'\1:\2', regex=True
        ),
    })
)

In [66]:
# October was already loaded (and patched) above, so it only needs cleaning here.
october = preprocess(october)

# The remaining months are read from disk and cleaned in calendar order.
november, december, january, february, march, april = [
    preprocess(pd.read_csv(month_path))
    for month_path in (
        '2024-reg-season/november-schedule.txt',
        '2024-reg-season/december-schedule.txt',
        '2024-reg-season/january-schedule.txt',
        '2024-reg-season/february-schedule.txt',
        '2024-reg-season/march-schedule.txt',
        '2024-reg-season/april-schedule.txt',
    )
]

In [67]:
# Stack the monthly frames in calendar order with a single concat.
# One call avoids re-copying the growing frame on every iteration and avoids
# concatenating onto an empty DataFrame, which can affect the resulting dtypes.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
monthly_frames = [october, november, december, january, february, march, april]
schedule = pd.concat(monthly_frames, ignore_index=True)
schedule.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,OT,Attendance,LOG,Arena,Notes
0,2024-10-22,7:30p,New York Knicks,109,Boston Celtics,132,0,19156,2:04,TD Garden,
1,2024-10-22,10:00p,Minnesota Timberwolves,103,Los Angeles Lakers,110,0,18997,2:26,Crypto.com Arena,
2,2024-10-23,7:00p,Indiana Pacers,115,Detroit Pistons,109,0,20062,2:23,Little Caesars Arena,
3,2024-10-23,7:30p,Brooklyn Nets,116,Atlanta Hawks,120,0,17548,2:33,State Farm Arena,
4,2024-10-23,7:30p,Orlando Magic,116,Miami Heat,97,0,19630,2:31,Kaseya Center,


In [32]:
# Lookup from the full team name, as it appears in the schedule's
# 'Visitor/Neutral' and 'Home/Neutral' columns, to a three-letter abbreviation.
team_abbrev = {
    "Atlanta Hawks": "ATL",
    "Boston Celtics": "BOS",
    "Brooklyn Nets": "BKN",
    "Charlotte Hornets": "CHA",
    "Chicago Bulls": "CHI",
    "Cleveland Cavaliers": "CLE",
    "Dallas Mavericks": "DAL",
    "Denver Nuggets": "DEN",
    "Detroit Pistons": "DET",
    "Golden State Warriors": "GSW",
    "Houston Rockets": "HOU",
    "Indiana Pacers": "IND",
    "Los Angeles Clippers": "LAC",
    "Los Angeles Lakers": "LAL",
    "Memphis Grizzlies": "MEM",
    "Miami Heat": "MIA",
    "Milwaukee Bucks": "MIL",
    "Minnesota Timberwolves": "MIN",
    "New Orleans Pelicans": "NOP",
    "New York Knicks": "NYK",
    "Oklahoma City Thunder": "OKC",
    "Orlando Magic": "ORL",
    "Philadelphia 76ers": "PHI",
    "Phoenix Suns": "PHX",
    "Portland Trail Blazers": "POR",
    "Sacramento Kings": "SAC",
    "San Antonio Spurs": "SAS",
    "Toronto Raptors": "TOR",
    "Utah Jazz": "UTA",
    "Washington Wizards": "WAS"
}

In [33]:
# Swap full team names for their abbreviations in both team columns.
# Values without an entry in team_abbrev are left as they are.
for side in ('Visitor/Neutral', 'Home/Neutral'):
    schedule[side] = schedule[side].replace(team_abbrev)
schedule.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,OT,Attendance,LOG,Arena,Notes
0,2024-10-22,7:30p,NYK,109,BOS,132,0,19156,2:04,TD Garden,
1,2024-10-22,10:00p,MIN,103,LAL,110,0,18997,2:26,Crypto.com Arena,
2,2024-10-23,7:00p,IND,115,DET,109,0,20062,2:23,Little Caesars Arena,
3,2024-10-23,7:30p,BKN,116,ATL,120,0,17548,2:33,State Farm Arena,
4,2024-10-23,7:30p,ORL,116,MIA,97,0,19630,2:31,Kaseya Center,


In [34]:
# Distinct teams, in order of first appearance in the home column.
home_sides = schedule['Home/Neutral']
teams = home_sides.drop_duplicates().tolist()
len(teams)

30

In [37]:
# One sub-schedule per team: every game in which the team appears
# in either the visitor or the home column.
matchup_cols = ['Visitor/Neutral', 'Home/Neutral']
team_schedules = {
    club: schedule[schedule[matchup_cols].eq(club).any(axis=1)]
    for club in teams
}
team_schedules['NYK']

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,OT,Attendance,LOG,Arena,Notes
0,2024-10-22,7:30p,NYK,109,BOS,132,0,19156,2:04,TD Garden,
20,2024-10-25,7:30p,IND,98,NYK,123,0,19812,2:21,Madison Square Garden (IV),
46,2024-10-28,7:30p,CLE,110,NYK,104,0,19812,2:23,Madison Square Garden (IV),
61,2024-10-30,7:30p,NYK,116,MIA,107,0,19620,2:10,Kaseya Center,
73,2024-11-01,7:00p,NYK,128,DET,98,0,17022,1:55,Little Caesars Arena,
...,...,...,...,...,...,...,...,...,...,...,...
1170,2025-04-06,7:00p,PHX,98,NYK,112,0,19812,2:14,Madison Square Garden (IV),
1181,2025-04-08,7:30p,BOS,119,NYK,117,1,19812,2:42,Madison Square Garden (IV),
1196,2025-04-10,7:00p,NYK,106,DET,115,0,20062,2:16,Little Caesars Arena,
1205,2025-04-11,7:30p,CLE,108,NYK,102,0,19812,2:18,Madison Square Garden (IV),


In [42]:
# Mean gap between the dates of consecutive games, per team.
# diff() works on row order, so each team's schedule is taken in the order stored.
avg_rest_between_games = {
    club: games['Date'].diff().mean()
    for club, games in team_schedules.items()
}
avg_rest_between_games

{'BOS': Timedelta('2 days 03:15:33.333333333'),
 'LAL': Timedelta('2 days 03:15:33.333333333'),
 'DET': Timedelta('2 days 02:57:46.666666666'),
 'ATL': Timedelta('2 days 02:57:46.666666666'),
 'MIA': Timedelta('2 days 02:57:46.666666666'),
 'PHI': Timedelta('2 days 02:57:46.666666666'),
 'TOR': Timedelta('2 days 02:57:46.666666666'),
 'HOU': Timedelta('2 days 02:57:46.666666666'),
 'NOP': Timedelta('2 days 02:57:46.666666666'),
 'UTA': Timedelta('2 days 02:57:46.666666666'),
 'LAC': Timedelta('2 days 02:57:46.666666666'),
 'POR': Timedelta('2 days 02:57:46.666666666'),
 'WAS': Timedelta('2 days 02:40:00'),
 'DAL': Timedelta('2 days 02:40:00'),
 'SAC': Timedelta('2 days 02:40:00'),
 'DEN': Timedelta('2 days 02:40:00'),
 'ORL': Timedelta('2 days 02:57:46.666666666'),
 'CLE': Timedelta('2 days 02:57:46.666666666'),
 'NYK': Timedelta('2 days 03:15:33.333333333'),
 'MIL': Timedelta('2 days 02:20:29.268292682'),
 'CHA': Timedelta('2 days 02:57:46.666666666'),
 'MEM': Timedelta('2 days 02:57:

In [43]:
# Mean attendance over every game in each team's schedule.
# team_schedules holds both home and away games, so this is not a home-only figure.
avg_attendances = {
    club: games['Attendance'].mean()
    for club, games in team_schedules.items()
}
avg_attendances

{'BOS': np.float64(18838.475609756097),
 'LAL': np.float64(18759.426829268294),
 'DET': np.float64(18543.817073170732),
 'ATL': np.float64(17284.560975609755),
 'MIA': np.float64(18958.09756097561),
 'PHI': np.float64(18858.89024390244),
 'TOR': np.float64(18387.98780487805),
 'HOU': np.float64(17745.01219512195),
 'NOP': np.float64(17384.060975609755),
 'UTA': np.float64(17936.451219512193),
 'LAC': np.float64(17389.256097560974),
 'POR': np.float64(17676.085365853658),
 'WAS': np.float64(17161.90243902439),
 'DAL': np.float64(19052.231707317074),
 'SAC': np.float64(17459.939024390245),
 'DEN': np.float64(18939.682926829268),
 'ORL': np.float64(18296.853658536584),
 'CLE': np.float64(18936.939024390245),
 'NYK': np.float64(19132.70731707317),
 'MIL': np.float64(17889.67469879518),
 'CHA': np.float64(17741.60975609756),
 'MEM': np.float64(17395.89024390244),
 'MIN': np.float64(18497.158536585364),
 'CHI': np.float64(19182.719512195123),
 'SAS': np.float64(17988.073170731706),
 'PHX': n

In [47]:
# 'PTS' is the score column that follows 'Visitor/Neutral', i.e. the visiting
# team's points; label the axis accordingly so the figure can be read on its own.
px.scatter(
    schedule,
    x='Attendance',
    y='PTS',
    title='Visitor points vs. game attendance',
    labels={'Attendance': 'Attendance', 'PTS': 'Visitor points'},
)