# Feature Engineering
We're going to get last-15-game means for each player and we're going to derive mins_share and mins_proj features.<br>

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os
import requests
import sqlite3
import re
import matplotlib.pyplot as plt

# Get the parent directory where config.py is located
#sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

API_KEY = None
API_HOST = None

# if API_KEY is None or API_HOST is None:
#     raise ValueError("API_KEY and API_HOST must be provided when running via Papermill.")

In [2]:
#Which site are we playing?
site = input("Do you want to play FanDuel or DraftKings?").lower()
if site == 'fanduel':
    positions = ['PG', 'SG', 'SF', 'PF', 'C']
else:
    positions = ['PG', 'SG', 'SF', 'PF', 'C', 'G', 'F', 'UTIL']

Do you want to play FanDuel or DraftKings? fanduel


In [6]:
from config import API_KEY, API_HOST

headers = {
    "x-rapidapi-key": API_KEY,
    "x-rapidapi-host": API_HOST
}

In [7]:
today = (datetime.now()).strftime('%Y%m%d')

In [8]:
# Read data from the database
conn = sqlite3.connect("../nba_dfs_model.db")
main_df = pd.read_sql_query("SELECT * FROM game_stats", conn)
conn.close()

# Rename 'PF' to 'fouls' to avoid conflicts with the Power Forward position later
main_df = main_df.rename(columns={'PF': 'fouls'})

# conn = sqlite3.connect("../nba_dfs_model.db")
# main_df = pd.read_sql_query("SELECT * FROM game_stats", conn)
# conn.close()
# main_df.head()

In [9]:
len(main_df)

53262

In [10]:
# Check for duplicates in the game_id and player_id combination
duplicates = main_df.duplicated(subset=["game_id", "player_id"])

# Filter the duplicates
duplicate_rows = main_df[duplicates]


In [11]:
duplicate_rows

Unnamed: 0,longName,game_id,player_id,team_id,team,teamAbv,fga,ast,tptfgm,fgm,...,ftm,blk,DefReb,plusMinus,stl,pts,fouls,TOV,usage,mins


In [12]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53262 entries, 0 to 53261
Data columns (total 23 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   longName   53262 non-null  object 
 1   game_id    53262 non-null  object 
 2   player_id  53262 non-null  object 
 3   team_id    53262 non-null  object 
 4   team       53262 non-null  object 
 5   teamAbv    53262 non-null  object 
 6   fga        53262 non-null  object 
 7   ast        53262 non-null  object 
 8   tptfgm     53262 non-null  object 
 9   fgm        53262 non-null  object 
 10  fta        53262 non-null  object 
 11  tptfga     53262 non-null  object 
 12  OffReb     53262 non-null  object 
 13  ftm        53262 non-null  object 
 14  blk        53262 non-null  object 
 15  DefReb     53262 non-null  object 
 16  plusMinus  53262 non-null  object 
 17  stl        53262 non-null  object 
 18  pts        53262 non-null  object 
 19  fouls      53262 non-null  object 
 20  TOV   

In [13]:
#Deriving a date column, and then filtering out anything before Jan. 1, 2024
main_df['date'] = pd.to_datetime(main_df['game_id'].str[:8])

In [14]:
main_df = main_df[main_df['date'] >= '2024-01-01']

In [15]:
main_df.columns

Index(['longName', 'game_id', 'player_id', 'team_id', 'team', 'teamAbv', 'fga',
       'ast', 'tptfgm', 'fgm', 'fta', 'tptfga', 'OffReb', 'ftm', 'blk',
       'DefReb', 'plusMinus', 'stl', 'pts', 'fouls', 'TOV', 'usage', 'mins',
       'date'],
      dtype='object')

In [16]:
#Establishing numeric columns
num_cols = ['fga', 'ast', 'tptfgm', 'fgm', 'fta', 'tptfga', 'OffReb', 'ftm', 'blk', 'DefReb', 'plusMinus', 'stl', 'pts', 'fouls', 'TOV', 'usage', 'mins']

In [17]:
main_df[num_cols] = main_df[num_cols].apply(pd.to_numeric, errors = 'coerce')

In [18]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34592 entries, 18670 to 53261
Data columns (total 24 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   longName   34592 non-null  object        
 1   game_id    34592 non-null  object        
 2   player_id  34592 non-null  object        
 3   team_id    34592 non-null  object        
 4   team       34592 non-null  object        
 5   teamAbv    34592 non-null  object        
 6   fga        34592 non-null  int64         
 7   ast        34592 non-null  int64         
 8   tptfgm     34592 non-null  int64         
 9   fgm        34592 non-null  int64         
 10  fta        34592 non-null  int64         
 11  tptfga     34592 non-null  int64         
 12  OffReb     34592 non-null  int64         
 13  ftm        34592 non-null  int64         
 14  blk        34592 non-null  int64         
 15  DefReb     34592 non-null  int64         
 16  plusMinus  34592 non-null  int64         

In [19]:
#Sorting by player_id, then date, then game
main_df_sorted = main_df.sort_values(by = ['player_id', 'date', 'game_id']).reset_index(drop = True)

In [20]:
main_df_sorted.head()

Unnamed: 0,longName,game_id,player_id,team_id,team,teamAbv,fga,ast,tptfgm,fgm,...,blk,DefReb,plusMinus,stl,pts,fouls,TOV,usage,mins,date
0,Brook Lopez,20240101_IND@MIL,28006619932,17,MIL,MIL,17,1,2,8,...,4,4,6,0,21,3,0,20.53,38,2024-01-01
1,Brook Lopez,20240103_MIL@IND,28006619932,17,MIL,MIL,11,1,2,6,...,0,2,-22,0,16,3,2,20.62,28,2024-01-03
2,Brook Lopez,20240104_MIL@SA,28006619932,17,MIL,MIL,10,1,1,4,...,3,4,8,0,9,4,4,17.81,33,2024-01-04
3,Brook Lopez,20240106_MIL@HOU,28006619932,17,MIL,MIL,9,0,1,3,...,1,5,11,1,7,2,0,13.21,31,2024-01-06
4,Brook Lopez,20240108_UTA@MIL,28006619932,17,MIL,MIL,11,3,3,5,...,2,5,-9,0,13,2,0,12.48,38,2024-01-08


In [21]:
main_df_sorted.tail()

Unnamed: 0,longName,game_id,player_id,team_id,team,teamAbv,fga,ast,tptfgm,fgm,...,blk,DefReb,plusMinus,stl,pts,fouls,TOV,usage,mins,date
34587,Matthew Hurt,20240306_MEM@PHI,94994462027,15,MEM,MEM,2,0,0,0,...,0,0,-3,0,0,0,0,13.78,6,2024-03-06
34588,Matthew Hurt,20240308_ATL@MEM,94994462027,15,MEM,MEM,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,2024-03-08
34589,Matthew Hurt,20240318_MEM@SAC,94994462027,15,MEM,MEM,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,2024-03-18
34590,Matthew Hurt,20240320_MEM@GS,94994462027,15,MEM,MEM,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,2024-03-20
34591,Matthew Hurt,20240322_MEM@SA,94994462027,15,MEM,MEM,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,2024-03-22


In [22]:
len(main_df_sorted)

34592

In [23]:
#api_names = main_df_sorted[['longName', 'player_id', 'team', 'date']]
# Keep only rows that are not duplicated across the specified columns
# We need to do this because there are some data points that are duplicated,
# possibly the NBA Cup games
main_df_sorted = main_df_sorted[~main_df_sorted.duplicated(subset=['longName', 'player_id', 'team', 'date'], keep='first')]

In [24]:
len(main_df_sorted) #Let's check and see if this always takes 26 rows off dataset

34566

# Getting DFS information
This cell calls the API to get daily fantasy salaries and all eligible positions for each player, depending on the site we're playing. This eliminates the need for name matching.

In [25]:
heads = ['prim_pos', 'pos', 'salary', 'longName', 'player_id', 'team_id', 'team']
players = []
url = f"https://tank01-fantasy-stats.p.rapidapi.com/getNBADFS?date={today}"

headers = {
    "x-rapidapi-key": API_KEY,
    "x-rapidapi-host": API_HOST
}

try:
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise an error for HTTP codes 4xx/5xx
    result = response.json()

    if 'body' in result and result['body']:
        for player in result['body'][site]:
            players.append([player['pos'], player['allValidPositions'], player['salary'], player['longName'],\
                            player['playerID'], player['teamID'], player['team']])
    else:
        no_game_dates.append(current_date)  # Track dates with no games
except requests.exceptions.RequestException as e:
    print(f"Error fetching data for {current_date}: {e}")
except KeyError:
    print(f"Unexpected response format for {current_date}: {result}")

In [26]:
#need teamID, team, salary, playerID, longNAme, allValidPositions

In [27]:
#Starting a df for the data points that will be used to predict DFS points for the current day
today_df = pd.DataFrame(players)

In [28]:
today_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,C,[C],12900,Nikola Jokic,28908111729,8,DEN
1,C,"[PF, C]",12000,Victor Wembanyama,943740414489,27,SA
2,SG,[PG],11600,Shai Gilgeous-Alexander,28778646789,21,OKC
3,C,"[C, PF]",11000,Anthony Davis,28368759882,7,DAL
4,SF,"[PF, SF]",10700,LeBron James,2871601440,14,LAL


In [29]:
today_df.columns = heads

In [30]:
for position in positions:
    today_df[position] = 0

In [31]:
#Binaries for position columns
today_df.head()

Unnamed: 0,prim_pos,pos,salary,longName,player_id,team_id,team,PG,SG,SF,PF,C
0,C,[C],12900,Nikola Jokic,28908111729,8,DEN,0,0,0,0,0
1,C,"[PF, C]",12000,Victor Wembanyama,943740414489,27,SA,0,0,0,0,0
2,SG,[PG],11600,Shai Gilgeous-Alexander,28778646789,21,OKC,0,0,0,0,0
3,C,"[C, PF]",11000,Anthony Davis,28368759882,7,DAL,0,0,0,0,0
4,SF,"[PF, SF]",10700,LeBron James,2871601440,14,LAL,0,0,0,0,0


In [32]:
for position in positions:
    today_df[position] = today_df['pos'].apply(lambda x: True if position in x else False)

In [33]:
today_df.head()

Unnamed: 0,prim_pos,pos,salary,longName,player_id,team_id,team,PG,SG,SF,PF,C
0,C,[C],12900,Nikola Jokic,28908111729,8,DEN,False,False,False,False,True
1,C,"[PF, C]",12000,Victor Wembanyama,943740414489,27,SA,False,False,False,True,True
2,SG,[PG],11600,Shai Gilgeous-Alexander,28778646789,21,OKC,True,False,False,False,False
3,C,"[C, PF]",11000,Anthony Davis,28368759882,7,DAL,False,False,False,True,True
4,SF,"[PF, SF]",10700,LeBron James,2871601440,14,LAL,False,False,True,True,False


In [34]:
if site == 'draftkings':
    today_df['G'] =  np.where(today_df['PG'] + today_df['SG'] > 0, True, False)
    today_df['F'] =  np.where(today_df['SF'] + today_df['PF'] > 0, True, False)
    today_df['UTIL'] = True

In [35]:
today_df.head(10)

Unnamed: 0,prim_pos,pos,salary,longName,player_id,team_id,team,PG,SG,SF,PF,C
0,C,[C],12900,Nikola Jokic,28908111729,8,DEN,False,False,False,False,True
1,C,"[PF, C]",12000,Victor Wembanyama,943740414489,27,SA,False,False,False,True,True
2,SG,[PG],11600,Shai Gilgeous-Alexander,28778646789,21,OKC,True,False,False,False,False
3,C,"[C, PF]",11000,Anthony Davis,28368759882,7,DAL,False,False,False,True,True
4,SF,"[PF, SF]",10700,LeBron James,2871601440,14,LAL,False,False,True,True,False
5,PG,[PG],10500,Luka Doncic,28398804489,14,LAL,True,False,False,False,False
6,PF,"[PF, SF]",10000,Jayson Tatum,28628646399,2,BOS,False,False,True,True,False
7,SG,"[SG, SF]",9900,Anthony Edwards,94344202027,18,MIN,False,True,True,False,False
8,C,"[C, PF]",9800,Domantas Sabonis,28118309129,26,SAC,False,False,False,True,True
9,PG,[PG],9800,Trae Young,28978646789,1,ATL,True,False,False,False,False


In [36]:
today_df = today_df.drop(columns = ['pos'])

In [37]:
today_df['date'] = pd.to_datetime(today)

In [38]:
today_df.head()

Unnamed: 0,prim_pos,salary,longName,player_id,team_id,team,PG,SG,SF,PF,C,date
0,C,12900,Nikola Jokic,28908111729,8,DEN,False,False,False,False,True,2025-02-08
1,C,12000,Victor Wembanyama,943740414489,27,SA,False,False,False,True,True,2025-02-08
2,SG,11600,Shai Gilgeous-Alexander,28778646789,21,OKC,True,False,False,False,False,2025-02-08
3,C,11000,Anthony Davis,28368759882,7,DAL,False,False,False,True,True,2025-02-08
4,SF,10700,LeBron James,2871601440,14,LAL,False,False,True,True,False,2025-02-08


In [39]:
today_df['game_id'] = ''

In [40]:
#Getting current day's game_ids
game_ids = []
no_game_dates = []

        
url = f"https://tank01-fantasy-stats.p.rapidapi.com/getNBAGamesForDate?gameDate={today}"

headers = {
    "x-rapidapi-key": API_KEY,
    "x-rapidapi-host": API_HOST
}

try:
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise an error for HTTP codes 4xx/5xx
    result = response.json()
    
    if 'body' in result and result['body']:
        for game in result['body']:
            game_ids.append(game['gameID'])
    else:
        no_game_dates.append(current_date)  # Track dates with no games
except requests.exceptions.RequestException as e:
    print(f"Error fetching data for {current_date}: {e}")
except KeyError:
    print(f"Unexpected response format for {current_date}: {result}")

In [41]:
game_ids

['20250208_GS@CHI',
 '20250208_IND@LAL',
 '20250208_BOS@NY',
 '20250208_SA@ORL',
 '20250208_OKC@MEM',
 '20250208_DEN@PHO',
 '20250208_UTA@LAC',
 '20250208_NO@SAC',
 '20250208_HOU@DAL',
 '20250208_POR@MIN',
 '20250208_ATL@WAS']

In [42]:
# We need a check to make sure the game_ids are right
# One night we had WAS playing two games

In [43]:
from collections import Counter

teams = []
for id in game_ids:
    matchup = id[9:]
    both_teams = matchup.split('@')
    teams.append(both_teams[0])
    teams.append(both_teams[1])

if len(teams) != len(set(teams)):
    counter = Counter(teams)
    duplicates = [item for item, count in counter.items() if count > 1]
    print(f"These teams are in multiple game_ids: {duplicates}")
    

In [44]:
def fill_game_id(row, game_ids):
    for game_id in game_ids:
        if row['team'] in game_id:
            return game_id  # Return the matching game_id
    return row['game_id']  # If no match, return the original value

In [45]:
today_df['game_id'] = today_df.apply(
    lambda row: fill_game_id(row, game_ids), axis=1
)

In [46]:
today_df.head()

Unnamed: 0,prim_pos,salary,longName,player_id,team_id,team,PG,SG,SF,PF,C,date,game_id
0,C,12900,Nikola Jokic,28908111729,8,DEN,False,False,False,False,True,2025-02-08,20250208_DEN@PHO
1,C,12000,Victor Wembanyama,943740414489,27,SA,False,False,False,True,True,2025-02-08,20250208_SA@ORL
2,SG,11600,Shai Gilgeous-Alexander,28778646789,21,OKC,True,False,False,False,False,2025-02-08,20250208_OKC@MEM
3,C,11000,Anthony Davis,28368759882,7,DAL,False,False,False,True,True,2025-02-08,20250208_HOU@DAL
4,SF,10700,LeBron James,2871601440,14,LAL,False,False,True,True,False,2025-02-08,20250208_IND@LAL


In [47]:
today_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   prim_pos   375 non-null    object        
 1   salary     375 non-null    object        
 2   longName   375 non-null    object        
 3   player_id  375 non-null    object        
 4   team_id    375 non-null    object        
 5   team       375 non-null    object        
 6   PG         375 non-null    bool          
 7   SG         375 non-null    bool          
 8   SF         375 non-null    bool          
 9   PF         375 non-null    bool          
 10  C          375 non-null    bool          
 11  date       375 non-null    datetime64[ns]
 12  game_id    375 non-null    object        
dtypes: bool(5), datetime64[ns](1), object(7)
memory usage: 25.4+ KB


In [48]:
rows_with_missing_values = main_df_sorted[main_df_sorted[num_cols].isnull().any(axis=1)]

In [49]:
rows_with_missing_values

Unnamed: 0,longName,game_id,player_id,team_id,team,teamAbv,fga,ast,tptfgm,fgm,...,blk,DefReb,plusMinus,stl,pts,fouls,TOV,usage,mins,date


In [51]:
#Concatenating past day with current day's data points, then sorting again.
main_df_sorted = pd.concat([main_df_sorted, today_df], ignore_index=True)

In [52]:
main_df_sorted = main_df_sorted.sort_values(['player_id', 'date', 'game_id']).reset_index(drop=True)

In [53]:
rows_with_missing_values = main_df_sorted[main_df_sorted[num_cols].isnull().any(axis=1)]

In [54]:
len(rows_with_missing_values)

375

In [55]:
rows_with_missing_values.head()

Unnamed: 0,longName,game_id,player_id,team_id,team,teamAbv,fga,ast,tptfgm,fgm,...,usage,mins,date,prim_pos,salary,PG,SG,SF,PF,C
181,Dejounte Murray,20250208_NO@SAC,28008317499,19,NO,,,,,,...,,,2025-02-08,PG,3500,True,False,False,False,False
246,Shake Milton,20250208_IND@LAL,28008397499,14,LAL,,,,,,...,,,2025-02-08,SG,3700,True,False,False,False,False
340,Steven Adams,20250208_HOU@DAL,28018735349,11,HOU,,,,,,...,,,2025-02-08,C,4300,False,False,False,False,True
662,Gary Payton II,20250208_GS@CHI,28038983399,10,GS,,,,,,...,,,2025-02-08,PG,3700,True,True,False,False,False
772,Buddy Hield,20250208_GS@CHI,28038998249,10,GS,,,,,,...,,,2025-02-08,SF,4800,True,True,False,False,False


In [56]:
# Calculate the rolling mean for the last 15 games
for col in num_cols:
    main_df_sorted[col] = (
        main_df_sorted.groupby('player_id')[col]
        .apply(lambda x: x.shift(1).rolling(window=15, min_periods=1).mean()).reset_index(drop = True)
    )

In [57]:
main_df_sorted = main_df_sorted.sort_values(by = ['player_id', 'date', 'game_id']).reset_index(drop = True)

In [58]:
main_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34941 entries, 0 to 34940
Data columns (total 31 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   longName   34941 non-null  object        
 1   game_id    34941 non-null  object        
 2   player_id  34941 non-null  object        
 3   team_id    34941 non-null  object        
 4   team       34941 non-null  object        
 5   teamAbv    34566 non-null  object        
 6   fga        34268 non-null  float64       
 7   ast        34268 non-null  float64       
 8   tptfgm     34268 non-null  float64       
 9   fgm        34268 non-null  float64       
 10  fta        34268 non-null  float64       
 11  tptfga     34268 non-null  float64       
 12  OffReb     34268 non-null  float64       
 13  ftm        34268 non-null  float64       
 14  blk        34268 non-null  float64       
 15  DefReb     34268 non-null  float64       
 16  plusMinus  34268 non-null  float64      

In [60]:
rows_with_missing_values = main_df_sorted[main_df_sorted[num_cols].isnull().any(axis=1)]

In [61]:
len(rows_with_missing_values)

673

In [62]:
#No one player should have more than one set of missing values
value_counts = rows_with_missing_values['longName'].value_counts()

In [63]:
if not (value_counts == 1).all():
    print("Some values in 'longName' appear more than once.")
    # Optionally print the offending values
    print("Offending values:\n", value_counts[value_counts > 1])

In [64]:
# Identify the count of each player_id
player_counts = main_df_sorted['player_id'].value_counts()

# Separate rows based on player_id counts
single_occurrence = main_df_sorted['player_id'].isin(player_counts[player_counts == 1].index)

# Fill missing values for rows where player_id occurs only once
numeric_min = main_df_sorted.select_dtypes(include='number').min()
main_df_sorted.loc[single_occurrence] = main_df_sorted.loc[single_occurrence].fillna(numeric_min)

# Backfill missing values for all other rows
main_df_sorted = main_df_sorted.fillna(method='bfill')


  main_df_sorted = main_df_sorted.fillna(method='bfill')


In [65]:
main_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34941 entries, 0 to 34940
Data columns (total 31 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   longName   34941 non-null  object        
 1   game_id    34941 non-null  object        
 2   player_id  34941 non-null  object        
 3   team_id    34941 non-null  object        
 4   team       34941 non-null  object        
 5   teamAbv    34941 non-null  object        
 6   fga        34941 non-null  float64       
 7   ast        34941 non-null  float64       
 8   tptfgm     34941 non-null  float64       
 9   fgm        34941 non-null  float64       
 10  fta        34941 non-null  float64       
 11  tptfga     34941 non-null  float64       
 12  OffReb     34941 non-null  float64       
 13  ftm        34941 non-null  float64       
 14  blk        34941 non-null  float64       
 15  DefReb     34941 non-null  float64       
 16  plusMinus  34941 non-null  float64      

In [66]:
# main_df_sorted = main_df_sorted[main_df_sorted['date'] == today]

In [67]:
main_df_sorted.columns

Index(['longName', 'game_id', 'player_id', 'team_id', 'team', 'teamAbv', 'fga',
       'ast', 'tptfgm', 'fgm', 'fta', 'tptfga', 'OffReb', 'ftm', 'blk',
       'DefReb', 'plusMinus', 'stl', 'pts', 'fouls', 'TOV', 'usage', 'mins',
       'date', 'prim_pos', 'salary', 'PG', 'SG', 'SF', 'PF', 'C'],
      dtype='object')

In [68]:
main_df_sorted = main_df_sorted.sort_values(by = ['player_id', 'date', 'game_id']).reset_index(drop = True)

# Total minutes
Since a team's minutes will total 240 in a game that doesn't go into overtime, we want to see how close the minutes variables come to 240 when grouped by team and game.

In [69]:
# Group by 'team' and 'game_id', and sum the 'mins' variable for each group
team_game_mins = main_df_sorted.groupby(['team', 'game_id'])['mins'].sum().reset_index()

In [70]:
team_game_mins['mins'].describe()

count    3234.000000
mean      239.782124
std        29.404289
min        87.283333
25%       222.855470
50%       240.195833
75%       257.312121
max       397.911111
Name: mins, dtype: float64

In [71]:
team_game_mins = team_game_mins.rename(columns = {'mins': 'total_mins'})

In [72]:
team_game_mins.head()

Unnamed: 0,team,game_id,total_mins
0,ATL,20240103_OKC@ATL,240.0
1,ATL,20240105_ATL@IND,261.0
2,ATL,20240107_ATL@ORL,223.5
3,ATL,20240110_PHI@ATL,216.0
4,ATL,20240112_IND@ATL,271.833333


We'll merge this with main_df_sorted and then derive mins_share and mins_proj variables.

In [73]:
main_df_sorted = pd.merge(main_df_sorted, team_game_mins, on = ['team', 'game_id'], how = 'left')

In [74]:
main_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34941 entries, 0 to 34940
Data columns (total 32 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   longName    34941 non-null  object        
 1   game_id     34941 non-null  object        
 2   player_id   34941 non-null  object        
 3   team_id     34941 non-null  object        
 4   team        34941 non-null  object        
 5   teamAbv     34941 non-null  object        
 6   fga         34941 non-null  float64       
 7   ast         34941 non-null  float64       
 8   tptfgm      34941 non-null  float64       
 9   fgm         34941 non-null  float64       
 10  fta         34941 non-null  float64       
 11  tptfga      34941 non-null  float64       
 12  OffReb      34941 non-null  float64       
 13  ftm         34941 non-null  float64       
 14  blk         34941 non-null  float64       
 15  DefReb      34941 non-null  float64       
 16  plusMinus   34941 non-

# mins_share variable
We have a few players projected to play well over 48 minutes. That's because the mins_share variable adds up to 240 for each team in each game. In regulation, each team has five players on the floor for 48 minutes. If we don't have enough players on a team where their L15 average minutes add up to 240, we end up with some players with a min_proj variable over 48 to make up for it.

In [75]:
main_df_sorted['mins_share'] = main_df_sorted['mins']/main_df_sorted['total_mins']

In [76]:
main_df_sorted['mins_share'].describe()

count    34941.000000
mean         0.092556
std          0.041825
min          0.000000
25%          0.060975
50%          0.094340
75%          0.125650
max          0.288152
Name: mins_share, dtype: float64

In [77]:
main_df_sorted['mins_proj'] = main_df_sorted['mins_share'] * 240

In [78]:
team_game_mins_proj = main_df_sorted.groupby(['team', 'game_id'])['mins_proj'].sum().reset_index()

In [79]:
main_df_sorted['mins_proj'].describe()

count    34941.000000
mean        22.213446
std         10.037959
min          0.000000
25%         14.633946
50%         22.641509
75%         30.155979
max         69.156594
Name: mins_proj, dtype: float64

In [80]:
main_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34941 entries, 0 to 34940
Data columns (total 34 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   longName    34941 non-null  object        
 1   game_id     34941 non-null  object        
 2   player_id   34941 non-null  object        
 3   team_id     34941 non-null  object        
 4   team        34941 non-null  object        
 5   teamAbv     34941 non-null  object        
 6   fga         34941 non-null  float64       
 7   ast         34941 non-null  float64       
 8   tptfgm      34941 non-null  float64       
 9   fgm         34941 non-null  float64       
 10  fta         34941 non-null  float64       
 11  tptfga      34941 non-null  float64       
 12  OffReb      34941 non-null  float64       
 13  ftm         34941 non-null  float64       
 14  blk         34941 non-null  float64       
 15  DefReb      34941 non-null  float64       
 16  plusMinus   34941 non-

In [81]:
main_df_sorted.columns

Index(['longName', 'game_id', 'player_id', 'team_id', 'team', 'teamAbv', 'fga',
       'ast', 'tptfgm', 'fgm', 'fta', 'tptfga', 'OffReb', 'ftm', 'blk',
       'DefReb', 'plusMinus', 'stl', 'pts', 'fouls', 'TOV', 'usage', 'mins',
       'date', 'prim_pos', 'salary', 'PG', 'SG', 'SF', 'PF', 'C', 'total_mins',
       'mins_share', 'mins_proj'],
      dtype='object')

In [82]:
#This is the order of features in the model
model_order = ['longName', 'game_id', 'player_id', 'team_id', 'team', 'teamAbv',\
               'fga', 'ast', 'tptfgm', 'fgm', 'fta', 'tptfga', 'OffReb', 'ftm', 'blk',\
               'DefReb', 'plusMinus', 'stl', 'pts', 'fouls', 'TOV', 'usage', 'mins_share', 'mins', 'mins_proj', 'salary', 'date', 'prim_pos'] + positions

In [83]:
model_order

['longName',
 'game_id',
 'player_id',
 'team_id',
 'team',
 'teamAbv',
 'fga',
 'ast',
 'tptfgm',
 'fgm',
 'fta',
 'tptfga',
 'OffReb',
 'ftm',
 'blk',
 'DefReb',
 'plusMinus',
 'stl',
 'pts',
 'fouls',
 'TOV',
 'usage',
 'mins_share',
 'mins',
 'mins_proj',
 'salary',
 'date',
 'prim_pos',
 'PG',
 'SG',
 'SF',
 'PF',
 'C']

In [84]:
main_df_sorted = main_df_sorted[model_order]

In [85]:
main_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34941 entries, 0 to 34940
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   longName    34941 non-null  object        
 1   game_id     34941 non-null  object        
 2   player_id   34941 non-null  object        
 3   team_id     34941 non-null  object        
 4   team        34941 non-null  object        
 5   teamAbv     34941 non-null  object        
 6   fga         34941 non-null  float64       
 7   ast         34941 non-null  float64       
 8   tptfgm      34941 non-null  float64       
 9   fgm         34941 non-null  float64       
 10  fta         34941 non-null  float64       
 11  tptfga      34941 non-null  float64       
 12  OffReb      34941 non-null  float64       
 13  ftm         34941 non-null  float64       
 14  blk         34941 non-null  float64       
 15  DefReb      34941 non-null  float64       
 16  plusMinus   34941 non-

This is where we break off the previous data and filter the dataframe for today's data only.

In [86]:
main_df_sorted = main_df_sorted[main_df_sorted['date'] == today]

In [87]:
main_df_sorted[positions] = main_df_sorted[positions].astype(bool)

In [88]:
main_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Index: 375 entries, 181 to 34834
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   longName    375 non-null    object        
 1   game_id     375 non-null    object        
 2   player_id   375 non-null    object        
 3   team_id     375 non-null    object        
 4   team        375 non-null    object        
 5   teamAbv     375 non-null    object        
 6   fga         375 non-null    float64       
 7   ast         375 non-null    float64       
 8   tptfgm      375 non-null    float64       
 9   fgm         375 non-null    float64       
 10  fta         375 non-null    float64       
 11  tptfga      375 non-null    float64       
 12  OffReb      375 non-null    float64       
 13  ftm         375 non-null    float64       
 14  blk         375 non-null    float64       
 15  DefReb      375 non-null    float64       
 16  plusMinus   375 non-null   

In [89]:
#Writing to csv
main_df_sorted.to_csv('../notebooks/ready_for_injuries.csv', index = False)

In [90]:
main_df_sorted.head()

Unnamed: 0,longName,game_id,player_id,team_id,team,teamAbv,fga,ast,tptfgm,fgm,...,mins,mins_proj,salary,date,prim_pos,PG,SG,SF,PF,C
181,Dejounte Murray,20250208_NO@SAC,28008317499,19,NO,MIN,15.6,7.2,1.933333,6.6,...,31.133333,20.802673,3500,2025-02-08,PG,True,False,False,False,False
246,Shake Milton,20250208_IND@LAL,28008397499,14,LAL,ORL,3.866667,2.2,0.666667,1.8,...,14.466667,10.413917,3700,2025-02-08,SG,True,False,False,False,False
340,Steven Adams,20250208_HOU@DAL,28018735349,11,HOU,TOR,3.733333,1.133333,0.0,2.066667,...,15.533333,10.91122,4300,2025-02-08,C,False,False,False,False,True
662,Gary Payton II,20250208_GS@CHI,28038983399,10,GS,IND,4.333333,1.4,0.466667,2.333333,...,13.733333,10.965205,3700,2025-02-08,PG,True,True,False,False,False
772,Buddy Hield,20250208_GS@CHI,28038998249,10,GS,GS,9.933333,2.333333,2.266667,3.8,...,25.133333,20.067389,4800,2025-02-08,SF,True,True,False,False,False


In [267]:
# #import sqlite3

# # Assuming your updated DataFrame is called `df`

# # Connect to the SQLite database
# conn = sqlite3.connect("nba_dfs_model.db")

# # Step 1: Drop the existing table if it exists
# conn.execute("DROP TABLE IF EXISTS dataset_with_L15")

# # Step 2: Create the table with all columns from the new DataFrame
# conn.execute("""
# CREATE TABLE dataset_with_L15 (
#     id INTEGER PRIMARY KEY,
#     Name TEXT,
#     game_id TEXT,
#     player_id TEXT,
#     reb INTEGER,
#     team_id TEXT,
#     team TEXT,
#     fd_pts REAL,
#     dk_pts REAL,
#     away_team TEXT,
#     home_team TEXT,
#     away_spread REAL,
#     home_spread REAL,
#     over_under REAL,
#     home INTEGER,
#     opponent TEXT,
#     PG_FD REAL,
#     SG_FD REAL,
#     SF_FD REAL,
#     PF_FD REAL,
#     C_FD REAL,
#     PG_DK REAL,
#     SG_DK REAL,
#     SF_DK REAL,
#     PF_DK REAL,
#     C_DK REAL,
#     date TEXT,
#     PG_FD_DvP REAL,
#     SG_FD_DvP REAL,
#     SF_FD_DvP REAL,
#     PF_FD_DvP REAL,
#     C_FD_DvP REAL,
#     PG_DK_DvP REAL,
#     SG_DK_DvP REAL,
#     SF_DK_DvP REAL,
#     PF_DK_DvP REAL,
#     C_DK_DvP REAL,
#     days_rest REAL,
#     fga REAL,
#     ast REAL,
#     tptfgm REAL,
#     fgm REAL,
#     mins REAL,
#     fta REAL,
#     tptfga REAL,
#     OffReb REAL,
#     ftm REAL,
#     blk REAL,
#     DefReb REAL,
#     plusMinus REAL,
#     stl REAL,
#     pts REAL,
#     PF REAL,
#     TOV REAL,
#     usage REAL,
#     mins_48 REAL,
#     pace REAL,
#     opp_pace REAL,
#     pace_diff REAL
# )
# """)

# # Step 3: Insert the updated data into the new table
# main_df_sorted.to_sql("dataset_with_L15", conn, if_exists="append", index=False)

# # Close the connection
# conn.close()

# print("Table successfully replaced with the updated DataFrame!")


In [268]:
# # Reconnect to the database
# conn = sqlite3.connect('nba_dfs_model.db')

# # Check the table schema
# cursor = conn.cursor()
# cursor.execute("PRAGMA table_info(dataset_with_L15);")
# schema = cursor.fetchall()

# # Print the schema
# for column in schema:
#     print(column)

# # Close the connection
# conn.close()


In [269]:
# # Connect to your SQLite database
# conn = sqlite3.connect('nba_dfs_model.db')

# # Query to get the list of all table names
# cursor = conn.cursor()
# cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
# tables = cursor.fetchall()

# # Close the connection
# conn.close()

# # Print the list of table names
# table_names = [table[0] for table in tables]
# print("Tables in the database:", table_names)

In [270]:
# main_df_sorted.tail()

In [271]:
# main_df_sorted = main_df_sorted.sort_values(by = 'date', ascending = False)

In [272]:
# len(main_df_sorted)

In [273]:
# main_df_sorted = main_df_sorted.iloc[33000:, :]

In [274]:
# len(main_df_sorted)

In [275]:
# api_key = "3103a75392msh7bce7c32fde122cp134393jsn4d42ed6d08a8"

In [276]:
# player_ids = list(main_df_sorted['player_id'].unique())

In [277]:
# len(player_ids)

In [278]:
# col_heads = ['name', 'player_id', 'injury', 'inj_date', 'status', 'return_date', 'last_game']
# rows = []
# for id in player_ids:
    
#     url = f"https://tank01-fantasy-stats.p.rapidapi.com/getNBAPlayerInfo?playerID={player_id}"
        
#     headers = {
#         "x-rapidapi-key": "3103a75392msh7bce7c32fde122cp134393jsn4d42ed6d08a8",
#         "x-rapidapi-host": "tank01-fantasy-stats.p.rapidapi.com"
#     }
    
#     response = requests.get(url, headers=headers)
#     result = response.json()
    
#     if 'body' in result and result['body']:
#         if any(value != '' for value in result['body']['injury'].values()):
#             rows.append([result['body']['longName'],
#                         result['body']['playerID'],
#                         result['body']['injury']['description'],
#                         result['body']['injury']['injDate'],
#                         result['body']['injury']['designation'],
#                         result['body']['injury']['injReturnDate'],
#                         result['body']['lastGamePlayed']
#                         ])

# injury_df = pd.DataFrame(rows, columns = col_heads)

In [279]:
injury_df

NameError: name 'injury_df' is not defined