# Feature Engineering
We're going to get last-15-game means for each player and we're going to derive mins_share and mins_proj features.<br>
We also might have to integrate FanDuel and DraftKings player lists into this notebook, along with the name matching, so that it can produce a dataset that can be used to make predictions.<br>
**This might end up being the Name_Matching notebook.**

In [268]:
import pandas as pd
import numpy as np
from datetime import datetime
import os
import requests
import sqlite3
import re
import matplotlib.pyplot as plt

In [269]:
today = (datetime.now()).strftime('%Y%m%d')

In [270]:
conn = sqlite3.connect("nba_dfs_model.db")
main_df = pd.read_sql_query("SELECT * FROM game_stats", conn)
conn.close()
main_df.head()

Unnamed: 0,longName,game_id,player_id,team_id,team,teamAbv,fga,ast,tptfgm,fgm,...,ftm,blk,DefReb,plusMinus,stl,pts,PF,TOV,usage,mins
0,Kris Dunn,20230228_SA@UTA,28128078249,29,UTA,UTA,7,3,0,4,...,3,0,0,-24,2,11,2,1,14.24,26
1,Kelly Olynyk,20230228_SA@UTA,28188235349,29,UTA,UTA,4,1,0,0,...,8,0,9,11,1,8,1,4,17.58,30
2,Udoka Azubuike,20230228_SA@UTA,28818826399,29,UTA,UTA,2,0,0,1,...,0,1,1,-6,0,2,1,0,12.71,9
3,Jeremy Sochan,20230228_SA@UTA,942647035539,27,SA,SA,10,6,0,4,...,5,0,6,10,0,13,4,1,21.46,28
4,Keita Bates-Diop,20230228_SA@UTA,28698359129,27,SA,SA,8,2,0,3,...,0,1,5,2,1,6,3,1,10.95,34


In [271]:
main_df.tail()

Unnamed: 0,longName,game_id,player_id,team_id,team,teamAbv,fga,ast,tptfgm,fgm,...,ftm,blk,DefReb,plusMinus,stl,pts,PF,TOV,usage,mins
50927,Javonte Green,20250124_NO@MEM,28548765927,19,NO,NO,6,1,2,3,...,0,0,2,12,0,8,2,2,15.86,22
50928,Brandon Clarke,20250124_NO@MEM,28498397499,15,MEM,MEM,10,0,0,7,...,0,0,2,-9,2,14,5,1,21.39,21
50929,CJ McCollum,20250124_NO@MEM,28168235349,19,NO,NO,11,4,3,6,...,7,1,1,-10,0,22,3,2,22.88,34
50930,Jordan Hawkins,20250124_NO@MEM,943049265539,19,NO,NO,18,1,4,8,...,0,0,2,-6,2,20,0,1,25.89,32
50931,Karlo Matkovic,20250124_NO@MEM,948942345669,19,NO,NO,0,0,0,0,...,0,0,1,4,0,0,0,0,0.0,3


In [272]:
len(main_df)

50932

In [273]:
# Check for duplicates in the game_id and player_id combination
duplicates = main_df.duplicated(subset=["game_id", "player_id"])

# Filter the duplicates
duplicate_rows = main_df[duplicates]


In [274]:
duplicate_rows

Unnamed: 0,longName,game_id,player_id,team_id,team,teamAbv,fga,ast,tptfgm,fgm,...,ftm,blk,DefReb,plusMinus,stl,pts,PF,TOV,usage,mins


In [275]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50932 entries, 0 to 50931
Data columns (total 23 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   longName   50932 non-null  object 
 1   game_id    50932 non-null  object 
 2   player_id  50932 non-null  object 
 3   team_id    50932 non-null  object 
 4   team       50932 non-null  object 
 5   teamAbv    50932 non-null  object 
 6   fga        50932 non-null  object 
 7   ast        50932 non-null  object 
 8   tptfgm     50932 non-null  object 
 9   fgm        50932 non-null  object 
 10  fta        50932 non-null  object 
 11  tptfga     50932 non-null  object 
 12  OffReb     50932 non-null  object 
 13  ftm        50932 non-null  object 
 14  blk        50932 non-null  object 
 15  DefReb     50932 non-null  object 
 16  plusMinus  50932 non-null  object 
 17  stl        50932 non-null  object 
 18  pts        50932 non-null  object 
 19  PF         50932 non-null  object 
 20  TOV   

In [276]:
#Deriving a date column, and then filtering out anything before Jan. 1, 2024
main_df['date'] = pd.to_datetime(main_df['game_id'].str[:8])

In [277]:
main_df = main_df[main_df['date'] >= '2024-01-01']

In [278]:
main_df.columns

Index(['longName', 'game_id', 'player_id', 'team_id', 'team', 'teamAbv', 'fga',
       'ast', 'tptfgm', 'fgm', 'fta', 'tptfga', 'OffReb', 'ftm', 'blk',
       'DefReb', 'plusMinus', 'stl', 'pts', 'PF', 'TOV', 'usage', 'mins',
       'date'],
      dtype='object')

In [279]:
num_cols = ['fga', 'ast', 'tptfgm', 'fgm', 'fta', 'tptfga', 'OffReb', 'ftm', 'blk', 'DefReb', 'plusMinus', 'stl', 'pts', 'PF', 'TOV', 'usage', 'mins']

In [280]:
main_df[num_cols] = main_df[num_cols].apply(pd.to_numeric, errors = 'coerce')

In [281]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32262 entries, 18670 to 50931
Data columns (total 24 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   longName   32262 non-null  object        
 1   game_id    32262 non-null  object        
 2   player_id  32262 non-null  object        
 3   team_id    32262 non-null  object        
 4   team       32262 non-null  object        
 5   teamAbv    32262 non-null  object        
 6   fga        32262 non-null  int64         
 7   ast        32262 non-null  int64         
 8   tptfgm     32262 non-null  int64         
 9   fgm        32262 non-null  int64         
 10  fta        32262 non-null  int64         
 11  tptfga     32262 non-null  int64         
 12  OffReb     32262 non-null  int64         
 13  ftm        32262 non-null  int64         
 14  blk        32262 non-null  int64         
 15  DefReb     32262 non-null  int64         
 16  plusMinus  32262 non-null  int64         

In [282]:
main_df_sorted = main_df.sort_values(by = ['player_id', 'date', 'game_id']).reset_index(drop = True)

In [283]:
main_df_sorted.head()

Unnamed: 0,longName,game_id,player_id,team_id,team,teamAbv,fga,ast,tptfgm,fgm,...,blk,DefReb,plusMinus,stl,pts,PF,TOV,usage,mins,date
0,Brook Lopez,20240101_IND@MIL,28006619932,17,MIL,MIL,17,1,2,8,...,4,4,6,0,21,3,0,20.53,38,2024-01-01
1,Brook Lopez,20240103_MIL@IND,28006619932,17,MIL,MIL,11,1,2,6,...,0,2,-22,0,16,3,2,20.62,28,2024-01-03
2,Brook Lopez,20240104_MIL@SA,28006619932,17,MIL,MIL,10,1,1,4,...,3,4,8,0,9,4,4,17.81,33,2024-01-04
3,Brook Lopez,20240106_MIL@HOU,28006619932,17,MIL,MIL,9,0,1,3,...,1,5,11,1,7,2,0,13.21,31,2024-01-06
4,Brook Lopez,20240108_UTA@MIL,28006619932,17,MIL,MIL,11,3,3,5,...,2,5,-9,0,13,2,0,12.48,38,2024-01-08


In [284]:
main_df_sorted.tail()

Unnamed: 0,longName,game_id,player_id,team_id,team,teamAbv,fga,ast,tptfgm,fgm,...,blk,DefReb,plusMinus,stl,pts,PF,TOV,usage,mins,date
32257,Matthew Hurt,20240306_MEM@PHI,94994462027,15,MEM,MEM,2,0,0,0,...,0,0,-3,0,0,0,0,13.78,6,2024-03-06
32258,Matthew Hurt,20240308_ATL@MEM,94994462027,15,MEM,MEM,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,2024-03-08
32259,Matthew Hurt,20240318_MEM@SAC,94994462027,15,MEM,MEM,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,2024-03-18
32260,Matthew Hurt,20240320_MEM@GS,94994462027,15,MEM,MEM,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,2024-03-20
32261,Matthew Hurt,20240322_MEM@SA,94994462027,15,MEM,MEM,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,2024-03-22


In [285]:
len(main_df_sorted)

32262

In [286]:
#api_names = main_df_sorted[['longName', 'player_id', 'team', 'date']]
# Keep only rows that are not duplicated across the specified columns
# We need to do this because there are some data points that are duplicated,
# possibly the NBA Cup games
main_df_sorted = main_df_sorted[~main_df_sorted.duplicated(subset=['longName', 'player_id', 'team', 'date'], keep=False)]

In [287]:
len(main_df_sorted)

32210

In [288]:
api_key = "3103a75392msh7bce7c32fde122cp134393jsn4d42ed6d08a8"

In [289]:
site = input('Are you playing FanDuel or DraftKings?').lower()
if site == 'fanduel':
    positions = ['PG', 'SG', 'SF', 'PF', 'C']
else: 
    positions = ['PG', 'SG', 'SF', 'PF', 'C', 'G', 'F', 'UTIL']

Are you playing FanDuel or DraftKings? FanDuel


In [290]:
heads = ['pos', 'salary', 'longName', 'player_id', 'team_id', 'team']
players = []
url = f"https://tank01-fantasy-stats.p.rapidapi.com/getNBADFS?date={today}"

headers = {
    "x-rapidapi-key": "3103a75392msh7bce7c32fde122cp134393jsn4d42ed6d08a8",
    "x-rapidapi-host": "tank01-fantasy-stats.p.rapidapi.com"
}

try:
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise an error for HTTP codes 4xx/5xx
    result = response.json()

    if 'body' in result and result['body']:
        for player in result['body'][site]:
            players.append([player['allValidPositions'], player['salary'], player['longName'], player['playerID'], player['teamID'], player['team']])
    else:
        no_game_dates.append(current_date)  # Track dates with no games
except requests.exceptions.RequestException as e:
    print(f"Error fetching data for {current_date}: {e}")
except KeyError:
    print(f"Unexpected response format for {current_date}: {result}")

In [291]:
#need teamID, team, salary, playerID, longNAme, allValidPositions

In [292]:
today_df = pd.DataFrame(players)

In [293]:
today_df.head()

Unnamed: 0,0,1,2,3,4,5
0,[C],13000,Nikola Jokic,28908111729,8,DEN
1,"[PF, C]",12400,Victor Wembanyama,943740414489,27,SA
2,[PF],11900,Giannis Antetokounmpo,28118035349,17,MIL
3,"[C, PF]",11300,Anthony Davis,28368759882,14,LAL
4,[PG],10600,LaMelo Ball,94914298027,4,CHA


In [294]:
today_df.columns = heads

In [295]:
for position in positions:
    today_df[position] = 0

In [296]:
today_df.head()

Unnamed: 0,pos,salary,longName,player_id,team_id,team,PG,SG,SF,PF,C
0,[C],13000,Nikola Jokic,28908111729,8,DEN,0,0,0,0,0
1,"[PF, C]",12400,Victor Wembanyama,943740414489,27,SA,0,0,0,0,0
2,[PF],11900,Giannis Antetokounmpo,28118035349,17,MIL,0,0,0,0,0
3,"[C, PF]",11300,Anthony Davis,28368759882,14,LAL,0,0,0,0,0
4,[PG],10600,LaMelo Ball,94914298027,4,CHA,0,0,0,0,0


In [297]:
for position in positions:
    today_df[position] = today_df['pos'].apply(lambda x: True if position in x else False)

In [298]:
today_df.head()

Unnamed: 0,pos,salary,longName,player_id,team_id,team,PG,SG,SF,PF,C
0,[C],13000,Nikola Jokic,28908111729,8,DEN,False,False,False,False,True
1,"[PF, C]",12400,Victor Wembanyama,943740414489,27,SA,False,False,False,True,True
2,[PF],11900,Giannis Antetokounmpo,28118035349,17,MIL,False,False,False,True,False
3,"[C, PF]",11300,Anthony Davis,28368759882,14,LAL,False,False,False,True,True
4,[PG],10600,LaMelo Ball,94914298027,4,CHA,True,False,False,False,False


In [299]:
if site == 'draftkings':
    today_df['G'] =  np.where(today_df['PG'] + today_df['SG'] > 0, True, False)
    today_df['F'] =  np.where(today_df['SF'] + today_df['PF'] > 0, True, False)
    today_df['UTIL'] = True

In [300]:
today_df.head(10)

Unnamed: 0,pos,salary,longName,player_id,team_id,team,PG,SG,SF,PF,C
0,[C],13000,Nikola Jokic,28908111729,8,DEN,False,False,False,False,True
1,"[PF, C]",12400,Victor Wembanyama,943740414489,27,SA,False,False,False,True,True
2,[PF],11900,Giannis Antetokounmpo,28118035349,17,MIL,False,False,False,True,False
3,"[C, PF]",11300,Anthony Davis,28368759882,14,LAL,False,False,False,True,True
4,[PG],10600,LaMelo Ball,94914298027,4,CHA,True,False,False,False,False
5,"[C, PF]",10500,Domantas Sabonis,28118309129,26,SAC,False,False,False,True,True
6,[C],10300,Karl-Anthony Towns,28278119129,20,NY,False,False,False,False,True
7,[PG],10200,Cade Cunningham,94804285527,9,DET,True,False,False,False,False
8,"[PF, SF]",10100,Jayson Tatum,28628646399,2,BOS,False,False,True,True,False
9,"[PF, SF]",9900,Kevin Durant,28336662792,24,PHO,False,False,True,True,False


In [301]:
today_df = today_df.drop(columns = ['pos'])

In [302]:
today_df['date'] = pd.to_datetime(today)

In [303]:
today_df.head()

Unnamed: 0,salary,longName,player_id,team_id,team,PG,SG,SF,PF,C,date
0,13000,Nikola Jokic,28908111729,8,DEN,False,False,False,False,True,2025-01-25
1,12400,Victor Wembanyama,943740414489,27,SA,False,False,False,True,True,2025-01-25
2,11900,Giannis Antetokounmpo,28118035349,17,MIL,False,False,False,True,False,2025-01-25
3,11300,Anthony Davis,28368759882,14,LAL,False,False,False,True,True,2025-01-25
4,10600,LaMelo Ball,94914298027,4,CHA,True,False,False,False,False,2025-01-25


In [304]:
today_df['game_id'] = ''

In [305]:
game_ids = []
no_game_dates = []

        
url = f"https://tank01-fantasy-stats.p.rapidapi.com/getNBAGamesForDate?gameDate={today}"

headers = {
    "x-rapidapi-key": "3103a75392msh7bce7c32fde122cp134393jsn4d42ed6d08a8",
    "x-rapidapi-host": "tank01-fantasy-stats.p.rapidapi.com"
}

try:
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise an error for HTTP codes 4xx/5xx
    result = response.json()
    
    if 'body' in result and result['body']:
        for game in result['body']:
            game_ids.append(game['gameID'])
    else:
        no_game_dates.append(current_date)  # Track dates with no games
except requests.exceptions.RequestException as e:
    print(f"Error fetching data for {current_date}: {e}")
except KeyError:
    print(f"Unexpected response format for {current_date}: {result}")

In [306]:
game_ids

['20250125_TOR@ATL',
 '20250125_IND@SA',
 '20250125_HOU@CLE',
 '20250125_UTA@MEM',
 '20250125_NO@CHA',
 '20250125_MIA@BKN',
 '20250125_BOS@DAL',
 '20250125_DEN@MIN',
 '20250125_SAC@NY',
 '20250125_WAS@PHO',
 '20250125_PHI@CHI',
 '20250125_DET@ORL',
 '20250125_LAL@GS',
 '20250125_MIL@LAC']

In [307]:
# We need a check to make sure the game_ids are right
# One night we had WAS playing two games

In [308]:
from collections import Counter

teams = []
for id in game_ids:
    matchup = id[9:]
    both_teams = matchup.split('@')
    teams.append(both_teams[0])
    teams.append(both_teams[1])

if len(teams) != len(set(teams)):
    counter = Counter(teams)
    duplicates = [item for item, count in counter.items() if count > 1]
    print(f"These teams are in multiple game_ids: {duplicates}")
    

In [309]:
def fill_game_id(row, game_ids):
    for game_id in game_ids:
        if row['team'] in game_id:
            return game_id  # Return the matching game_id
    return row['game_id']  # If no match, return the original value

In [310]:
today_df['game_id'] = today_df.apply(
    lambda row: fill_game_id(row, game_ids), axis=1
)

In [311]:
today_df.head()

Unnamed: 0,salary,longName,player_id,team_id,team,PG,SG,SF,PF,C,date,game_id
0,13000,Nikola Jokic,28908111729,8,DEN,False,False,False,False,True,2025-01-25,20250125_DEN@MIN
1,12400,Victor Wembanyama,943740414489,27,SA,False,False,False,True,True,2025-01-25,20250125_IND@SA
2,11900,Giannis Antetokounmpo,28118035349,17,MIL,False,False,False,True,False,2025-01-25,20250125_MIL@LAC
3,11300,Anthony Davis,28368759882,14,LAL,False,False,False,True,True,2025-01-25,20250125_LAL@GS
4,10600,LaMelo Ball,94914298027,4,CHA,True,False,False,False,False,2025-01-25,20250125_NO@CHA


In [312]:
today_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 483 entries, 0 to 482
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   salary     483 non-null    object        
 1   longName   483 non-null    object        
 2   player_id  483 non-null    object        
 3   team_id    483 non-null    object        
 4   team       483 non-null    object        
 5   PG         483 non-null    bool          
 6   SG         483 non-null    bool          
 7   SF         483 non-null    bool          
 8   PF         483 non-null    bool          
 9   C          483 non-null    bool          
 10  date       483 non-null    datetime64[ns]
 11  game_id    483 non-null    object        
dtypes: bool(5), datetime64[ns](1), object(6)
memory usage: 28.9+ KB


In [313]:
rows_with_missing_values = main_df_sorted[main_df_sorted[num_cols].isnull().any(axis=1)]

In [314]:
rows_with_missing_values

Unnamed: 0,longName,game_id,player_id,team_id,team,teamAbv,fga,ast,tptfgm,fgm,...,blk,DefReb,plusMinus,stl,pts,PF,TOV,usage,mins,date


In [315]:
rows_with_missing_values['longName'].value_counts()

Series([], Name: count, dtype: int64)

In [316]:
main_df_sorted = pd.concat([main_df_sorted, today_df], ignore_index=True)

In [317]:
main_df_sorted = main_df_sorted.sort_values(['player_id', 'date', 'game_id']).reset_index(drop=True)

In [318]:
rows_with_missing_values = main_df_sorted[main_df_sorted[num_cols].isnull().any(axis=1)]

In [319]:
len(rows_with_missing_values)

483

In [320]:
rows_with_missing_values.head()

Unnamed: 0,longName,game_id,player_id,team_id,team,teamAbv,fga,ast,tptfgm,fgm,...,PF,TOV,usage,mins,date,salary,PG,SG,SF,C
95,Brook Lopez,20250125_MIL@LAC,28006619932,17,MIL,,,,,,...,0,,,,2025-01-25,5900,False,False,False,True
171,Dejounte Murray,20250125_NO@CHA,28008317499,19,NO,,,,,,...,0,,,,2025-01-25,8700,True,False,False,False
231,Shake Milton,20250125_LAL@GS,28008397499,14,LAL,,,,,,...,0,,,,2025-01-25,3500,True,False,False,False
319,Steven Adams,20250125_HOU@CLE,28018735349,11,HOU,,,,,,...,0,,,,2025-01-25,4000,False,False,False,True
378,Garrett Temple,20250125_TOR@ATL,28026396452,28,TOR,,,,,,...,0,,,,2025-01-25,3500,False,False,True,False


In [321]:
# Calculate the rolling mean for the last 15 games
for col in num_cols:
    main_df_sorted[col] = (
        main_df_sorted.groupby('player_id')[col]
        .apply(lambda x: x.shift(1).rolling(window=15, min_periods=1).mean()).reset_index(drop = True)
    )

In [322]:
main_df_sorted = main_df_sorted.sort_values(by = ['player_id', 'date', 'game_id']).reset_index(drop = True)

In [323]:
main_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32693 entries, 0 to 32692
Data columns (total 29 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   longName   32693 non-null  object        
 1   game_id    32693 non-null  object        
 2   player_id  32693 non-null  object        
 3   team_id    32693 non-null  object        
 4   team       32693 non-null  object        
 5   teamAbv    32210 non-null  object        
 6   fga        32023 non-null  float64       
 7   ast        32023 non-null  float64       
 8   tptfgm     32023 non-null  float64       
 9   fgm        32023 non-null  float64       
 10  fta        32023 non-null  float64       
 11  tptfga     32023 non-null  float64       
 12  OffReb     32023 non-null  float64       
 13  ftm        32023 non-null  float64       
 14  blk        32023 non-null  float64       
 15  DefReb     32023 non-null  float64       
 16  plusMinus  32023 non-null  float64      

In [324]:
# import missingno as msno 

In [325]:
rows_with_missing_values = main_df_sorted[main_df_sorted[num_cols].isnull().any(axis=1)]

In [326]:
len(rows_with_missing_values)

670

In [327]:
value_counts = rows_with_missing_values['longName'].value_counts()

In [328]:
if not (value_counts == 1).all():
    print("Some values in 'longName' appear more than once.")
    # Optionally print the offending values
    print("Offending values:\n", value_counts[value_counts > 1])

In [329]:
# Identify the count of each player_id
player_counts = main_df_sorted['player_id'].value_counts()

# Separate rows based on player_id counts
single_occurrence = main_df_sorted['player_id'].isin(player_counts[player_counts == 1].index)

# Fill missing values for rows where player_id occurs only once
numeric_min = main_df_sorted.select_dtypes(include='number').min()
main_df_sorted.loc[single_occurrence] = main_df_sorted.loc[single_occurrence].fillna(numeric_min)

# Backfill missing values for all other rows
main_df_sorted = main_df_sorted.fillna(method='bfill')


  main_df_sorted = main_df_sorted.fillna(method='bfill')


In [330]:
main_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32693 entries, 0 to 32692
Data columns (total 29 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   longName   32693 non-null  object        
 1   game_id    32693 non-null  object        
 2   player_id  32693 non-null  object        
 3   team_id    32693 non-null  object        
 4   team       32693 non-null  object        
 5   teamAbv    32693 non-null  object        
 6   fga        32693 non-null  float64       
 7   ast        32693 non-null  float64       
 8   tptfgm     32693 non-null  float64       
 9   fgm        32693 non-null  float64       
 10  fta        32693 non-null  float64       
 11  tptfga     32693 non-null  float64       
 12  OffReb     32693 non-null  float64       
 13  ftm        32693 non-null  float64       
 14  blk        32693 non-null  float64       
 15  DefReb     32693 non-null  float64       
 16  plusMinus  32693 non-null  float64      

In [331]:
# main_df_sorted = main_df_sorted[main_df_sorted['date'] == today]

In [332]:
main_df_sorted.columns

Index(['longName', 'game_id', 'player_id', 'team_id', 'team', 'teamAbv', 'fga',
       'ast', 'tptfgm', 'fgm', 'fta', 'tptfga', 'OffReb', 'ftm', 'blk',
       'DefReb', 'plusMinus', 'stl', 'pts', 'PF', 'TOV', 'usage', 'mins',
       'date', 'salary', 'PG', 'SG', 'SF', 'C'],
      dtype='object')

In [333]:
main_df_sorted = main_df_sorted.sort_values(by = ['player_id', 'date', 'game_id']).reset_index(drop = True)

# Total minutes
Since a team's minutes will total 240 in a game that doesn't go into overtime, we want to see how close the minutes variables come to 240 when grouped by team and game.

In [334]:
# Group by 'team' and 'game_id', and sum the 'mins' variable for each group
team_game_mins = main_df_sorted.groupby(['team', 'game_id'])['mins'].sum().reset_index()

In [335]:
team_game_mins['mins'].describe()

count    3020.000000
mean      240.096208
std        30.014299
min        87.283333
25%       223.000000
50%       240.280952
75%       257.660417
max       406.355556
Name: mins, dtype: float64

In [336]:
team_game_mins = team_game_mins.rename(columns = {'mins': 'total_mins'})

In [337]:
team_game_mins.head()

Unnamed: 0,team,game_id,total_mins
0,ATL,20240103_OKC@ATL,240.0
1,ATL,20240105_ATL@IND,261.0
2,ATL,20240107_ATL@ORL,223.5
3,ATL,20240110_PHI@ATL,216.0
4,ATL,20240112_IND@ATL,271.833333


We'll merge this with main_df_sorted and then derive mins_share and mins_proj variables.

In [338]:
main_df_sorted = pd.merge(main_df_sorted, team_game_mins, on = ['team', 'game_id'], how = 'left')

In [339]:
main_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32693 entries, 0 to 32692
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   longName    32693 non-null  object        
 1   game_id     32693 non-null  object        
 2   player_id   32693 non-null  object        
 3   team_id     32693 non-null  object        
 4   team        32693 non-null  object        
 5   teamAbv     32693 non-null  object        
 6   fga         32693 non-null  float64       
 7   ast         32693 non-null  float64       
 8   tptfgm      32693 non-null  float64       
 9   fgm         32693 non-null  float64       
 10  fta         32693 non-null  float64       
 11  tptfga      32693 non-null  float64       
 12  OffReb      32693 non-null  float64       
 13  ftm         32693 non-null  float64       
 14  blk         32693 non-null  float64       
 15  DefReb      32693 non-null  float64       
 16  plusMinus   32693 non-

# mins_share variable
We have a few players projected to play well over 48 minutes. That's because the mins_share variable adds up to 240 for each team in each game. In regulation, each team has five players on the floor for 48 minutes. If we don't have enough players on a team where their L15 average minutes add up to 240, we end up with some players with a min_proj variable over 48 to make up for it.

In [340]:
main_df_sorted['mins_share'] = main_df_sorted['mins']/main_df_sorted['total_mins']

In [341]:
main_df_sorted['mins_share'].describe()

count    32693.000000
mean         0.092375
std          0.041918
min          0.000000
25%          0.060705
50%          0.093981
75%          0.125628
max          0.288152
Name: mins_share, dtype: float64

In [342]:
main_df_sorted['mins_proj'] = main_df_sorted['mins_share'] * 240

In [343]:
team_game_mins_proj = main_df_sorted.groupby(['team', 'game_id'])['mins_proj'].sum().reset_index()

In [344]:
main_df_sorted['mins_proj'].describe()

count    32693.000000
mean        22.169883
std         10.060436
min          0.000000
25%         14.569144
50%         22.555450
75%         30.150754
max         69.156594
Name: mins_proj, dtype: float64

In [345]:
main_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32693 entries, 0 to 32692
Data columns (total 32 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   longName    32693 non-null  object        
 1   game_id     32693 non-null  object        
 2   player_id   32693 non-null  object        
 3   team_id     32693 non-null  object        
 4   team        32693 non-null  object        
 5   teamAbv     32693 non-null  object        
 6   fga         32693 non-null  float64       
 7   ast         32693 non-null  float64       
 8   tptfgm      32693 non-null  float64       
 9   fgm         32693 non-null  float64       
 10  fta         32693 non-null  float64       
 11  tptfga      32693 non-null  float64       
 12  OffReb      32693 non-null  float64       
 13  ftm         32693 non-null  float64       
 14  blk         32693 non-null  float64       
 15  DefReb      32693 non-null  float64       
 16  plusMinus   32693 non-

In [346]:
main_df_sorted.columns

Index(['longName', 'game_id', 'player_id', 'team_id', 'team', 'teamAbv', 'fga',
       'ast', 'tptfgm', 'fgm', 'fta', 'tptfga', 'OffReb', 'ftm', 'blk',
       'DefReb', 'plusMinus', 'stl', 'pts', 'PF', 'TOV', 'usage', 'mins',
       'date', 'salary', 'PG', 'SG', 'SF', 'C', 'total_mins', 'mins_share',
       'mins_proj'],
      dtype='object')

In [347]:
#This is the order of features in the model
model_order = ['longName', 'game_id', 'player_id', 'team_id', 'team', 'teamAbv',\
               'fga', 'ast', 'tptfgm', 'fgm', 'fta', 'tptfga', 'OffReb', 'ftm', 'blk',\
               'DefReb', 'plusMinus', 'stl', 'pts', 'PF', 'TOV', 'usage', 'mins_share', 'mins', 'mins_proj', 'salary', 'date'] + positions

In [348]:
model_order

['longName',
 'game_id',
 'player_id',
 'team_id',
 'team',
 'teamAbv',
 'fga',
 'ast',
 'tptfgm',
 'fgm',
 'fta',
 'tptfga',
 'OffReb',
 'ftm',
 'blk',
 'DefReb',
 'plusMinus',
 'stl',
 'pts',
 'PF',
 'TOV',
 'usage',
 'mins_share',
 'mins',
 'mins_proj',
 'salary',
 'date',
 'PG',
 'SG',
 'SF',
 'PF',
 'C']

In [349]:
main_df_sorted = main_df_sorted[model_order]

In [350]:
main_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32693 entries, 0 to 32692
Data columns (total 32 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   longName    32693 non-null  object        
 1   game_id     32693 non-null  object        
 2   player_id   32693 non-null  object        
 3   team_id     32693 non-null  object        
 4   team        32693 non-null  object        
 5   teamAbv     32693 non-null  object        
 6   fga         32693 non-null  float64       
 7   ast         32693 non-null  float64       
 8   tptfgm      32693 non-null  float64       
 9   fgm         32693 non-null  float64       
 10  fta         32693 non-null  float64       
 11  tptfga      32693 non-null  float64       
 12  OffReb      32693 non-null  float64       
 13  ftm         32693 non-null  float64       
 14  blk         32693 non-null  float64       
 15  DefReb      32693 non-null  float64       
 16  plusMinus   32693 non-

In [351]:
main_df_sorted = main_df_sorted[main_df_sorted['date'] == today]

In [352]:
main_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Index: 483 entries, 95 to 32678
Data columns (total 32 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   longName    483 non-null    object        
 1   game_id     483 non-null    object        
 2   player_id   483 non-null    object        
 3   team_id     483 non-null    object        
 4   team        483 non-null    object        
 5   teamAbv     483 non-null    object        
 6   fga         483 non-null    float64       
 7   ast         483 non-null    float64       
 8   tptfgm      483 non-null    float64       
 9   fgm         483 non-null    float64       
 10  fta         483 non-null    float64       
 11  tptfga      483 non-null    float64       
 12  OffReb      483 non-null    float64       
 13  ftm         483 non-null    float64       
 14  blk         483 non-null    float64       
 15  DefReb      483 non-null    float64       
 16  plusMinus   483 non-null    

In [353]:
main_df_sorted.to_csv('ready_for_pred.csv', index = False)

In [277]:
# #import sqlite3

# # Assuming your updated DataFrame is called `df`

# # Connect to the SQLite database
# conn = sqlite3.connect("nba_dfs_model.db")

# # Step 1: Drop the existing table if it exists
# conn.execute("DROP TABLE IF EXISTS dataset_with_L15")

# # Step 2: Create the table with all columns from the new DataFrame
# conn.execute("""
# CREATE TABLE dataset_with_L15 (
#     id INTEGER PRIMARY KEY,
#     Name TEXT,
#     game_id TEXT,
#     player_id TEXT,
#     reb INTEGER,
#     team_id TEXT,
#     team TEXT,
#     fd_pts REAL,
#     dk_pts REAL,
#     away_team TEXT,
#     home_team TEXT,
#     away_spread REAL,
#     home_spread REAL,
#     over_under REAL,
#     home INTEGER,
#     opponent TEXT,
#     PG_FD REAL,
#     SG_FD REAL,
#     SF_FD REAL,
#     PF_FD REAL,
#     C_FD REAL,
#     PG_DK REAL,
#     SG_DK REAL,
#     SF_DK REAL,
#     PF_DK REAL,
#     C_DK REAL,
#     date TEXT,
#     PG_FD_DvP REAL,
#     SG_FD_DvP REAL,
#     SF_FD_DvP REAL,
#     PF_FD_DvP REAL,
#     C_FD_DvP REAL,
#     PG_DK_DvP REAL,
#     SG_DK_DvP REAL,
#     SF_DK_DvP REAL,
#     PF_DK_DvP REAL,
#     C_DK_DvP REAL,
#     days_rest REAL,
#     fga REAL,
#     ast REAL,
#     tptfgm REAL,
#     fgm REAL,
#     mins REAL,
#     fta REAL,
#     tptfga REAL,
#     OffReb REAL,
#     ftm REAL,
#     blk REAL,
#     DefReb REAL,
#     plusMinus REAL,
#     stl REAL,
#     pts REAL,
#     PF REAL,
#     TOV REAL,
#     usage REAL,
#     mins_48 REAL,
#     pace REAL,
#     opp_pace REAL,
#     pace_diff REAL
# )
# """)

# # Step 3: Insert the updated data into the new table
# main_df_sorted.to_sql("dataset_with_L15", conn, if_exists="append", index=False)

# # Close the connection
# conn.close()

# print("Table successfully replaced with the updated DataFrame!")


Table successfully replaced with the updated DataFrame!


In [278]:
# # Reconnect to the database
# conn = sqlite3.connect('nba_dfs_model.db')

# # Check the table schema
# cursor = conn.cursor()
# cursor.execute("PRAGMA table_info(dataset_with_L15);")
# schema = cursor.fetchall()

# # Print the schema
# for column in schema:
#     print(column)

# # Close the connection
# conn.close()


(0, 'id', 'INTEGER', 0, None, 1)
(1, 'Name', 'TEXT', 0, None, 0)
(2, 'game_id', 'TEXT', 0, None, 0)
(3, 'player_id', 'TEXT', 0, None, 0)
(4, 'reb', 'INTEGER', 0, None, 0)
(5, 'team_id', 'TEXT', 0, None, 0)
(6, 'team', 'TEXT', 0, None, 0)
(7, 'fd_pts', 'REAL', 0, None, 0)
(8, 'dk_pts', 'REAL', 0, None, 0)
(9, 'away_team', 'TEXT', 0, None, 0)
(10, 'home_team', 'TEXT', 0, None, 0)
(11, 'away_spread', 'REAL', 0, None, 0)
(12, 'home_spread', 'REAL', 0, None, 0)
(13, 'over_under', 'REAL', 0, None, 0)
(14, 'home', 'INTEGER', 0, None, 0)
(15, 'opponent', 'TEXT', 0, None, 0)
(16, 'PG_FD', 'REAL', 0, None, 0)
(17, 'SG_FD', 'REAL', 0, None, 0)
(18, 'SF_FD', 'REAL', 0, None, 0)
(19, 'PF_FD', 'REAL', 0, None, 0)
(20, 'C_FD', 'REAL', 0, None, 0)
(21, 'PG_DK', 'REAL', 0, None, 0)
(22, 'SG_DK', 'REAL', 0, None, 0)
(23, 'SF_DK', 'REAL', 0, None, 0)
(24, 'PF_DK', 'REAL', 0, None, 0)
(25, 'C_DK', 'REAL', 0, None, 0)
(26, 'date', 'TEXT', 0, None, 0)
(27, 'PG_FD_DvP', 'REAL', 0, None, 0)
(28, 'SG_FD_DvP', 

In [279]:
# # Connect to your SQLite database
# conn = sqlite3.connect('nba_dfs_model.db')

# # Query to get the list of all table names
# cursor = conn.cursor()
# cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
# tables = cursor.fetchall()

# # Close the connection
# conn.close()

# # Print the list of table names
# table_names = [table[0] for table in tables]
# print("Tables in the database:", table_names)

Tables in the database: ['api_players', 'fd_players', 'dk_players', 'game_schedule', 'game_stats', 'sqlite_sequence', 'pace', 'betting_lines', 'merged_data', 'dataset_pre_EDA', 'dataset_with_L15']
