In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression

# Generate synthetic data
X, y = make_regression(n_samples=100, n_features=20, noise=0.1, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('feature_selection', SelectKBest(score_func=f_regression)),  # Select top K features
    ('model', LinearRegression())  # Linear regression model
])

# Define the grid search parameters (testing different numbers of features)
param_grid = {
    'feature_selection__k': [5, 10, 15, 20]  # Test different numbers of selected features
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2')  # Using R² score
grid_search.fit(X, y)

# Print best parameters
print("Best number of features:", grid_search.best_params_)
print("Best R² score:", grid_search.best_score_)

In [69]:

from data_functions import his_usage_team

player_names = {'Alex Caruso':'OKC', 'Isaiah Hartenstein':'OKC', 'Shai Gilgeous-Alexander':'OKC'}
date_list = ["2022-23","2023-24","2024-25"]
usage_path = "D:/nba_usage_csv_historic/usage_csv_{date}/{date}_content.csv"
schedule_base_path = "D:/nba_scheduled_csv/schedule_csv_2025/{schedule_team}_schedule_content.csv"
player_base_path = "D:/nba_player_csv_historic/season_{date}/all_quarters/{player}_content.csv"
defense_base_path = "D:/nba_defense_history_csv/defense_csv_{date}/all_quarter_defense_content.csv"


player_df, current_defense_stat = his_usage_team(player_names, date_list, usage_path, player_base_path, defense_base_path)



for player, df in player_df.items():
    print(player)



    from sklearn.model_selection import GridSearchCV
    from sklearn.feature_selection import SelectKBest, f_regression
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LinearRegression
    from sklearn.datasets import make_regression
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    import pandas as pd


    df_X = df.drop(columns=['OREB','PTS','Date','Matchup','Team','Home/Away_game','W/L', 'Away', 'season', 'TEAM', 'season_defense'])

    # print(len(df_X.columns))

    # X = df.drop('target_column', axis=1)  # Replace with your actual feature
    


    # X = df_X
    # scaler = StandardScaler()
    # X = scaler.fit_transform(X)

    scaler = StandardScaler()
    X = scaler.fit_transform(df_X)
    # X = pd.DataFrame(X_scaled, columns=df_X.columns)  # Convert back to DataFrame

    y = df['OREB']  # Replace with your actual target


    # Define the pipeline
    pipeline = Pipeline([
        ('feature_selection', SelectKBest(score_func=f_regression)),  # Select top K features
        ('model', LinearRegression())  # Linear regression model
    ])

    # Define the grid search parameters (testing different numbers of features)
    param_grid = {
        'feature_selection__k': [5, 10, 15,20, 25, 30, 40]  # Test different numbers of selected features
    }

    # Perform grid search
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2')  # Using R² score
    grid_search.fit(X, y)

    best_k = grid_search.best_params_['feature_selection__k']

    # Fit the SelectKBest with the best k
    X = pd.DataFrame(X, columns=df_X.columns)  # Convert back to DataFrame
    selector = SelectKBest(score_func=f_regression, k=best_k)
    selector.fit(X, y)

    # Get the names of the selected columns (features)
    selected_columns = X.columns[selector.get_support()]
    print("Selected features:", selected_columns)

    # Print best parameters
    print("Best number of features:", grid_search.best_params_)
    print("Best R² score:", grid_search.best_score_)

D:/nba_player_csv_historic/season_2022-23/all_quarters/Alex Caruso_content.csv
D:/nba_defense_history_csv/defense_csv_2022-23/all_quarter_defense_content.csv
D:/nba_player_csv_historic/season_2023-24/all_quarters/Alex Caruso_content.csv
D:/nba_defense_history_csv/defense_csv_2023-24/all_quarter_defense_content.csv
D:/nba_player_csv_historic/season_2024-25/all_quarters/Alex Caruso_content.csv
D:/nba_defense_history_csv/defense_csv_2024-25/all_quarter_defense_content.csv
D:/nba_player_csv_historic/season_2022-23/all_quarters/Isaiah Hartenstein_content.csv
D:/nba_defense_history_csv/defense_csv_2022-23/all_quarter_defense_content.csv
D:/nba_player_csv_historic/season_2023-24/all_quarters/Isaiah Hartenstein_content.csv
D:/nba_defense_history_csv/defense_csv_2023-24/all_quarter_defense_content.csv
D:/nba_player_csv_historic/season_2024-25/all_quarters/Isaiah Hartenstein_content.csv
D:/nba_defense_history_csv/defense_csv_2024-25/all_quarter_defense_content.csv
D:/nba_player_csv_historic/seas

In [75]:
from data_functions import his_usage_team

player_names = {'Chris Paul': 'SAS',"De'Aaron Fox": 'SAS', "Devin Vassell": 'SAS',"Harrison Barnes": 'SAS'}
date_list = ["2022-23","2023-24","2024-25"]
usage_path = "D:/nba_usage_csv_historic/usage_csv_{date}/{date}_content.csv"
schedule_base_path = "D:/nba_scheduled_csv/schedule_csv_2025/{schedule_team}_schedule_content.csv"
player_base_path = "D:/nba_player_csv_historic/season_{date}/all_quarters/{player}_content.csv"
defense_base_path = "D:/nba_defense_history_csv/defense_csv_{date}/all_quarter_defense_content.csv"


player_df, current_defense_stat = his_usage_team(player_names, date_list, usage_path, player_base_path, defense_base_path)



for player, df in player_df.items():
    print(player)


    from sklearn.linear_model import Lasso
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split
    import numpy as np

    df_X = df.drop(columns=['FGA','PTS','Date','Matchup','Team','Home/Away_game','W/L', 'Away', 'season', 'TEAM', 'season_defense'])



    scaler = StandardScaler()
    X = scaler.fit_transform(df_X)

    y = df['FGA']  # Replace with your actual target


    # Define the grid search parameters for Lasso (L1 regularization)
    param_grid = {
        'alpha': [0.001, 0.01, 0.1, 1, 10]  # Different levels of regularization strength
    }

    # Perform grid search
    grid_search = GridSearchCV(Lasso(), param_grid, cv=5, scoring='r2')
    grid_search.fit(X, y)

    # Print best parameters
    print("Best alpha:", grid_search.best_params_)

    X = pd.DataFrame(X, columns=df_X.columns)  # Convert back to DataFrame


    # Get the best alpha
    best_alpha = grid_search.best_params_['alpha']

    # Fit Lasso with the best alpha
    best_lasso = Lasso(alpha=best_alpha)
    best_lasso.fit(X, y)


    # Get selected (non-zero) feature indices
    selected_features = X.columns[best_lasso.coef_ != 0]
    print("Selected features:", selected_features)

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # display(y_test)

    timestamp = int(pd.Timestamp('2024-12-31').timestamp())
    train_data = df[df['Date_in_Seconds'] < timestamp]  # Replace '2023-01-01' with the corresponding timestamp
    test_data =  df[df['Date_in_Seconds'] >= timestamp]  # Replace '2023-01-01' with the corresponding timestamp

    X_train = train_data[selected_features]
    y_train = train_data['FGM']
    X_test = test_data[selected_features]
    y_test = test_data['FGM']



    # Reduce X to selected features only
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]


    # Retrain Lasso only on selected features
    final_lasso = Lasso(alpha=best_alpha)
    final_lasso.fit(X_train_selected, y_train)

    # Predict on the test set using the reduced feature set
    y_pred = final_lasso.predict(X_test_selected)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print("Final RMSE using selected features:", rmse)
    display(df.head(3))


D:/nba_player_csv_historic/season_2022-23/all_quarters/Chris Paul_content.csv
D:/nba_defense_history_csv/defense_csv_2022-23/all_quarter_defense_content.csv
D:/nba_player_csv_historic/season_2023-24/all_quarters/Chris Paul_content.csv
D:/nba_defense_history_csv/defense_csv_2023-24/all_quarter_defense_content.csv
D:/nba_player_csv_historic/season_2024-25/all_quarters/Chris Paul_content.csv
D:/nba_defense_history_csv/defense_csv_2024-25/all_quarter_defense_content.csv
D:/nba_player_csv_historic/season_2022-23/all_quarters/De'Aaron Fox_content.csv
D:/nba_defense_history_csv/defense_csv_2022-23/all_quarter_defense_content.csv
D:/nba_player_csv_historic/season_2023-24/all_quarters/De'Aaron Fox_content.csv
D:/nba_defense_history_csv/defense_csv_2023-24/all_quarter_defense_content.csv
D:/nba_player_csv_historic/season_2024-25/all_quarters/De'Aaron Fox_content.csv
D:/nba_defense_history_csv/defense_csv_2024-25/all_quarter_defense_content.csv
D:/nba_player_csv_historic/season_2022-23/all_quarte

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,Date,Matchup,Team,Away,Home/Away_game,W/L,MIN_x,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-,season,RANK,TEAM,GP,W,L,MIN_y,OffRtg,DefRtg,NetRtg,AST%,AST/TO,ASTRatio,OREB%,DREB%,REB%,TOV%,eFG%,TS%,PACE,PIE,POSS,season_defense,USG,team_pace,team_offrtg,team_poss,Date_in_Seconds,home_away
0,2022-11-07,PHX @ PHI,PHX,PHI,Away,L,13.45,2,1,1,100.0,0,0,0.0,0,0,0.0,0,2,2,2,2,0,2,1,-10,2022-23,3,PHI,82,54,28,3976.0,117.0,112.7,4.4,61.6,1.84,18.6,25.8,72.1,49.6,13.9,56.3,60.8,97.44,52.4,8072,2022-23,19.5,102.07,109.7,8451,1667779200,1
1,2022-12-07,PHX vs. BOS,PHX,BOS,Home,L,24.03,4,2,6,33.3,0,2,0.0,0,0,0.0,0,4,4,4,1,0,4,3,-28,2022-23,2,BOS,82,57,25,3996.0,117.3,110.6,6.7,63.2,2.0,19.1,25.5,74.6,50.6,13.3,56.6,60.0,99.15,53.4,8247,2022-23,19.5,102.07,109.7,8451,1670371200,0
2,2022-12-09,PHX @ NOP,PHX,NOP,Away,L,34.05,24,7,14,50.0,6,9,66.7,4,4,100.0,0,8,8,7,0,0,3,3,3,2022-23,15,NOP,82,42,40,3971.0,113.8,112.0,1.9,61.6,1.77,18.5,29.0,73.2,51.2,14.5,54.3,58.2,99.58,51.4,8239,2022-23,19.5,102.07,109.7,8451,1670544000,1


De'Aaron Fox


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best alpha: {'alpha': 0.1}
Selected features: Index(['MIN_x', 'FGM', 'FG%', '3PA', 'FTA', 'OREB', 'REB', 'STL', 'TOV', 'DefRtg', 'DREB%', 'eFG%', 'PACE', 'Date_in_Seconds'], dtype='object')
Final RMSE using selected features: 0.034155059990451105


  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,Date,Matchup,Team,Away,Home/Away_game,W/L,MIN_x,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-,season,RANK,TEAM,GP,W,L,MIN_y,OffRtg,DefRtg,NetRtg,AST%,AST/TO,ASTRatio,OREB%,DREB%,REB%,TOV%,eFG%,TS%,PACE,PIE,POSS,season_defense,USG,team_pace,team_offrtg,team_poss,Date_in_Seconds,home_away
0,2022-12-14,SAC @ TOR,SAC,TOR,Away,W,41.26,27,9,23,39.1,4,8,50.0,5,7,71.4,0,6,6,10,1,0,3,4,6,2022-23,17,TOR,82,41,41,3961.0,114.6,113.1,1.5,57.1,2.05,17.2,31.6,72.1,50.2,11.8,51.7,55.5,97.85,49.6,8076,2022-23,28.8,102.07,109.7,8451,1670976000,1
1,2022-12-16,SAC @ DET,SAC,DET,Away,W,39.21,24,9,19,47.4,0,5,0.0,6,9,66.7,0,5,5,9,1,0,3,4,19,2022-23,30,DET,82,17,65,3961.0,109.9,117.8,-7.9,58.1,1.52,16.7,29.2,70.2,49.1,15.0,52.0,56.1,99.88,44.9,8231,2022-23,28.8,102.07,109.7,8451,1671148800,1
2,2022-12-19,SAC vs. CHA,SAC,CHA,Home,L,33.43,37,14,25,56.0,2,5,40.0,7,9,77.8,1,4,5,2,0,0,2,5,-20,2022-23,27,CHA,82,27,55,3966.0,108.4,114.7,-6.3,60.9,1.77,17.8,27.6,71.7,49.1,13.9,51.6,55.1,101.47,47.3,8390,2022-23,28.8,102.07,109.7,8451,1671408000,0


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Devin Vassell
Best alpha: {'alpha': 0.01}
Selected features: Index(['MIN_x', 'FGM', 'FG%', '3PM', '3PA', '3P%', 'FTA', 'FT%', 'OREB', 'REB', 'STL', 'BLK', 'TOV', '+/-', 'W', 'MIN_y', 'DefRtg', 'NetRtg', 'ASTRatio', 'OREB%', 'DREB%', 'eFG%', 'PACE', 'POSS', 'Date_in_Seconds', 'home_away'], dtype='object')
Final RMSE using selected features: 0.007222206874160418


Unnamed: 0,Date,Matchup,Team,Away,Home/Away_game,W/L,MIN_x,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-,season,RANK,TEAM,GP,W,L,MIN_y,OffRtg,DefRtg,NetRtg,AST%,AST/TO,ASTRatio,OREB%,DREB%,REB%,TOV%,eFG%,TS%,PACE,PIE,POSS,season_defense,USG,team_pace,team_offrtg,team_poss,Date_in_Seconds,home_away
0,2022-10-19,SAS vs. CHA,SAS,CHA,Home,L,28.35,11,3,15,20.0,0,8,0.0,5,8,62.5,1,1,2,4,1,0,2,1,-22,2022-23,27,CHA,82,27,55,3966.0,108.4,114.7,-6.3,60.9,1.77,17.8,27.6,71.7,49.1,13.9,51.6,55.1,101.47,47.3,8390,2022-23,24.0,102.07,109.7,8451,1666137600,0
1,2022-10-21,SAS @ IND,SAS,IND,Away,W,33.45,23,7,15,46.7,4,7,57.1,5,6,83.3,0,5,5,4,2,0,0,1,-9,2022-23,23,IND,82,35,47,3951.0,113.8,117.1,-3.3,64.3,1.81,18.9,28.5,68.5,48.2,14.6,54.5,58.1,101.68,48.2,8377,2022-23,24.0,102.07,109.7,8451,1666310400,1
2,2022-10-22,SAS @ PHI,SAS,PHI,Away,W,34.13,22,9,14,64.3,4,8,50.0,0,0,0.0,0,4,4,3,1,0,1,1,0,2022-23,3,PHI,82,54,28,3976.0,117.0,112.7,4.4,61.6,1.84,18.6,25.8,72.1,49.6,13.9,56.3,60.8,97.44,52.4,8072,2022-23,24.0,102.07,109.7,8451,1666396800,1


Harrison Barnes


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best alpha: {'alpha': 0.01}
Selected features: Index(['MIN_x', 'FGM', 'FG%', '3PM', '3PA', '3P%', 'FTA', 'FT%', 'REB', 'STL', 'BLK', 'PF', 'RANK', 'W', 'L', 'AST%', 'OREB%', 'DREB%', 'TOV%', 'eFG%', 'PACE', 'PIE', 'Date_in_Seconds', 'home_away'], dtype='object')
Final RMSE using selected features: 0.005621564929716534


Unnamed: 0,Date,Matchup,Team,Away,Home/Away_game,W/L,MIN_x,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-,season,RANK,TEAM,GP,W,L,MIN_y,OffRtg,DefRtg,NetRtg,AST%,AST/TO,ASTRatio,OREB%,DREB%,REB%,TOV%,eFG%,TS%,PACE,PIE,POSS,season_defense,USG,team_pace,team_offrtg,team_poss,Date_in_Seconds,home_away
0,2022-12-28,SAC vs. DEN,SAC,DEN,Home,W,23.48,8,2,8,25.0,0,2,0.0,4,6,66.7,2,0,2,2,1,0,1,1,-13,2022-23,4,DEN,82,53,29,3951.0,116.8,113.5,3.3,66.3,1.99,20.5,28.9,72.7,51.4,14.7,57.3,60.1,98.74,52.5,8129,2022-23,16.8,102.07,109.7,8451,1672185600,0
1,2022-12-30,SAC vs. UTA,SAC,UTA,Home,W,31.39,10,3,10,30.0,1,4,25.0,3,4,75.0,1,2,3,1,2,0,0,1,0,2022-23,22,UTA,82,37,45,3961.0,115.3,116.0,-0.7,61.1,1.69,18.2,30.8,71.1,51.2,15.1,54.7,58.4,101.02,49.8,8329,2022-23,16.8,102.07,109.7,8451,1672358400,0
2,2023-01-01,SAC @ MEM,SAC,MEM,Away,L,28.32,16,4,12,33.3,4,9,44.4,4,4,100.0,0,5,5,1,1,0,2,0,-2,2022-23,5,MEM,82,51,31,3956.0,114.7,110.7,4.0,59.6,1.91,18.2,30.2,71.1,50.7,13.3,54.0,57.0,101.5,51.9,8360,2022-23,16.8,102.07,109.7,8451,1672531200,1
