In [None]:
from data_functions import his_usage_team
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

def select_features(player_names, date_list, usage_path, player_base_path, defense_base_path):
    player_df, _ = his_usage_team(player_names, date_list, usage_path, player_base_path, defense_base_path)
    
    selected_features_dict = {}
    
    for player, df in player_df.items():
        df_X = df.drop(columns=['PTS','Date','Matchup','Team','Home/Away_game','W/L', 'Away', 'season', 'TEAM', 'season_defense'])
        
        scaler = StandardScaler()
        X = scaler.fit_transform(df_X)
        y = df['PTS']  # Target variable
        
        # Grid search parameters for Lasso
        param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10]}
        grid_search = GridSearchCV(Lasso(), param_grid, cv=5, scoring='r2')
        grid_search.fit(X, y)
        
        # Get the best alpha and fit Lasso
        best_alpha = grid_search.best_params_['alpha']
        best_lasso = Lasso(alpha=best_alpha)
        best_lasso.fit(X, y)
        
        # Select non-zero coefficient features
        X = pd.DataFrame(X, columns=df_X.columns)
        selected_features = X.columns[best_lasso.coef_ != 0].tolist()
        selected_features_dict[player] = selected_features
        
    return selected_features_dict


player_names = {
    "Jayson Tatum": "BOS",
    # "Nikola Jokic": "DEN",
    "Jamal Murray": "DEN",
    "Jaylen Brown": "BOS",
    "Derrick White": "BOS",
    "Payton Pritchard": "BOS",
    "Michael Porter Jr.": "DEN",
    "Russell Westbrook": "DEN",
    "Christian Braun": "DEN",
    "Al Horford": "BOS",
    # "Julian Strawther": "DEN",
    "Sam Hauser": "BOS",
    "Zeke Nnaji": "DEN",
    "Luke Kornet": "BOS"
}
date_list = ["2022-23","2023-24","2024-25"]
usage_path = "D:/nba_usage_csv_historic/usage_csv_{date}/{date}_content.csv"
schedule_base_path = "D:/nba_scheduled_csv/schedule_csv_2025/{schedule_team}_schedule_content.csv"
player_base_path = "D:/nba_player_csv_historic/season_{date}/all_quarters/{player}_content.csv"
defense_base_path = "D:/nba_defense_history_csv/defense_csv_{date}/all_quarter_defense_content.csv"


select_features(player_names, date_list, usage_path, player_base_path, defense_base_path)


In [None]:
from data_functions import his_usage_team
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
import numpy as np
from sklearn.preprocessing import StandardScaler
import pandas as pd

{ 'Alex Caruso':'OKC', 'Isaiah Hartenstein':'OKC', 'Shai Gilgeous-Alexander':'OKC'}
{'Chris Paul': 'SAS',"De'Aaron Fox": 'SAS', "Devin Vassell": 'SAS',"Harrison Barnes": 'SAS'}

player_names = {"Payton Pritchard": 'BOS'}
date_list = ["2022-23","2023-24","2024-25"]
usage_path = "D:/nba_usage_csv_historic/usage_csv_{date}/{date}_content.csv"
schedule_base_path = "D:/nba_scheduled_csv/schedule_csv_2025/{schedule_team}_schedule_content.csv"
player_base_path = "D:/nba_player_csv_historic/season_{date}/all_quarters/{player}_content.csv"
defense_base_path = "D:/nba_defense_history_csv/defense_csv_{date}/all_quarter_defense_content.csv"


player_df, current_defense_stat = his_usage_team(player_names, date_list, usage_path, player_base_path, defense_base_path)



for player, df in player_df.items():
    print(player)


    from sklearn.linear_model import Lasso
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split
    import numpy as np

    df_X = df.drop(columns=['PTS','Date','Matchup','Team','Home/Away_game','W/L', 'Away', 'season', 'TEAM', 'season_defense'])



    scaler = StandardScaler()
    X = scaler.fit_transform(df_X)

    y = df['PTS']  # Replace with your actual target


    # Define the grid search parameters for Lasso (L1 regularization)
    param_grid = {
        'alpha': [0.001, 0.01, 0.1, 1, 10]  # Different levels of regularization strength
    }

    # Perform grid search
    grid_search = GridSearchCV(Lasso(), param_grid, cv=5, scoring='r2')
    grid_search.fit(X, y)

    # Print best parameters
    print("Best alpha:", grid_search.best_params_)

    X = pd.DataFrame(X, columns=df_X.columns)  # Convert back to DataFrame


    # Get the best alpha
    best_alpha = grid_search.best_params_['alpha']

    # Fit Lasso with the best alpha
    best_lasso = Lasso(alpha=best_alpha)
    best_lasso.fit(X, y)


    # Get selected (non-zero) feature indices
    selected_features = X.columns[best_lasso.coef_ != 0]
    print("Selected features:", selected_features)

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # display(y_test)

    timestamp = int(pd.Timestamp('2024-12-31').timestamp())
    train_data = df[df['Date_in_Seconds'] < timestamp]  # Replace '2023-01-01' with the corresponding timestamp
    test_data =  df[df['Date_in_Seconds'] >= timestamp]  # Replace '2023-01-01' with the corresponding timestamp

    X_train = train_data[selected_features]
    y_train = train_data['FGM']
    X_test = test_data[selected_features]
    y_test = test_data['FGM']



    # Reduce X to selected features only
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]


    # Retrain Lasso only on selected features
    final_lasso = Lasso(alpha=best_alpha)
    final_lasso.fit(X_train_selected, y_train)

    # Predict on the test set using the reduced feature set
    y_pred = final_lasso.predict(X_test_selected)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print("Final RMSE using selected features:", rmse)


In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
import pandas as pd
from data_functions import his_usage_team

def select_features(player_names, date_list, usage_path, player_base_path, defense_base_path, target):
    player_df, _ = his_usage_team(player_names, date_list, usage_path, player_base_path, defense_base_path)
    
    selected_features_dict = {}
    
    max_features_player = None
    max_features = 0

    for player, df in player_df.items():
        df_X = df.drop(columns=[target,'Date','Matchup','Team','Home/Away_game','W/L', 'Away', 'season', 'TEAM', 'season_defense'])
        
        scaler = StandardScaler()
        X = scaler.fit_transform(df_X)
        y = df[target]  # Target variable
        
        # Grid search parameters for Lasso
        param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10]}
        grid_search = GridSearchCV(Lasso(), param_grid, cv=5, scoring='r2')
        grid_search.fit(X, y)
        
        # Get the best alpha and fit Lasso
        best_alpha = grid_search.best_params_['alpha']
        best_lasso = Lasso(alpha=best_alpha)
        best_lasso.fit(X, y)
        
        # Select non-zero coefficient features
        X = pd.DataFrame(X, columns=df_X.columns)
        selected_features = X.columns[best_lasso.coef_ != 0].tolist()
        
        # Store selected features
        selected_features_dict[player] = selected_features
        
        # Track the player with the most features
        if len(selected_features) > max_features:
            max_features = len(selected_features)
            max_features_player = player

    # If a player has no selected features, assign the features of the player with the most features
    for player in selected_features_dict:
        if not selected_features_dict[player]:  # If empty
            selected_features_dict[player] = selected_features_dict.get(max_features_player, [])

    return selected_features_dict


player_names = {
    "Devin Booker": "PHX",
    "Anthony Edwards": "MIN",
    "Kevin Durant": "PHX",
    "Naz Reid": "MIN",
    "Julius Randle": "MIN",
    "Bradley Beal": "PHX",
    "Bol Bol": "PHX",
    "Donte DiVincenzo": "MIN",
    "Jaden McDaniels": "MIN",
    "Nick Richards": "PHX",
    "Mike Conley": "MIN"
}

date_list = ["2022-23","2023-24","2024-25"]
usage_path = "D:/nba_usage_csv_historic/usage_csv_{date}/{date}_content.csv"
schedule_base_path = "D:/nba_scheduled_csv/schedule_csv_2025/{schedule_team}_schedule_content.csv"
player_base_path = "D:/nba_player_csv_historic/season_{date}/all_quarters/{player}_content.csv"
defense_base_path = "D:/nba_defense_history_csv/defense_csv_{date}/all_quarter_defense_content.csv"


feature_dic = select_features(player_names, date_list, usage_path, player_base_path, defense_base_path,'AST')

for player, features in feature_dic.items():
    # print(player)
    features = feature_dic[player] 
    print(player,':',features)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Devin Booker : ['MIN_x', 'PTS', 'FG%', '3PM', 'FTM', 'FTA', 'STL', 'BLK', '+/-', 'MIN_y', 'DefRtg', 'AST%', 'ASTRatio', 'DREB%', 'REB%', 'Date_in_Seconds']
Anthony Edwards : ['MIN_x', 'FG%', '3P%', 'FT%', 'DREB', 'STL', 'BLK', 'TOV', '+/-', 'L', 'DefRtg', 'AST%', 'OREB%', 'DREB%', 'REB%', 'TOV%', 'home_away']
Kevin Durant : ['MIN_x', 'FG%', '3P%', 'FT%', 'DREB', 'STL', 'BLK', 'TOV', '+/-', 'L', 'DefRtg', 'AST%', 'OREB%', 'DREB%', 'REB%', 'TOV%', 'home_away']
Naz Reid : ['MIN_x', '3PA', '3P%', 'DREB', 'REB', 'TOV', '+/-', 'GP', 'OffRtg', 'AST/TO']
Julius Randle : ['MIN_x', 'FG%', '3P%', 'FT%', 'DREB', 'STL', 'BLK', 'TOV', '+/-', 'L', 'DefRtg', 'AST%', 'OREB%', 'DREB%', 'REB%', 'TOV%', 'home_away']
Bradley Beal : ['MIN_x', 'PTS', 'FGM', 'FG%', 'OREB', 'STL', 'BLK', 'TOV', 'PF', '+/-', 'L', 'DefRtg', 'TOV%', 'PACE', 'POSS', 'home_away']
Bol Bol : ['MIN_x', 'DREB', 'STL', '+/-', 'team_pace', 'Date_in_Seconds']
Donte DiVincenzo : ['MIN_x', '3P%', 'FTA', 'DREB', 'STL', 'BLK', '+/-', 'DefRtg'

In [None]:
from data_functions import his_usage_team
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

def select_features(player_names, date_list, usage_path, player_base_path, defense_base_path, target):
    player_df, _ = his_usage_team(player_names, date_list, usage_path, player_base_path, defense_base_path)
    
    selected_features_dict = {}
    
    for player, df in player_df.items():
        df_X = df.drop(columns=[target,'Date','Matchup','Team','Home/Away_game','W/L', 'Away', 'season', 'TEAM', 'season_defense'])
        
        scaler = StandardScaler()
        X = scaler.fit_transform(df_X)
        y = df[target]  # Target variable
        
        # Grid search parameters for Lasso
        param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10]}
        grid_search = GridSearchCV(Lasso(), param_grid, cv=5, scoring='r2')
        grid_search.fit(X, y)
        
        # Get the best alpha and fit Lasso
        best_alpha = grid_search.best_params_['alpha']
        best_lasso = Lasso(alpha=best_alpha)
        best_lasso.fit(X, y)
        
        # Select non-zero coefficient features
        X = pd.DataFrame(X, columns=df_X.columns)
        selected_features = X.columns[best_lasso.coef_ != 0].tolist()
        selected_features_dict[player] = selected_features
        
    return selected_features_dict


player_names = {
    "Devin Booker": "PHX",
    "Anthony Edwards": "MIN",
    "Kevin Durant": "PHX",
    "Naz Reid": "MIN",
    "Julius Randle": "MIN",
    "Bradley Beal": "PHX",
    "Bol Bol": "PHX",
    "Donte DiVincenzo": "MIN",
    "Jaden McDaniels": "MIN",
    "Nick Richards": "PHX",
    "Mike Conley": "MIN"
}
date_list = ["2022-23","2023-24","2024-25"]
usage_path = "D:/nba_usage_csv_historic/usage_csv_{date}/{date}_content.csv"
schedule_base_path = "D:/nba_scheduled_csv/schedule_csv_2025/{schedule_team}_schedule_content.csv"
player_base_path = "D:/nba_player_csv_historic/season_{date}/all_quarters/{player}_content.csv"
defense_base_path = "D:/nba_defense_history_csv/defense_csv_{date}/all_quarter_defense_content.csv"


feature_dic = select_features(player_names, date_list, usage_path, player_base_path, defense_base_path,'AST')

for player, features in feature_dic.items():
    # print(player)
    features = feature_dic[player] 
    print(player,':',features)

In [None]:
# features for shai
# 'MIN_x', 'FGM', 'FG%', '3PA', 'OREB', 'REB', 'RANK'

from feature_function import his_usage_team, select_features
from data_functions import his_player_defense_data, current_player_defense_data, build_data_path
import pandas as pd
import numpy as np
from IPython.display import display


# player_names = {"Payton Pritchard": 'BOS'}

player_names = {
    "Mark Williams": "CHA",
    # "Miles Bridges": "CHA",
    # "Jimmy Butler": "GSW",
    "LaMelo Ball": "CHA",
    "Stephen Curry": "GSW",
    # "Brandin Podziemski": "GSW",
    # "Nick Smith Jr.": "CHA",
    "Josh Green": "CHA",
    "Moses Moody": "GSW",
    "Draymond Green": "GSW",
    "Paul George": "PHI",
    "Tyrese Maxey": "PHI",
    "Kelly Oubre Jr.": "PHI",
    "Andre Drummond": "PHI",
    "Shaedon Sharpe": "POR",
    "Quentin Grimes": "PHI",
    "Deni Avdija": "POR",
    "Anfernee Simons": "POR",
    # "Toumani Camara": "POR",
    # "Donovan Clingan": "POR",
    "Bam Adebayo": "MIA",
    "Tyler Herro": "MIA",
    "Andrew Wiggins": "MIA",
    # "Bilal Coulibaly": "WAS",
    # "Alexandre Sarr": "WAS",
    "Khris Middleton": "WAS",
    # "Carlton Carrington": "WAS",
    "Davion Mitchell": "MIA",
    # "Kyshawn George": "WAS",
    "Haywood Highsmith": "MIA"
}


date_list = ["2022-23","2023-24","2024-25"]
usage_path = "D:/nba_usage_csv_historic/usage_csv_{date}/{date}_content.csv"
schedule_base_path = "D:/nba_scheduled_csv/schedule_csv_2025/{schedule_team}_schedule_content.csv"
player_base_path = "D:/nba_player_csv_historic/season_{date}/all_quarters/{player}_content.csv"
defense_base_path = "D:/nba_defense_history_csv/defense_csv_{date}/all_quarter_defense_content.csv"


def prediction(player_names: dict, date_list: list, usage_path, player_base_path, defense_base_path, schedule_base_path,selected_feature_target, prediction_target):
    fga_prediction_data, df_defense = his_usage_team(player_names, date_list, usage_path, player_base_path, defense_base_path)
    fga_prediction_results = {}

    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import cross_val_score
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    import pandas

    # this is the function that will be used to select the features for the model prints out the player and the best features to use
    feature_dic = select_features(player_names, date_list, usage_path, player_base_path, defense_base_path,selected_feature_target)




    for player, team in player_names.items():
        # Get schedule data for the player's team
        schedule_path = build_data_path(schedule_base_path, schedule_team=team)
        schedule_df = pd.read_csv(schedule_path)
        
        # Get player-specific prediction data

        df = fga_prediction_data[player]
        


        features = feature_dic[player] 
        # print(features)
        target = prediction_target

        timestamp = int(pd.Timestamp('2025-02-').timestamp())
        train_data = df[df['Date_in_Seconds'] < timestamp]
        test_data = df[df['Date_in_Seconds'] >= timestamp]

        display(test_data

        X_train = train_data[features].fillna(0)
        y_train = train_data[target].fillna(0)
        X_test = test_data[features].fillna(0)
        y_test = test_data[target].fillna(0) 

        model = LinearRegression()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)


        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)

        print(f"{player}, MAE: {mae}, RMSE: {rmse}")

        # EWMA calculation for minutes
        alpha = 0.2
        df['EWMA_MIN'] = df['MIN_x'].ewm(span=(2/alpha - 1), adjust=False).mean()
        last_actual = df['MIN_x'].iloc[-1]
        last_smoothed = df['EWMA_MIN'].iloc[-1]
        next_value = alpha * last_actual + (1 - alpha) * last_smoothed
        next_value = round(next_value, 2)


        # Exclude columns
        exclude_features = ['RANK', 'OffRtg', 'W', 'L', 
            'DefRtg','NetRtg', 'AST%', 'AST/TO', 'ASTRatio',
            'OREB%', 'DREB%', 'REB%', 'TOV%' , 'eFG%', 'TS%', 'PACE',
            'POSS', 'TEAM', 'PIE']
        
        exclude_features_schedule = ['home_away', 'schedule_team', 'DATE', 'location', 'season_defense']

        # Get defensive stats for the scheduled team
        schedule_df['Date_in_Seconds'] = pd.to_datetime(schedule_df['DATE']).astype('int64') // 10**9
        schedule_df['home_away'] = schedule_df['location'].apply(lambda x: 1 if x == 'away' else 0)

        # df_defense = df_defense.merge(schedule_df, left_on='TEAM', right_on = 'schedule_team', how='outer', suffixes=('', '_DROP'))
        # df_defense = df_defense.drop(columns=[col for col in df_defense.columns if col.endswith('_DROP')])
        # df_defense= df_defense.fillna(0)


        # display(df_for_schedule)

        


        last_season = df_defense["season_defense"].iloc[-1]
        df_for_schedule = df_defense.loc[df_defense["season_defense"] == last_season, exclude_features]

        # display(df_for_schedule)


        first_team = schedule_df['schedule_team'].iloc[0]


        schedule_team_result = schedule_df.loc[schedule_df['schedule_team'] == first_team, 'schedule_team'].values[0]


        schedule_values = {feature: df_for_schedule.loc[df_for_schedule['TEAM'] == schedule_team_result, feature].values[0] 
                for feature in exclude_features if feature in df_for_schedule.columns}

        print(schedule_values)


        # checking to see if the schedule values are being passed correctly
        
        rolling_features = [col for col in features  if col not in exclude_features]
        # display(rolling_features)

        # df = df[df['Date_in_Seconds'] >= timestamp]
        # # display(df)

        for col in rolling_features:
            df[f'{col}'] = df[col].rolling(window=20).mean().fillna(0).astype(int)

        df_last_rolling = df.iloc[[-1]][[f'{col}' for col in rolling_features]]

        df_last_rolling = df_last_rolling.reset_index(drop=True)

        for value in features:
            if value in exclude_features:
                print(f'This is the value {value}')
                df_last_rolling[value] = schedule_values.get(value)
                # df_last_rolling[value] = schedule_df[value].iloc[0]

        for value in features:
            if value in exclude_features_schedule:
                print(f'This is the value {value}')
                # df_last_rolling[value] = schedule_values.get(value)
                df_last_rolling[value] = schedule_df[value].iloc[0]        



        df_last_rolling = df_last_rolling.reindex(columns=features)

        X_future = df_last_rolling

        display(X_future)

        future_predictions = model.predict(X_future).astype('int')
        fga_prediction_results[player] = [future_predictions[0].round(1), rmse]
        df_results = pd.DataFrame.from_dict(fga_prediction_results, orient='index', columns=[target, 'RMSE'])
        # Reset index and rename it properly
        df_results.reset_index(inplace=True)
        df_results.rename(columns={'index': 'Player'}, inplace=True)
        # display(df_results)


    return   df_results

results = prediction(player_names, date_list, usage_path, player_base_path, defense_base_path, schedule_base_path,'PTS','PTS')


# for player, fga_predictions in results.items():
#     print(fga_predictions)

display(results)


In [1]:
# features for shai
# 'MIN_x', 'FGM', 'FG%', '3PA', 'OREB', 'REB', 'RANK'

from feature_function import his_usage_team
from data_functions import his_player_defense_data, current_player_defense_data, build_data_path
import pandas as pd
import numpy as np
from IPython.display import display


player_names ={"Jaylen Brown": 'BOS'}
date_list = ["2022-23","2023-24","2024-25"]
usage_path = "D:/nba_usage_csv_historic/usage_csv_{date}/{date}_content.csv"
schedule_base_path = "D:/nba_scheduled_csv/schedule_csv_2025/{schedule_team}_schedule_content.csv"
player_base_path = "D:/nba_player_csv_historic/season_{date}/all_quarters/{player}_content.csv"
defense_base_path = "D:/nba_defense_history_csv/defense_csv_{date}/all_quarter_defense_content.csv"


def fga_prediction(player_names: dict, date_list: list, usage_path, player_base_path, defense_base_path, schedule_base_path):
    fga_prediction_data, df_defense = his_usage_team(player_names, date_list, usage_path, player_base_path, defense_base_path)
    fga_prediction_results = {}

    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import cross_val_score
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    import pandas




    for player, team in player_names.items():
        # Get schedule data for the player's team
        schedule_path = build_data_path(schedule_base_path, schedule_team=team)
        schedule_df = pd.read_csv(schedule_path)
        
        # Get player-specific prediction data

        df = fga_prediction_data[player]


        features = selected_features 
        print(features)
        target = 'FGA'

        timestamp = int(pd.Timestamp('2024-12-31').timestamp())
        train_data = df[df['Date_in_Seconds'] < timestamp]
        test_data = df[df['Date_in_Seconds'] >= timestamp]

        display(test_data)

        X_train = train_data[features]
        y_train = train_data[target]
        X_test = test_data[features]
        y_test = test_data[target]

        # # Initialize Scaler
        # scaler = StandardScaler()
        # # Transform Data
        # scaled_data_x = scaler.fit_transform(X_train)
        # scaled_data_y = scaler.fit_transform(y_train.values.reshape(-1, 1))

        # X_train = pd.DataFrame(scaled_data_x, columns=X_train.columns)     
        # y_train = pd.DataFrame(scaled_data_y, columns=['features'])   

        model = LinearRegression()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)


        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)

        print(f"{player}, MAE: {mae}, RMSE: {rmse}")

        # EWMA calculation for minutes
        alpha = 0.2
        df['EWMA_MIN'] = df['MIN_x'].ewm(span=(2/alpha - 1), adjust=False).mean()
        last_actual = df['MIN_x'].iloc[-1]
        last_smoothed = df['EWMA_MIN'].iloc[-1]
        next_value = alpha * last_actual + (1 - alpha) * last_smoothed
        next_value = round(next_value, 2)


        # Exclude columns
        exclude_features = ['RANK', 'OffRtg', 'W', 'L', 
            'DefRtg','NetRtg', 'AST%', 'AST/TO', 'ASTRatio',
            'OREB%', 'DREB%', 'REB%', 'TOV%' , 'eFG%', 'TS%', 'PACE',
            'POSS', 'TEAM', 'home_away', 'PIE']

        # Get defensive stats for the scheduled team
        schedule_df['Date_in_Seconds'] = pd.to_datetime(schedule_df['DATE']).astype('int64') // 10**9
        schedule_df['home_away'] = schedule_df['location'].apply(lambda x: 1 if x == 'away' else 0)

        df_defense = df_defense.merge(schedule_df, left_on='TEAM', right_on = 'schedule_team', how='outer')

        # display(df_for_schedule)

        


        last_season = df_defense["season_defense"].iloc[-1]
        df_for_schedule = df_defense.loc[df_defense["season_defense"] == last_season, exclude_features]

        # display(df_for_schedule)


        first_team = schedule_df['schedule_team'].iloc[0]
        # could refactor to use squeeze but simpler to read this way
        schedule_team_result = schedule_df.loc[schedule_df['schedule_team'] == first_team, 'schedule_team'].values[0]


        schedule_values = {feature: df_for_schedule.loc[df_for_schedule['TEAM'] == schedule_team_result, feature].values[0] 
                   for feature in exclude_features if feature in df_for_schedule.columns}

        print(schedule_values)


        # checking to see if 
        
        rolling_features = [col for col in selected_features  if col not in exclude_features]

        for col in rolling_features:
            df[f'{col}'] = df[col].rolling(window=10).mean()

        df_last_rolling = df.iloc[[-1]][[f'{col}' for col in rolling_features]]

        df_last_rolling = df_last_rolling.reset_index(drop=True)

        for value in selected_features:
            if value in exclude_features:
                print(f'This is the value {value}')
                df_last_rolling[value] = schedule_values.get(value)



        df_last_rolling = df_last_rolling.reindex(columns=selected_features)


        X_future = df_last_rolling

        display(X_future)

        future_predictions = model.predict(X_future)
        fga_prediction_results[player] = player , future_predictions[0].round(1)

    return fga_prediction_results

results = fga_prediction(player_names, date_list, usage_path, player_base_path, defense_base_path, schedule_base_path)


for player, fga_predictions in results.items():
    print(fga_predictions)


NameError: name 'selected_features' is not defined

In [None]:
from feature_function import fga_prediction


SAS = {'Chris Paul': 'SAS',"De'Aaron Fox": 'SAS', "Devin Vassell": 'SAS',"Harrison Barnes": 'SAS'}
OKC = { 'Alex Caruso':'OKC', 'Isaiah Hartenstein':'OKC', 'Shai Gilgeous-Alexander':'OKC'}

player_names = SAS
date_list = ["2022-23","2023-24","2024-25"]
usage_path = "D:/nba_usage_csv_historic/usage_csv_{date}/{date}_content.csv"
schedule_base_path = "D:/nba_scheduled_csv/schedule_csv_2025/{schedule_team}_schedule_content.csv"
player_base_path = "D:/nba_player_csv_historic/season_{date}/all_quarters/{player}_content.csv"
defense_base_path = "D:/nba_defense_history_csv/defense_csv_{date}/all_quarter_defense_content.csv"

results = fga_prediction(player_names, date_list, usage_path, player_base_path, defense_base_path, schedule_base_path)


for player, fga_predictions in results.items():
    print(fga_predictions)


In [None]:
def build_data_path(base_path, **Kwargs):
    # Replace placeholders with actual values
    for key, value in Kwargs.items():
        base_path=base_path.replace(f"{{{key}}}", str(value))
    return base_path

base_path = "/data/{year}/{month}/{day}/file.txt"
kwargs = {'year': 2025, 'month': '02', 'day': '10'}
year = 2025
build_data_path(base_path, year=year)

In [None]:
import pandas as pd

schedule_df = pd.read_csv("D:/nba_scheduled_csv/schedule_csv_2025/OKC_schedule_content.csv")

schedule_df

In [None]:
def fga_prediction(player_names: dict, date_list: list, usage_path, player_base_path, defense_base_path, schedule_base_path):
    fga_prediction_data, df_defense = his_usage_team(player_names, date_list,usage_path,player_base_path, defense_base_path)

    fga_prediction_results = {}

    for player, team in player_names.items():

        schedule_path = build_data_path(schedule_base_path, schedule_team=team)
        schedule_df = pd.read_csv(schedule_path)
        
        # print(schedule_df)

    for player, df in fga_prediction_data.items():

        from sklearn.model_selection import train_test_split
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.metrics import mean_squared_error
        from sklearn.model_selection import cross_val_score
        from sklearn.linear_model import LinearRegression
        from sklearn.metrics import mean_absolute_error, mean_squared_error
        import numpy as np



        features = ['PACE', 'team_pace', 'USG', 'DefRtg','MIN_x', 'home_away', 'Date_in_Seconds','OffRtg', 'team_offrtg']
        target = 'FGA'


        # Continue with your existing operations
        timestamp = int(pd.Timestamp('2024-12-31').timestamp())
        train_data = df[df['Date_in_Seconds'] < timestamp]  # Replace '2023-01-01' with the corresponding timestamp
        test_data =  df[df['Date_in_Seconds'] >= timestamp]  # Replace '2023-01-01' with the corresponding timestamp


        X_train = train_data[features]
        y_train = train_data[target]
        X_test = test_data[features]
        y_test = test_data[target]


        model = LinearRegression()
        model.fit(X_train,y_train)

        y_pred = model.predict(X_test)

        ###### predicting Minutes ##########
        alpha = 0.2

        df['EWMA_MIN'] = df['MIN_x'].ewm(span=(2/alpha - 1), adjust=False).mean()

        last_actual = df['MIN_x'].iloc[-1]  # Last known FGA
        last_smoothed = df['EWMA_MIN'].iloc[-1]  # Last smoothed value
        next_value = alpha * last_actual + (1 - alpha) * last_smoothed
        next_value = next_value.round(2)


        ##################################################################

        last_season = df_defense["season_defense"].iloc[-1] 
        df_for_schedule = df_defense.loc[df_defense["season_defense"] == last_season, ['TEAM','PACE', 'DefRtg', 'OffRtg']]

        # display(df_for_schedule)


        first_team = schedule_df['schedule_team'].iloc[0]
        # print(first_team)
        schedule_team_result = schedule_df.loc[schedule_df['schedule_team'] == first_team, 'schedule_team'].values[0]
        # print(schedule_team_result)

        schedule_defrtg = df_for_schedule.loc[df_for_schedule['TEAM'] == schedule_team_result, 'DefRtg'].values[0]

        schedule_pace = df_for_schedule.loc[df_for_schedule['TEAM'] == schedule_team_result, 'PACE'].values[0]

        schedule_offrtg = df_for_schedule.loc[df_for_schedule['TEAM'] == schedule_team_result, 'OffRtg'].values[0]

        # print("dfrtg:",schedule_defrtg)
        # print("pace",schedule_pace)
        # print("offrtg",schedule_offrtg)

        # This is to turn the first date in schedule into seconds
        schedule_df['Date_in_Seconds'] = pd.to_datetime(schedule_df['DATE']).astype('int64') // 10**9

        # This is to turn the home and away games into a 1 or a zero -> away is 1 and anything is zero
        schedule_df['home_away'] = schedule_df['location'].apply(lambda x: 1 if x == 'away' else 0)


        

        # display(schedule_df)





        X_future = pd.DataFrame({
        # 'EWMA_FGA_2': [next_value], 
        'PACE':[schedule_pace], 
        'team_pace':[df['team_pace'].iloc[-1]], 
        'USG':[df['USG'].iloc[-1]],
        'DefRtg':[schedule_defrtg],
        'MIN_x':[next_value], 
        'home_away':[schedule_df['home_away'].iloc[0]],
        'Date_in_Seconds':[schedule_df['Date_in_Seconds'].iloc[0]],
        'OffRtg':[df['OffRtg'].iloc[-1]],
        'team_offrtg': [schedule_offrtg]
        })

        display(X_future)

        future_predictions = model.predict(X_future)



        # print("this is future",future_predictions)

        fga_prediction_results.update({player:future_predictions[0]})

        
    



    return fga_prediction_results

In [None]:
from data_functions import his_player_defense_data, current_player_defense_data, build_data_path
import pandas as pd
import numpy as np
from IPython.display import display


def his_usage_team(player_names: dict, date_list: list, usage_path,player_base_path,defense_base_path):
    current_player_dic = {}

    for player, team in player_names.items():
        current_player_frames =[]

        for date in date_list:
            usage_path =build_data_path(usage_path,date=date)
            usage_data = pd.read_csv(usage_path)

            #merging player and defense dat into one
            merged_data, current_defense_df = his_player_defense_data(player_base_path,defense_base_path,player,date)

            #adding season to usage_data
            usage_data['season'] = date

            #Getting the player usage percentage for usage data and adding to merge
            player_usage = usage_data.loc[usage_data['Player'] == player, 'USG%'].values[0]
            merged_data['USG'] = player_usage

            #adding the current player team pace
            team_stat = current_defense_df.loc[current_defense_df['TEAM'] == team, 'PACE'].values[0]
            merged_data["team_pace"] = team_stat

            # adding current player team OffRtg
            team_offrtg = current_defense_df.loc[current_defense_df['TEAM'] == team, 'OffRtg'].values[0]
            merged_data["team_offrtg"] = team_offrtg

            team_poss = current_defense_df.loc[current_defense_df['TEAM'] == team, 'POSS'].values[0]
            merged_data["team_poss"] = team_poss
            
            # Exclude rows where the TEAM column matches the given team
            merged_data = merged_data[merged_data['TEAM'] != team]


            # merged_data = merged_data[['season','Date', 'Home/Away_game' ,'Matchup' ,'PTS','MIN_x', 'Team', 'TEAM', 'FGA', 'USG', 'DefRtg', 'PACE','team_pace']]

            # Turn date into seconds
            merged_data['Date_in_Seconds'] = pd.to_datetime(merged_data['Date']).astype('int64') // 10**9
            merged_data = merged_data.sort_values(by="Date_in_Seconds")


            # Turn Home/Away game into 1 and 0
            merged_data['home_away'] = merged_data['Home/Away_game'].apply(lambda x: 1 if x == 'Away' else 0)
            # Dropping duplicates
            merged_data = merged_data.drop_duplicates()
            
            # Append the DataFrame for this date to the player's list
            current_player_frames.append(merged_data)

        # Combine all dates for the current player into one DataFrame
        current_player_dic[player] = pd.concat(current_player_frames, ignore_index=True)


    return current_player_dic, current_defense_df


player_names = { 'Alex Caruso':'OKC', 'Isaiah Hartenstein':'OKC', 'Shai Gilgeous-Alexander':'OKC'}
date_list = ["2022-23","2023-24","2024-25"]
usage_path = "D:/nba_usage_csv_historic/usage_csv_{date}/{date}_content.csv"
schedule_base_path = "D:/nba_scheduled_csv/schedule_csv_2025/{schedule_team}_schedule_content.csv"
player_base_path = "D:/nba_player_csv_historic/season_{date}/all_quarters/{player}_content.csv"
defense_base_path = "D:/nba_defense_history_csv/defense_csv_{date}/all_quarter_defense_content.csv"


def fga_prediction(player_names: dict, date_list: list, usage_path, player_base_path, defense_base_path, schedule_base_path):
    fga_prediction_data, df_defense = his_usage_team(player_names, date_list, usage_path, player_base_path, defense_base_path)
    fga_prediction_results = {}

    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import cross_val_score
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    import numpy as np
    from sklearn.preprocessing import StandardScaler




    for player, team in player_names.items():
        # Get schedule data for the player's team
        schedule_path = build_data_path(schedule_base_path, schedule_team=team)
        schedule_df = pd.read_csv(schedule_path)
        
        # Get player-specific prediction data

        df = fga_prediction_data[player]





        
        # Model training and prediction code
        features = ['PACE', 'team_pace', 'USG', 'DefRtg', 'MIN_x', 'home_away', 'Date_in_Seconds', 'OffRtg', 'team_offrtg']
        target = 'FGA'

        timestamp = int(pd.Timestamp('2024-12-31').timestamp())
        train_data = df[df['Date_in_Seconds'] < timestamp]
        test_data = df[df['Date_in_Seconds'] >= timestamp]

        X_train = train_data[features]
        y_train = train_data[target]
        X_test = test_data[features]
        y_test = test_data[target]

        # Initialize Scaler
        scaler = StandardScaler()
        # Transform Data
        scaled_data_x = scaler.fit_transform(X_train)
        scaled_data_y = scaler.fit_transform(y_train.values.reshape(-1, 1))

        X_train = pd.DataFrame(scaled_data_x, columns=X_train.columns)     
        y_train = pd.DataFrame(scaled_data_y, columns=['features'])   

        model = LinearRegression()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)


        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)

        print(f"{player}, MAE: {mae}, RMSE: {rmse}")

        # EWMA calculation for minutes
        alpha = 0.2
        df['EWMA_MIN'] = df['MIN_x'].ewm(span=(2/alpha - 1), adjust=False).mean()
        last_actual = df['MIN_x'].iloc[-1]
        last_smoothed = df['EWMA_MIN'].iloc[-1]
        next_value = alpha * last_actual + (1 - alpha) * last_smoothed
        next_value = round(next_value, 2)

        # Get defensive stats for the scheduled team
        last_season = df_defense["season_defense"].iloc[-1]
        df_for_schedule = df_defense.loc[df_defense["season_defense"] == last_season, ['TEAM', 'PACE', 'DefRtg', 'OffRtg']]

        first_team = schedule_df['schedule_team'].iloc[0]
        schedule_team_result = schedule_df.loc[schedule_df['schedule_team'] == first_team, 'schedule_team'].values[0]
        schedule_defrtg = df_for_schedule.loc[df_for_schedule['TEAM'] == schedule_team_result, 'DefRtg'].values[0]
        schedule_pace = df_for_schedule.loc[df_for_schedule['TEAM'] == schedule_team_result, 'PACE'].values[0]
        schedule_offrtg = df_for_schedule.loc[df_for_schedule['TEAM'] == schedule_team_result, 'OffRtg'].values[0]

        # Convert schedule dates to seconds
        schedule_df['Date_in_Seconds'] = pd.to_datetime(schedule_df['DATE']).astype('int64') // 10**9
        schedule_df['home_away'] = schedule_df['location'].apply(lambda x: 1 if x == 'away' else 0)

        # Create future prediction dataframe
        X_future = pd.DataFrame({
            'PACE': [schedule_pace],
            'team_pace': [df['team_pace'].iloc[-1]],
            'USG': [df['USG'].iloc[-1]],
            'DefRtg': [schedule_defrtg],
            'MIN_x': [next_value],
            'home_away': [schedule_df['home_away'].iloc[0]],
            'Date_in_Seconds': [schedule_df['Date_in_Seconds'].iloc[0]],
            'OffRtg': [df['OffRtg'].iloc[-1]],
            'team_offrtg': [schedule_offrtg]
        })

        future_predictions = model.predict(X_future)
        fga_prediction_results[player] = future_predictions[0]

    return fga_prediction_results

results = fga_prediction(player_names, date_list, usage_path, player_base_path, defense_base_path, schedule_base_path)


for player, fga_predictions in results.items():
    print(fga_predictions)
   

            

            

            

            



            


In [None]:
import data_functions as data_functions
import pandas as pd
import numpy as np
import importlib
importlib.invalidate_caches()
from data_functions import his_player_defense_data, current_player_defense_data

#"D:\nba_player_csv_current\season_2024-25\all_quarters\Alex Caruso_content.csv"
#"D:\nba_defense_csv_current\defense_csv_2024-25\all_quarter_defense_content.csv"
#"D:\nba_scheduled_csv\schedule_csv_2025\ATL_schedule_content.csv"

player_names = {'Shai Gilgeous-Alexander':'OKC', 'Alex Caruso':'OKC', 'Isaiah Hartenstein':'OKC'}
date_list = ["2022-23","2023-24","2024-25"]
schedule_base_path = "D:/nba_scheduled_csv/schedule_csv_2025/{schedule_team}_schedule_content.csv"
player_base_path = "D:/nba_player_csv_historic/season_{date}/all_quarters/{player}_content.csv"
defense_base_path = "D:/nba_defense_history_csv/defense_csv_{date}/all_quarter_defense_content.csv"
# usage_data = f"D:/nba_usage_csv_current/usage_csv_{date}/{date}_content.csv"



current_player_dic = {}

for player, team in player_names.items():
  current_player_frames = []

  for date in date_list:
    usage_data = pd.read_csv(f"D:/nba_usage_csv_historic/usage_csv_{date}/{date}_content.csv")  

    # use current_player function to to merge datat
    merged_data, current_player_defense = his_player_defense_data(player_base_path,defense_base_path,player,date)

    # Add the season column to usage_data
    usage_data['season'] = date
    
    # Get the player's usage percentage from the usage data
    # (Assumes that there is exactly one matching row)
    player_usage = usage_data.loc[usage_data['Player'] == player, 'USG%'].values[0]
    merged_data['USG'] = player_usage


    # adding current player team pace
    team_stat = current_player_defense.loc[current_player_defense['TEAM'] == team, 'PACE'].values[0]
    merged_data["team_pace"] = team_stat

    # display(current_player_defense)

    # adding current player team OffRtg
    team_offrtg = current_player_defense.loc[current_player_defense['TEAM'] == team, 'OffRtg'].values[0]
    merged_data["team_offrtg"] = team_offrtg

    team_poss = current_player_defense.loc[current_player_defense['TEAM']== team, 'POSS'].values[0]
    merged_data["team_poss"] = team_poss

    # Exclude rows where the TEAM column matches the given team
    merged_data = merged_data[merged_data['TEAM'] != team]
    # display(merged_data.head(5))

    # merged_data = merged_data[['season','Date', 'Home/Away_game' ,'Matchup' ,'PTS','MIN_x', 'Team', 'TEAM', 'FGA', 'USG', 'DefRtg', 'PACE','team_pace']]

    # Turn date into seconds
    merged_data['Date_in_Seconds'] = pd.to_datetime(merged_data['Date']).astype('int64') // 10**9
    merged_data = merged_data.sort_values(by="Date_in_Seconds")

    # Turn Home/Away game into 1 and 0
    merged_data['home_away'] = merged_data['Home/Away_game'].apply(lambda x: 1 if x == 'Away' else 0)
    # Dropping duplicates
    merged_data = merged_data.drop_duplicates()
    
    # Append the DataFrame for this date to the player's list
    current_player_frames.append(merged_data)

  # Combine all dates for the current player into one DataFrame
  current_player_dic[player] = pd.concat(current_player_frames, ignore_index=True)



specific_player = 'Shai Gilgeous-Alexander'
for player, df in current_player_dic.items():

    print(f"\nData for {player}:")
    df['FGA_rolling_3'] = df['FGA'].rolling(window=3).mean()



    alpha = 0.2

    df['EWMA_FGA'] = df['FGA'].ewm(span=(2/(1-alpha)-1), adjust=False).mean()

    df['EWMA_FGA_2'] = df['FGA'].ewm(span=(2/alpha - 1), adjust=False).mean()

    # alpha = 0.2  # Example smoothing factor
    df['Exp_smooth'] = 21  # Initialize column

    for i in range(1, len(df)):
        df.loc[i, 'Exp_smooth'] = alpha * df.loc[i, 'FGA'] + (1 - alpha) * df.loc[i - 1, 'Exp_smooth']

    df_act = df

    # display(df_act)

    # display(df)
    df = df.head(149)

    value_fga_list = []
    moving_average_list = []
    next_next_value_list = []
    date_list = []
    
    for value_fga, moving_average, date in zip(df['FGA'],df['EWMA_FGA_2'],df['Date']):
       value_fga_list.append(value_fga)
       moving_average_list.append(moving_average)
       #print("this is actual:",value_fga, "this is last_predicted",moving_average)
       next_next_value = alpha * value_fga + (1 - alpha) * moving_average
       next_next_value_list.append(next_next_value)
       date_list.append(date)
       #print("this is next:",next_next_value)
       dataframe_dic= {'Date':date_list,'Actual_FGA': value_fga_list, 'Moving_average':moving_average_list, 'Next':next_next_value_list}
       dataframe = pd.DataFrame(dataframe_dic)
      #  dataframe['Next'] = dataframe['Next'].shift(1)

    # display(dataframe)


    # Predict the next (11th) value
    last_actual = df['FGA'].iloc[-1]  # Last known FGA
    last_smoothed = df['EWMA_FGA_2'].iloc[-1]  # Last smoothed value
    next_value = alpha * last_actual + (1 - alpha) * last_smoothed


    # Print and add the prediction
    print(f"Predicted value: {next_value}")

    # display(df)

    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import cross_val_score
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_absolute_error, mean_squared_error



    features = ['PACE', 'team_pace', 'USG', 'DefRtg','MIN_x', 'home_away', 'Date_in_Seconds','OffRtg', 'team_offrtg']
    target = 'FGA'


     # Continue with your existing operations
    timestamp = int(pd.Timestamp('2024-12-31').timestamp())
    train_data = df[df['Date_in_Seconds'] < timestamp]  # Replace '2023-01-01' with the corresponding timestamp
    test_data =  df[df['Date_in_Seconds'] >= timestamp]  # Replace '2023-01-01' with the corresponding timestamp


    X_train = train_data[features]
    y_train = train_data[target]
    X_test = test_data[features]
    y_test = test_data[target]


    model = LinearRegression()
    model.fit(X_train,y_train)

    y_pred = model.predict(X_test)

    # print(y_pred)

     # Compare predictions with actual points
    predicted_vs_actual = pd.DataFrame({
        "Date": test_data['Date'],  
        "Matchup": test_data['Matchup'],  
        "Predicted Points": y_pred.round(1), 
        "Actual Points": y_test}).reset_index(drop=True)
    
    display(predicted_vs_actual)

    X_future = pd.DataFrame({
    # 'EWMA_FGA_2': [next_value], 
    'PACE':[df['PACE'].iloc[-1]], 
    'team_pace':[df['team_pace'].iloc[-1]], 
    'USG':[df['USG'].iloc[-1]],
    'DefRtg':[df['DefRtg'].iloc[-1]],
    'MIN_x':[30.09], 
    'home_away':[1],
    'Date_in_Seconds':[1738368000],
    'OffRtg':[df['OffRtg'].iloc[-1]],
    'team_offrtg': [df['team_offrtg'].iloc[-1]]
    })
    
    future_predictions = model.predict(X_future)

    print("this is future",future_predictions)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    print(f"MAE: {mae}, RMSE: {rmse}")

    import numpy as np

    # Drop NaN values (first row of 'Next' is NaN due to shift)
    rmse_df = dataframe.dropna(subset=['Actual_FGA', 'Next'])

    # Compute RMSE
    rmse = np.sqrt(((rmse_df['Actual_FGA'] - rmse_df['Next']) ** 2).mean())

    print("RMSE:", rmse)


    # Drop NaN values (due to shift)
    comparison_df = dataframe.dropna(subset=['Actual_FGA', 'Next'])

    # Total number of valid comparisons
    total = len(comparison_df)

    # Count occurrences
    higher_count = (comparison_df['Next'] > comparison_df['Actual_FGA']).sum()
    lower_count = (comparison_df['Next'] < comparison_df['Actual_FGA']).sum()
    equal_count = (comparison_df['Next'] == comparison_df['Actual_FGA']).sum()

    # Calculate percentages
    higher_percent = (higher_count / total) * 100
    lower_percent = (lower_count / total) * 100
    equal_percent = (equal_count / total) * 100

    # Print results
    print(f"Percentage of predictions HIGHER than actual: {higher_percent:.2f}%")
    print(f"Percentage of predictions LOWER than actual: {lower_percent:.2f}%")
    print(f"Percentage of predictions EQUAL to actual: {equal_percent:.2f}%")


    comparison_df = dataframe.dropna(subset=['Actual_FGA', 'Next'])

    # Filter cases where Next is higher than Actual
    higher_cases = comparison_df[comparison_df['Next'] > comparison_df['Actual_FGA']]
    higher_difference_avg = (higher_cases['Next'] - higher_cases['Actual_FGA']).mean()

    # Filter cases where Next is lower than Actual
    lower_cases = comparison_df[comparison_df['Next'] < comparison_df['Actual_FGA']]
    lower_difference_avg = (lower_cases['Actual_FGA'] - lower_cases['Next']).mean()

    # Print results
    print(f"Average difference when prediction is HIGHER: {higher_difference_avg:.2f}")
    print(f"Average difference when prediction is LOWER: {lower_difference_avg:.2f}")

    # Drop NaN values
    comparison_df = dataframe.dropna(subset=['Actual_FGA', 'Next'])

    # Compute Mean Error (ME)
    mean_error = (comparison_df['Next'] - comparison_df['Actual_FGA']).mean()

    # Print result
    print(f"Mean Error (ME): {mean_error:.2f}")

    