In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
from joblib import load
from sklearn.preprocessing import StandardScaler
from dotenv import load_dotenv
from arize.pandas.logger import Client
from arize.utils.types import ModelTypes, Environments, Schema, Metrics


load_dotenv()


arize_space_key = os.getenv('ARIZE_SPACE_KEY')
arize_api_key = os.getenv('ARIZE_API_KEY')

arize_client = Client(space_key=arize_space_key, api_key=arize_api_key)

def extract_team_nickname(full_name):
    return full_name.split()[-1]

def generate_prediction_id(home_team, visitor_team, game_date):
    return f"{home_team}-{visitor_team}-{game_date}"

def format_features_for_prediction(team_name, team_stats, is_home_game):
    """
    Format the latest stats into the feature array expected by the model.

    :param team_name: Name of the team.
    :param team_stats: DataFrame with the latest stats for the team.
    :param is_home_game: Boolean indicating if the team is playing at home in the upcoming game.
    :return: List of features in the correct order for model prediction.
    """
    if team_stats.empty:
        print(f"Stats DataFrame is empty for team: {team_name}")
        return None

    try:
        # Determine the prefix based on whether the team is playing at home or away
        prefix = 'home' if is_home_game else 'visitor'

        # Extract relevant stats using the appropriate prefix
        features = [
            team_stats[f'ast_{prefix}_10game_avg'],
            team_stats[f'blk_{prefix}_10game_avg'],
            team_stats[f'dreb_{prefix}_10game_avg'],
            team_stats[f'fg3_pct_{prefix}_10game_avg'],
            team_stats[f'fg_pct_{prefix}_10game_avg'],
            team_stats[f'ft_pct_{prefix}_10game_avg'],
            team_stats[f'oreb_{prefix}_10game_avg'],
            team_stats[f'pf_{prefix}_10game_avg'],
            team_stats[f'pts_{prefix}_10game_avg'],
            team_stats[f'reb_{prefix}_10game_avg'],
            team_stats[f'stl_{prefix}_10game_avg'],
            team_stats[f'turnover_{prefix}_10game_avg'],        
            team_stats[f'ast_{prefix}_season_avg'],
            team_stats[f'blk_{prefix}_season_avg'],
            team_stats[f'dreb_{prefix}_season_avg'],
            team_stats[f'fg3_pct_{prefix}_season_avg'],
            team_stats[f'fg_pct_{prefix}_season_avg'],
            team_stats[f'ft_pct_{prefix}_season_avg'],
            team_stats[f'oreb_{prefix}_season_avg'],
            team_stats[f'pf_{prefix}_season_avg'],
            team_stats[f'pts_{prefix}_season_avg'],
            team_stats[f'reb_{prefix}_season_avg'],
            team_stats[f'stl_{prefix}_season_avg'],
            team_stats[f'turnover_{prefix}_season_avg'],        
            team_stats[f'ast_{prefix}_5game_avg'],
            team_stats[f'blk_{prefix}_5game_avg'],
            team_stats[f'dreb_{prefix}_5game_avg'],
            team_stats[f'fg3_pct_{prefix}_5game_avg'],
            team_stats[f'fg_pct_{prefix}_5game_avg'],
            team_stats[f'ft_pct_{prefix}_5game_avg'],
            team_stats[f'oreb_{prefix}_5game_avg'],
            team_stats[f'pf_{prefix}_5game_avg'],
            team_stats[f'pts_{prefix}_5game_avg'],
            team_stats[f'reb_{prefix}_5game_avg'],
            team_stats[f'stl_{prefix}_5game_avg'],
            team_stats[f'turnover_{prefix}_5game_avg'],
            team_stats[f'ast_{prefix}_prev'],
            team_stats[f'blk_{prefix}_prev'],
            team_stats[f'dreb_{prefix}_prev'],
            team_stats[f'fg3_pct_{prefix}_prev'],
            team_stats[f'fg_pct_{prefix}_prev'],
            team_stats[f'ft_pct_{prefix}_prev'],
            team_stats[f'oreb_{prefix}_prev'],
            team_stats[f'pf_{prefix}_prev'],
            team_stats[f'pts_{prefix}_prev'],
            team_stats[f'reb_{prefix}_prev'],
            team_stats[f'stl_{prefix}_prev'],
            team_stats[f'turnover_{prefix}_prev'],
            team_stats[f'blk_{prefix}_opp_avg'],
            team_stats[f'dreb_{prefix}_opp_avg'],
            team_stats[f'fg3_pct_{prefix}_opp_avg'],
            team_stats[f'fg_pct_{prefix}_opp_avg'],
            team_stats[f'ft_pct_{prefix}_opp_avg'],
            team_stats[f'oreb_{prefix}_opp_avg'],
            team_stats[f'pf_{prefix}_opp_avg'],
            team_stats[f'pts_{prefix}_opp_avg'],
            team_stats[f'reb_{prefix}_opp_avg'],
            team_stats[f'stl_{prefix}_opp_avg'],
            team_stats[f'turnover_{prefix}_opp_avg'],
            team_stats[f'elo_{prefix}_team'],
        ]

        return features
    except Exception as e:
        print(f"Error in formatting features for {team_name}: {e}")
        return None




def find_latest_team_data_by_name(team_name, game_stats_df, is_home_team_upcoming):
    team_name = team_name.lower()

    team_games = game_stats_df[
        (game_stats_df['team_name_home'].str.lower().str.contains(team_name)) |
        (game_stats_df['team_name_visitor'].str.lower().str.contains(team_name))
    ]

    if not team_games.empty:
        latest_game_date = team_games['game_date'].max()
        latest_game_data = team_games[team_games['game_date'] == latest_game_date]

        if not latest_game_data.empty:
            home_team_name = latest_game_data.iloc[0]['team_name_home'].lower()
            is_home_team = team_name in home_team_name
            prefix = 'home' if is_home_team else 'visitor'
            rename_prefix = 'home' if is_home_team_upcoming else 'visitor'

            # Fetching the columns and renaming based on whether the team is home or visitor
            stats_columns = [col for col in game_stats_df.columns if f'{prefix}_' in col and ('10game_avg' in col or 'season_avg' in col or '5game_avg' in col or 'elo' in col or 'prev' in col or 'opp_avg' in col)]
            if stats_columns:
                latest_team_stats = latest_game_data[stats_columns].iloc[0]

                # Rename columns by adding the correct prefix
                renamed_stats = latest_team_stats.rename(lambda x: x.replace(f'{prefix}_', f'{rename_prefix}_'))
                
                return pd.DataFrame([renamed_stats])
            else:
                print(f"No stats columns found for prefix: {prefix}")  # Debug print

    print(f"No data found for team: {team_name}")  # Debug print
    return pd.DataFrame()


# Load the NBA schedule, game statistics, and the trained model
schedule_df = pd.read_csv('./nba_schedule.csv')
game_stats_df = pd.read_csv('./game_stats.csv')  # Replace with the correct path
model_pipeline = load('nba_game_predictor_model.joblib')

# Convert date columns to datetime and adjust team names
schedule_df['date'] = pd.to_datetime(schedule_df['date'])
schedule_df['Home Team'] = schedule_df['Home Team'].apply(extract_team_nickname)
schedule_df['Visitor Team'] = schedule_df['Visitor Team'].apply(extract_team_nickname)

# Convert date columns to datetime
schedule_df['date'] = pd.to_datetime(schedule_df['date'])
game_stats_df['game_date'] = pd.to_datetime(game_stats_df['game_date'])

# Filter games for today
today = datetime.now().strftime('%Y-%m-%d')
todays_games = schedule_df[schedule_df['date'] == today]

print(f"Found {len(todays_games)} games today.")
print(todays_games[['date', 'Home Team', 'Visitor Team']])

feature_columns = [
    # 'ast_home_10game_avg', 
    'blk_home_10game_avg', 
    'dreb_home_10game_avg', 
    'fg3_pct_home_10game_avg', 
    'fg_pct_home_10game_avg', 
    'ft_pct_home_10game_avg', 
    'oreb_home_10game_avg', 
    'pf_home_10game_avg', 
    'pts_home_10game_avg', 
    'reb_home_10game_avg', 
    'stl_home_10game_avg', 
    'turnover_home_10game_avg',
    # 'ast_visitor_10game_avg', 
    'blk_visitor_10game_avg', 
    'dreb_visitor_10game_avg', 
    'fg3_pct_visitor_10game_avg', 
    'fg_pct_visitor_10game_avg', 
    'ft_pct_visitor_10game_avg', 
    'oreb_visitor_10game_avg', 
    'pf_visitor_10game_avg', 
    'pts_visitor_10game_avg', 
    'reb_visitor_10game_avg', 
    'stl_visitor_10game_avg', 
    'turnover_visitor_10game_avg',
    # 'ast_home_5game_avg', 
    'blk_home_5game_avg', 
    'dreb_home_5game_avg', 
    'fg3_pct_home_5game_avg', 
    'fg_pct_home_5game_avg', 
    'ft_pct_home_5game_avg', 
    'oreb_home_5game_avg', 
    'pf_home_5game_avg', 
    'pts_home_5game_avg', 
    'reb_home_5game_avg', 
    'stl_home_5game_avg', 
    'turnover_home_5game_avg',
    # 'ast_visitor_5game_avg', 
    'blk_visitor_5game_avg', 
    'dreb_visitor_5game_avg', 
    'fg3_pct_visitor_5game_avg', 
    'fg_pct_visitor_5game_avg', 
    'ft_pct_visitor_5game_avg', 
    'oreb_visitor_5game_avg', 
    'pf_visitor_5game_avg', 
    'pts_visitor_5game_avg', 
    'reb_visitor_5game_avg', 
    'stl_visitor_5game_avg', 
    'turnover_visitor_5game_avg',
    # 'ast_visitor_season_avg', 
    'blk_visitor_season_avg', 
    'dreb_visitor_season_avg', 
    'fg3_pct_visitor_season_avg', 
    'fg_pct_visitor_season_avg', 
    'ft_pct_visitor_season_avg', 
    'oreb_visitor_season_avg', 
    'pf_visitor_season_avg', 
    'pts_visitor_season_avg', 
    'reb_visitor_season_avg', 
    'stl_visitor_season_avg', 
    'turnover_visitor_season_avg',
    'elo_home_team',
    'elo_visitor_team',
     # 'ast_visitor_prev', 
    'blk_visitor_prev', 
    'dreb_visitor_prev', 
    'fg3_pct_visitor_prev', 
    'fg_pct_visitor_prev', 
    'ft_pct_visitor_prev', 
    'oreb_visitor_prev', 
    'pf_visitor_prev', 
    'pts_visitor_prev', 
    'reb_visitor_prev', 
    'stl_visitor_prev', 
    'turnover_visitor_prev',
      # 'ast_home_prev', 
    'blk_home_prev', 
    'dreb_home_prev', 
    'fg3_pct_home_prev', 
    'fg_pct_home_prev', 
    'ft_pct_home_prev', 
    'oreb_home_prev', 
    'pf_home_prev', 
    'pts_home_prev', 
    'reb_home_prev', 
    'stl_home_prev', 
    'turnover_home_prev',
    'blk_home_opp_avg', 
    'dreb_home_opp_avg', 
    'fg3_pct_home_opp_avg', 
    'fg_pct_home_opp_avg', 
    'ft_pct_home_opp_avg', 
    'oreb_home_opp_avg', 
    'pf_home_opp_avg', 
    'pts_home_opp_avg', 
    'reb_home_opp_avg', 
    'stl_home_opp_avg', 
    'turnover_home_opp_avg',
    'blk_visitor_opp_avg', 
    'dreb_visitor_opp_avg', 
    'fg3_pct_visitor_opp_avg', 
    'fg_pct_visitor_opp_avg', 
    'ft_pct_visitor_opp_avg', 
    'oreb_visitor_opp_avg', 
    'pf_visitor_opp_avg', 
    'pts_visitor_opp_avg', 
    'reb_visitor_opp_avg', 
    'stl_visitor_opp_avg', 
    'turnover_visitor_opp_avg',
    # ... make sure this order is exactly the same as in the training dataset
]

arize_schema = Schema(
    prediction_id_column_name="prediction_id",
    prediction_label_column_name="predicted_winner",
    prediction_score_column_name="win_probability",
    feature_column_names=feature_columns,
)

# Prepare features for each game and make predictions
predictions = []
win_probabilities = []
for index, upcoming_game in todays_games.iterrows():
    home_team_name = upcoming_game['Home Team']
    visitor_team_name = upcoming_game['Visitor Team']

    home_team_stats = find_latest_team_data_by_name(home_team_name, game_stats_df, is_home_team_upcoming=True)
    visitor_team_stats = find_latest_team_data_by_name(visitor_team_name, game_stats_df, is_home_team_upcoming=False)

    if home_team_stats is not None and visitor_team_stats is not None:
        # Combine home and visitor features
        game_features = []
        for feature in feature_columns:
            if feature in home_team_stats.columns:
                game_features.append(home_team_stats[feature].iloc[0])
            elif feature in visitor_team_stats.columns:
                game_features.append(visitor_team_stats[feature].iloc[0])
            else:
                game_features.append(np.nan)  # Or a suitable default value

        # Convert to DataFrame with correct column names
        game_features_df = pd.DataFrame([game_features], columns=feature_columns)
        prediction_id = generate_prediction_id(home_team_name, visitor_team_name, upcoming_game['date'])

        # Make prediction
        prediction = model_pipeline.predict(game_features_df)
        predicted_winner = home_team_name if prediction[0] == 1 else visitor_team_name

        # Get winning probabilities
        probabilities = model_pipeline.predict_proba(game_features_df)
        home_team_win_probability = probabilities[0][1]  # Assuming 1 corresponds to the home team winning
        win_probability = home_team_win_probability if prediction[0] == 1 else 1 - home_team_win_probability


        log_df = game_features_df.copy()
        log_df['prediction_id'] = prediction_id
        log_df['predicted_winner'] = predicted_winner
        log_df['win_probability'] = win_probability

        response = arize_client.log(
            dataframe=log_df,
            environment=Environments.PRODUCTION,
            model_id="nick-nba-game-predictor",
            model_version="1.0.0",
            model_type=ModelTypes.BINARY_CLASSIFICATION,
            metrics_validation=[Metrics.CLASSIFICATION, Metrics.AUC_LOG_LOSS],
            validate=True,
            schema=arize_schema
        )

        if response.status_code == 200:
            print(f"✅ Successfully logged data to Arize!")
        else:
            print(
                f'❌ Logging failed with status code {response.status_code} and message "{response.text}"'
            )

        # Append to lists
        predictions.append(predicted_winner)
        win_probabilities.append(win_probability)
    else:
        predictions.append("Unknown")
        win_probabilities.append(np.nan)


# After generating predictions:
todays_games.loc[:, 'predicted_winner'] = predictions
todays_games.loc[:, 'win_probability'] = win_probabilities

# Output the predictions
print(todays_games[['date', 'Home Team', 'Visitor Team', 'predicted_winner', 'win_probability']])

# Define the file path for the predictions.csv
predictions_file = 'predictions.csv'

# Function to ensure consistent date format
def parse_dates(df):
    df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')
    return df

# Check if the file exists and read it if it does
if os.path.exists(predictions_file):
    existing_predictions = pd.read_csv(predictions_file)
    existing_predictions = parse_dates(existing_predictions)

    # Ensure todays_games also has consistent date format
    todays_games = parse_dates(todays_games)

    # Merge the new predictions with the existing ones
    combined = pd.concat([existing_predictions, todays_games])
    combined.drop_duplicates(subset=['date', 'Home Team', 'Visitor Team'], keep='last', inplace=True)
    print('hey')
    # Write the updated DataFrame to the file
    combined.to_csv(predictions_file, mode='w', header=True, index=False)
else:
    # If the file doesn't exist, create it with headers
    todays_games = parse_dates(todays_games)
    todays_games.to_csv(predictions_file, mode='w', header=True, index=False)



Found 7 games today.
         date  Home Team  Visitor Team
75 2023-12-14    Celtics     Cavaliers
76 2023-12-14       Heat         Bulls
77 2023-12-14  Mavericks  Timberwolves
78 2023-12-14    Nuggets          Nets
79 2023-12-14    Blazers          Jazz
80 2023-12-14      Kings       Thunder
81 2023-12-14   Clippers      Warriors
[38;21m  arize.utils.logging | INFO | Success! Check out your data at https://app.arize.com/organizations/QWNjb3VudE9yZ2FuaXphdGlvbjox/spaces/U3BhY2U6NTc2Mw==/models/modelName/nick-nba-game-predictor?selectedTab=dataIngestion[0m
✅ Successfully logged data to Arize!
[38;21m  arize.utils.logging | INFO | Success! Check out your data at https://app.arize.com/organizations/QWNjb3VudE9yZ2FuaXphdGlvbjox/spaces/U3BhY2U6NTc2Mw==/models/modelName/nick-nba-game-predictor?selectedTab=dataIngestion[0m
✅ Successfully logged data to Arize!
[38;21m  arize.utils.logging | INFO | Success! Check out your data at https://app.arize.com/organizations/QWNjb3VudE9yZ2FuaXphdGlv

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  todays_games.loc[:, 'predicted_winner'] = predictions
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  todays_games.loc[:, 'win_probability'] = win_probabilities
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')
