In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as slt
import schedule
import time
import threading
from nba_api.stats.endpoints import playergamelogs
from nba_api.stats.endpoints import leaguedashteamstats
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# load & clean data
def load_data():
    try:
        file_path = 'all_mvp_stats.csv'
        mvp_data = pd.read_csv(file_path, skiprows=1)

        # mapping actual column names to expected ones
        column_mapping = {
            "Lg": "League",
            "Tm": "Team",
            "G": "Games_Played",
            "MP": "MPG",
            "WS/48": "WS_per_48"
        }

        # renaming columns
        mvp_data.rename(columns=column_mapping, inplace=True)

        # Ensure missing columns exist
        required_columns = ["Season", "League", "Player", "PTS", "TRB", "AST", "STL", "BLK", "FG%", "3P%", "FT%", "WS", "WS_per_48", "BPM", "PER", "USG%", "TS%", "VORP"]
        for col in required_columns:
            if col not in mvp_data.columns:
                mvp_data[col] = np.nan 

        # Convert numeric columns
        numeric_cols = ["PTS", "TRB", "AST", "STL", "BLK", "FG%", "3P%", "FT%", "WS", "WS_per_48", "BPM", "PER", "USG%", "TS%", "VORP"]
        existing_numeric_cols = [col for col in numeric_cols if col in mvp_data.columns]
        mvp_data[existing_numeric_cols] = mvp_data[existing_numeric_cols].apply(pd.to_numeric, errors='coerce')


        mvp_data["MVP_Score"] = (
            mvp_data["PTS"] * 0.35 +
            mvp_data["AST"] * 0.20 +
            mvp_data["TRB"] * 0.15 + 
            mvp_data["WS"] * 0.2 +
            mvp_data["BPM"] * 0.1
        )

        print("MVP data loaded successfully")
        return mvp_data

    except Exception as e:
        print(f"Error loading all_mvp_stats.csv: {e}")
        return pd.DataFrame()

In [3]:
#get game info
def get_logs():
    game_logs = playergamelogs.PlayerGameLogs(season_nullable = '2024-25').get_data_frames()[0]
    game_logs = game_logs[['PLAYER_NAME', 'GAME_DATE', 'TEAM_NAME', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'MIN']]
    game_logs['GAME_DATE'] = pd.to_datetime(game_logs['GAME_DATE'])
    return game_logs


In [4]:
#process stats
def process_stats(game_logs):
    game_logs['PTS_10G'] = game_logs.groupby('PLAYER_NAME')['PTS'].rolling(10, min_periods=1).mean().reset_index(0, drop=True)
    game_logs['REB_10G'] = game_logs.groupby('PLAYER_NAME')['REB'].rolling(10, min_periods=1).mean().reset_index(0, drop=True)
    game_logs['AST_10G'] = game_logs.groupby('PLAYER_NAME')['AST'].rolling(10, min_periods=1).mean().reset_index(0, drop=True)
    latest_stats = game_logs.sort_values('GAME_DATE').groupby('PLAYER_NAME').last().reset_index()
    return latest_stats

In [5]:
# team standings
def team_standings():
    team_stats = leaguedashteamstats.LeagueDashTeamStats(season='2024-25').get_data_frames()[0]
    #print("Available columns: ", team_stats.columns)
    #if 'TEAM_ABBREVIATION' not in team_stats.columns:
    #    raise KeyError("Column TEAM_ABREVIATION not found. Check avaibale columns above")
    team_stats = team_stats[['TEAM_NAME', 'W', 'L', 'GP']]
    team_stats['WIN_PCT'] = team_stats['W'] / team_stats['GP']
    return team_stats

In [6]:
# merge data
def merge_data(player_stats, team_stats):
    player_stats = player_stats.merge(team_stats, on='TEAM_NAME', how='left')
    player_stats['WIN_PCT'] = player_stats['WIN_PCT'].fillna(0)

    player_stats['MVP_Score'] = (
        player_stats['PTS_10G'] * 0.3 +
        player_stats['REB_10G'] * 0.3 +
        player_stats['AST_10G'] * 0.3 +
        player_stats['WIN_PCT'] * 0.1
    )
    return player_stats

In [7]:
# train predictive model
def train_model(player_stats):
    previous_mvp = load_data()

    if previous_mvp.empty:
        return None, None

    data = player_stats.merge(previous_mvp, left_on='PLAYER_NAME', right_on='Player', how='left').fillna(0)

    # Print available columns before selecting features
    print("Available columns in train_model():", data.columns)

    # Ensure MVP_Score exists and resolve duplicate column issue
    if 'MVP_Score_x' in data.columns:
        data.rename(columns={'MVP_Score_x': 'MVP_Score'}, inplace=True)
    elif 'MVP_Score_y' in data.columns:
        data.rename(columns={'MVP_Score_y': 'MVP_Score'}, inplace=True)
    else:
        raise KeyError("Column 'MVP_Score' not found in train_model(). Check available columns above.")

    # Select the correct feature columns
    X = data[['PTS_10G', 'REB_10G', 'AST_10G', 'MVP_Score']]
    y = data['MVP_Score']

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_scaled, y)

    y_pred = model.predict(X_scaled)
    mae = mean_absolute_error(y, y_pred)
    print(f"Model trained! Mean Absolute Error: {mae:.4f}")

    return model, scaler

In [8]:
# update the predictions
def update_predictions():
    game_logs = get_logs()
    player_stats = process_stats(game_logs)
    team_stats = team_standings()
    merged_data = merge_data(player_stats, team_stats)
    
    model, scaler = train_model(merged_data)

    merged_data.to_csv('latest_mvp_predictions.csv', index=False)
    print("Updated predictions")    

In [9]:
# streamlit dashboard
def dashboard():
    slt.title("NBA MVP Race - Live Predictions")
    latest_mvp = pd.read_csv('latest_mvp_predictions.csv')

    print("All columns in latest_mvp_predictions.csv: ", latest_mvp.columns)
    if 'MVP_Score' not in latest_mvp.columns:
        raise KeyError("Column 'MVP_Score' not found. Check available columns above")

    slt.subheader("Top 10 in the MVP Race")
    slt.dataframe(latest_mvp[['PLAYER_NAME', 'MVP_Score']].sort_values('MVP_Score', ascending=False).head(10))

    player_name = slt.selectbox("Choose a player", latest_mvp['PLAYER_NAME'])
    if slt.button("Show MVP Prediction"):
        player_data = latest_mvp[latest_mvp['PLAYER_NAME'] == player_name].drop(columns=['PLAYER_NAME'])
        model, scaler = train_model(latest_mvp)
        X_scaled = scaler.transform(player_data.drop(columns=['MVP', 'MVP_Score']))
        prediction = model.predict(X_scaled)[0]

        slt.write(f"Predicted MVP Score: {prediction * 100:.2f}")
        slt.write("Chance to win MVP: ", "Yes" if prediction > 0.5 else "No")

In [None]:
if __name__ == "__main__":
    update_predictions()

    schedule.every().day.at("03:00").do(update_predictions)

    dashboard()

    def run_scheduler():
        while True:
            schedule.run_pending()
            time.sleep(60)
    threading.Thread(target=run_scheduler, daemon=True).start()

MVP data loaded successfully
Available columns in train_model(): Index(['PLAYER_NAME', 'GAME_DATE', 'TEAM_NAME', 'PTS_x', 'REB', 'AST_x',
       'STL_x', 'BLK_x', 'MIN', 'PTS_10G', 'REB_10G', 'AST_10G', 'W', 'L',
       'GP', 'WIN_PCT', 'MVP_Score_x', 'Season', 'League', 'Player', 'Voting',
       'Age', 'Team', 'Games_Played', 'MPG', 'PTS_y', 'TRB', 'AST_y', 'STL_y',
       'BLK_y', 'FG%', '3P%', 'FT%', 'WS', 'WS_per_48', 'BPM', 'PER', 'USG%',
       'TS%', 'VORP', 'MVP_Score_y'],
      dtype='object')
Model trained! Mean Absolute Error: 0.0055
Updated predictions


2025-03-17 02:54:40.512 
  command:

    streamlit run /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]


All columns in latest_mvp_predictions.csv:  Index(['PLAYER_NAME', 'GAME_DATE', 'TEAM_NAME', 'PTS', 'REB', 'AST', 'STL',
       'BLK', 'MIN', 'PTS_10G', 'REB_10G', 'AST_10G', 'W', 'L', 'GP',
       'WIN_PCT', 'MVP_Score'],
      dtype='object')


MVP data loaded successfully
Available columns in train_model(): Index(['PLAYER_NAME', 'GAME_DATE', 'TEAM_NAME', 'PTS_x', 'REB', 'AST_x',
       'STL_x', 'BLK_x', 'MIN', 'PTS_10G', 'REB_10G', 'AST_10G', 'W', 'L',
       'GP', 'WIN_PCT', 'MVP_Score_x', 'Season', 'League', 'Player', 'Voting',
       'Age', 'Team', 'Games_Played', 'MPG', 'PTS_y', 'TRB', 'AST_y', 'STL_y',
       'BLK_y', 'FG%', '3P%', 'FT%', 'WS', 'WS_per_48', 'BPM', 'PER', 'USG%',
       'TS%', 'VORP', 'MVP_Score_y'],
      dtype='object')
Model trained! Mean Absolute Error: 0.0055
Updated predictions
