In [2]:
"""
INVESTIGATE MONEYPUCK DATA FRESHNESS
Quick check to understand what season we actually have
"""

import pandas as pd
from datetime import datetime

def investigate_moneypuck_dating(csv_path):
    """
    Deep dive into MoneyPuck data to understand:
    1. What season is it really?
    2. How recent is it?
    3. Do we need to fetch new data?
    """
    print("=" * 60)
    print("MONEYPUCK DATA DATING INVESTIGATION")
    print("=" * 60)
    
    df = pd.read_csv(csv_path)
    
    # 1. Check season column
    print("\nüìÖ SEASON VALUES:")
    print(df['season'].value_counts())
    
    # 2. Check games played distribution
    print("\nüèí GAMES PLAYED DISTRIBUTION:")
    print(df['games_played'].describe())
    print(f"\nMax games played: {df['games_played'].max()}")
    print(f"Players with >30 games: {(df['games_played'] > 30).sum()}")
    
    # 3. Current NHL season info
    current_date = datetime.now()
    print(f"\nüìÜ TODAY'S DATE: {current_date.strftime('%Y-%m-%d')}")
    
    # NHL season typically:
    # 2024-25 season: Started Oct 2024, ~38 games by late Dec 2024
    # 2025-26 season: Would start Oct 2025, ~38 games by late Dec 2025
    
    if current_date.month >= 10:  # October or later
        current_season_start = current_date.year
    else:  # Before October
        current_season_start = current_date.year - 1
    
    current_season = f"{current_season_start}-{str(current_season_start + 1)[-2:]}"
    
    print(f"üìä CURRENT NHL SEASON: {current_season}")
    
    # 4. Interpret MoneyPuck season value
    moneypuck_season = df['season'].iloc[0]
    
    print(f"\nüîç MONEYPUCK INTERPRETATION:")
    print(f"  MoneyPuck 'season' value: {moneypuck_season}")
    
    # MoneyPuck uses different formats over the years
    # Common formats: 2024, 20242025, 2024-25
    if moneypuck_season == 2025:
        likely_season = "2024-25"
        print(f"  ‚ö†Ô∏è This is likely: {likely_season} season")
        print(f"  ‚ö†Ô∏è NOT current season ({current_season})")
    elif moneypuck_season == 2026 or moneypuck_season == 20252026:
        likely_season = "2025-26"
        print(f"  ‚úÖ This appears to be: {likely_season} season")
        print(f"  ‚úÖ Current season!")
    
    # 5. Check if data is complete or in-progress
    avg_games = df['games_played'].mean()
    max_games = df['games_played'].max()
    
    print(f"\nüìà DATA COMPLETENESS:")
    print(f"  Average games played: {avg_games:.1f}")
    print(f"  Max games played: {max_games}")
    
    if max_games < 40:
        print(f"  ‚ö†Ô∏è Season in progress (~{max_games} games completed)")
    elif max_games > 70:
        print(f"  ‚úÖ Season complete or near end")
    
    # 6. Sample some known players to verify
    print(f"\nüë§ SAMPLE PLAYERS TO VERIFY:")
    known_players = ['Connor McDavid', 'Auston Matthews', 'Nathan MacKinnon']
    
    for player_name in known_players:
        player_data = df[df['name'].str.contains(player_name, case=False, na=False)]
        if not player_data.empty:
            player = player_data.iloc[0]
            print(f"\n  {player['name']}:")
            print(f"    Team: {player['team']}")
            print(f"    Games: {player['games_played']}")
            print(f"    Goals: {player['I_F_goals']}")
            print(f"    xG: {player['I_F_xGoals']:.2f}")
    
    # 7. Check MoneyPuck website for current season availability
    print(f"\n" + "=" * 60)
    print("üåê CHECK MONEYPUCK WEBSITE:")
    print("=" * 60)
    print(f"\n1. Visit: https://moneypuck.com/data.htm")
    print(f"2. Look for: 'Season: {current_season}' data")
    print(f"3. Download link usually labeled with season year")
    
    # 8. Recommendation
    print(f"\n" + "=" * 60)
    print("üí° RECOMMENDATION:")
    print("=" * 60)
    
    if moneypuck_season == 2025 and current_season == "2025-26":
        print(f"\n‚ùå YOUR DATA IS FROM LAST SEASON (2024-25)")
        print(f"‚úÖ YOU NEED: 2025-26 season data")
        print(f"\nOPTIONS:")
        print(f"  1. Check MoneyPuck for 2025-26 data")
        print(f"  2. Use NHL API for current season (run the scraper)")
        print(f"  3. Wait for MoneyPuck to update (they lag ~1 day)")
        return False, "2024-25"
    else:
        print(f"\n‚úÖ Data appears current for {current_season}")
        print(f"‚úÖ Safe to use this data")
        return True, current_season

# ============================================================
# ALSO CHECK: What's available on MoneyPuck right now
# ============================================================

def check_moneypuck_downloads():
    """
    Check what seasons MoneyPuck currently has available
    """
    import requests
    from bs4 import BeautifulSoup
    
    print("\n" + "=" * 60)
    print("CHECKING MONEYPUCK AVAILABLE DOWNLOADS")
    print("=" * 60)
    
    try:
        url = "https://moneypuck.com/data.htm"
        response = requests.get(url)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Find download links
            links = soup.find_all('a', href=True)
            
            print("\nüì• Available data files:")
            for link in links:
                href = link['href']
                if 'skaters' in href.lower() or 'players' in href.lower():
                    if any(year in href for year in ['2024', '2025', '2026']):
                        print(f"  - {link.get_text().strip()}: {href}")
        else:
            print("‚ö†Ô∏è Could not access MoneyPuck website")
            print("Check manually at: https://moneypuck.com/data.htm")
    
    except Exception as e:
        print(f"‚ö†Ô∏è Error checking MoneyPuck: {e}")
        print("Check manually at: https://moneypuck.com/data.htm")

# ============================================================
# RUN IT
# ============================================================

if __name__ == "__main__":
    MONEYPUCK_CSV = "../data/raw/moneypuck_skaters_current.csv"  # UPDATE THIS
    
    is_current, season = investigate_moneypuck_dating(MONEYPUCK_CSV)
    
    # Try to check MoneyPuck website
    check_moneypuck_downloads()
    
    print("\n" + "üéØ" * 30)
    print("NEXT STEPS:")
    print("üéØ" * 30)
    
    if not is_current:
        print(f"\n1. Go to: https://moneypuck.com/data.htm")
        print(f"2. Download 2025-26 season data (if available)")
        print(f"3. If not available yet, run NHL API scraper")
    else:
        print(f"\n‚úÖ Your data is current - proceed with model!")

MONEYPUCK DATA DATING INVESTIGATION

üìÖ SEASON VALUES:
season
2025    4135
Name: count, dtype: int64

üèí GAMES PLAYED DISTRIBUTION:
count    4135.000000
mean       24.898428
std        11.720080
min         1.000000
25%        16.000000
50%        29.000000
75%        35.000000
max        38.000000
Name: games_played, dtype: float64

Max games played: 38
Players with >30 games: 1965

üìÜ TODAY'S DATE: 2025-12-23
üìä CURRENT NHL SEASON: 2025-26

üîç MONEYPUCK INTERPRETATION:
  MoneyPuck 'season' value: 2025
  ‚ö†Ô∏è This is likely: 2024-25 season
  ‚ö†Ô∏è NOT current season (2025-26)

üìà DATA COMPLETENESS:
  Average games played: 24.9
  Max games played: 38
  ‚ö†Ô∏è Season in progress (~38 games completed)

üë§ SAMPLE PLAYERS TO VERIFY:

  Connor McDavid:
    Team: EDM
    Games: 37
    Goals: 4.0
    xG: 3.89

  Auston Matthews:
    Team: TOR
    Games: 30
    Goals: 4.0
    xG: 2.85

  Nathan MacKinnon:
    Team: COL
    Games: 35
    Goals: 6.0
    xG: 3.91

üåê CHECK MONE

In [3]:
# Run this to see if update is needed
import pandas as pd

mp = pd.read_csv('data/raw/skaters.csv')
mp_all = mp[mp['situation'] == 'all']

mcdavid = mp_all[mp_all['name'].str.contains('McDavid', case=False)].iloc[0]
print(f"McDavid games in MoneyPuck: {mcdavid['games_played']}")
print(f"Check NHL.com for current GP")
print(f"If difference > 3 games ‚Üí Download fresh data")
print(f"If difference ‚â§ 3 games ‚Üí Current data is fine")

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/skaters.csv'