In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [None]:
import statsapi
import pybaseball
from pybaseball import playerid_lookup, statcast_batter, pitching_stats


In [20]:
# Look up Shohei Ohtani in pybaseball's player-ID table and print the whole result.
lookup_table = playerid_lookup('Ohtani', 'Shohei')
print(lookup_table)
# Take the MLBAM id from the row labelled 0; the Statcast query in a later cell uses it.
player_id = lookup_table.loc[0, 'key_mlbam']
print(f"Player ID for Shohei Ohtani: {player_id}")

  name_last name_first  key_mlbam key_retro  key_bbref  key_fangraphs  \
0    ohtani     shohei     660271  ohtas001  ohtansh01          19755   

   mlb_played_first  mlb_played_last  
0            2018.0           2025.0  
Player ID for Shohei Ohtani: 660271


In [23]:
# Pull every Statcast row recorded for this batter between 2024-03-01 and 2024-10-31.
data = statcast_batter(start_dt='2024-03-01', end_dt='2024-10-31', player_id=player_id)
print(data.head())
# Each row is a single pitch (the preview shows pitch_type and per-pitch descriptions
# such as "ball" or "called_strike"), so len(data) is a pitch count, not an at-bat count.
print(f"Total Statcast pitches for Shohei Ohtani, 2024-03-01 to 2024-10-31: {len(data)}")

Gathering Player Data
  pitch_type   game_date  release_speed  release_pos_x  release_pos_z  \
0         KC  2024-10-30           83.3          -2.52           6.08   
1         CH  2024-10-30           90.3          -2.55           5.93   
2         SI  2024-10-30           95.0          -2.49           6.07   
3         KC  2024-10-30           87.3          -2.74           5.91   
4         FF  2024-10-30           98.3          -2.76           5.93   

      player_name  batter  pitcher     events      description  ...  \
0  Ohtani, Shohei  660271   543037  field_out    hit_into_play  ...   
1  Ohtani, Shohei  660271   543037        NaN             ball  ...   
2  Ohtani, Shohei  660271   543037        NaN    called_strike  ...   
3  Ohtani, Shohei  660271   543037  strikeout  swinging_strike  ...   
4  Ohtani, Shohei  660271   543037        NaN         foul_tip  ...   

   batter_days_until_next_game  api_break_z_with_gravity  api_break_x_arm  \
0                          NaN     

In [33]:
# NOTE(review): statsapi and json are also imported by earlier cells in this notebook.
import statsapi
import json

# Hard-coded MLBAM id (same value the lookup cell printed for Shohei Ohtani) and the season to query.
player_id = 660271
season = 2024

# Call the low-level API directly to fetch this player's stats for the given season.
# The request targets the 'people' endpoint with personIds=<player_id> and a hydrate
# string asking for stats with group=[hitting], type=[season], season=<season>.
raw_stats = statsapi.get('people', {
    'personIds': player_id, 
    'hydrate': 'stats(group=[hitting],type=[season],season=%s)' % season
})

# Print the raw response to inspect its structure.
print(json.dumps(raw_stats, indent=4))

{
    "copyright": "Copyright 2025 MLB Advanced Media, L.P.  Use of any content on this page acknowledges agreement to the terms posted here http://gdx.mlb.com/components/copyright.txt",
    "people": [
        {
            "id": 660271,
            "fullName": "Shohei Ohtani",
            "link": "/api/v1/people/660271",
            "firstName": "Shohei",
            "lastName": "Ohtani",
            "primaryNumber": "17",
            "birthDate": "1994-07-05",
            "currentAge": 31,
            "birthCity": "Oshu",
            "birthCountry": "Japan",
            "height": "6' 3\"",
            "weight": 210,
            "active": true,
            "primaryPosition": {
                "code": "Y",
                "name": "Two-Way Player",
                "type": "Two-Way Player",
                "abbreviation": "TWP"
            },
            "useName": "Shohei",
            "useLastName": "Ohtani",
            "boxscoreName": "Ohtani",
            "nickName": "Showtime",
  

In [48]:
# --- Core project settings ---
# Player name handed to the ID lookup, and the season whose hitting stats are fetched
# (both are read by run_mvp_demo).
TARGET_PLAYER_LAST = 'Ohtani'
TARGET_PLAYER_FIRST = 'Shohei'
TARGET_SEASON = 2024

def calculate_bb_percent(base_on_balls, plate_appearances):
    """
    Feature engineering: walk rate (BB%) = base_on_balls / plate_appearances.

    Returns the rate expressed as a percentage rounded to two decimals, or 0.0
    when plate_appearances is falsy or not greater than zero.
    """
    # Guard first so the division below never sees a zero/None denominator.
    if not plate_appearances or not (plate_appearances > 0):
        return 0.0

    walk_ratio = base_on_balls / plate_appearances
    return round(walk_ratio * 100, 2)

def get_player_id_and_position(last_name, first_name):
    """
    [Heterogeneous retrieval - source 1: pybaseball] Find a player's MLBAM id and position.

    Calls ``playerid_lookup(last_name, first_name)`` and uses the first row that
    has a non-null ``key_mlbam``. The position is read from that same row, so
    the two returned values always describe the same record.

    Args:
        last_name: Player's last name, passed through to ``playerid_lookup``.
        first_name: Player's first name, passed through to ``playerid_lookup``.

    Returns:
        ``(player_id, player_position)``. ``player_id`` is an ``int``;
        ``player_position`` is the selected row's ``'pos'`` value, or ``'N/A'``
        when the lookup table has no ``'pos'`` column. ``(None, None)`` is
        returned when the lookup is empty, when no row carries an id, or when
        any exception is raised (the reason is printed in each case).
    """
    try:
        print(f"--- 1. 執行異質檢索 (Retrieval) - 查找 {first_name} {last_name} 的 ID ---")

        id_lookup = playerid_lookup(last_name, first_name)

        if id_lookup.empty:
            print(f"❌ pybaseball 找不到 {last_name} {first_name} 的 ID 記錄。")
            return None, None

        # Keep only rows that actually carry an MLBAM id; every value used below
        # comes from the first of these rows.
        rows_with_id = id_lookup[id_lookup['key_mlbam'].notna()]
        if rows_with_id.empty:
            print("❌ 找到球員名，但缺乏有效的 MLBAM ID (key_mlbam)。")
            return None, None

        # The id column may be stored as float; normalise to a plain int.
        player_id = int(rows_with_id['key_mlbam'].iloc[0])

        # Read the position from the same row as the id (not from the row
        # labelled 0, which may be a different record). Fall back to 'N/A'
        # when the table has no 'pos' column.
        if 'pos' in rows_with_id.columns:
            player_position = rows_with_id['pos'].iloc[0]
        else:
            player_position = 'N/A'

        print(f"✅ 成功找到 ID (key_mlbam): {player_id}")
        print(f"✅ 球員位置 (pos): {player_position}")

        return player_id, player_position

    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller stop cleanly.
        print(f"❌ pybaseball ID 查找過程中發生未知錯誤: {e}")
        return None, None

def get_player_stats_and_process(player_id, player_position, season):
    """
    [Heterogeneous retrieval - source 2: statsapi] Fetch season hitting stats and derive BB%.

    Queries ``statsapi.get('people', ...)`` for ``player_id`` with a hydrate
    string requesting hitting/season stats for ``season``, walks the response
    (people[0] -> stats[0] -> splits[0] -> stat), computes BB% with
    ``calculate_bb_percent`` and returns a flat summary dict.

    Args:
        player_id: MLBAM id sent as ``personIds``; also echoed back as ``int``.
        player_position: Accepted for the caller's convenience but not read
            here; the reported position comes from the API response.
        season: Season substituted into the hydrate string and echoed as ``int``.

    Returns:
        On success, a dict with the id, season, position abbreviation, OPS,
        home runs, BB% (formatted with a trailing ``%``) and plate appearances.
        On any missing level in the response, or any exception, a dict with a
        single ``"Error"`` key describing the problem.
    """
    print(f"--- 2. 獲取 {season} 賽季的核心統計數據 (使用 statsapi.get) ---")
    try:
        # Same request shape as the exploratory cell, so the parsing below
        # matches the structure that was inspected there.
        request_params = {
            'personIds': player_id,
            'hydrate': 'stats(group=[hitting],type=[season],season=%s)' % season,
        }
        response = statsapi.get('people', request_params)

        # --- Defensive walk down the response; each level may be missing or empty ---
        people = response.get('people')
        if not people:
            return {"Error": "API 數據結構錯誤：找不到 'people' 列表。"}
        person = people[0]

        position_abbr = person.get('primaryPosition', {}).get('abbreviation', 'N/A')

        stat_groups = person.get('stats')
        if not stat_groups:
            return {"Error": "API 返回 stats 列表為空，無該賽季數據。"}

        splits = stat_groups[0].get('splits')
        if not splits:
            return {"Error": "API 返回 splits 列表為空，無該賽季打擊數據。"}

        hitting = splits[0].get('stat')
        if not hitting:
            return {"Error": "API 返回 stat 數據為空。"}
        # --- End of defensive walk ---

        # Feature engineering: walk rate from raw walks and plate appearances.
        walks = hitting.get('baseOnBalls', 0)
        plate_apps = hitting.get('plateAppearances', 0)
        walk_rate = calculate_bb_percent(walks, plate_apps)

        # Structured output (table-to-text context).
        return {
            "MLB_ID": int(player_id),
            "賽季": int(season),
            "守備位置": position_abbr,
            "OPS (進攻指數)": hitting.get('ops'),
            "HR (全壘打數)": int(hitting.get('homeRuns', 0)),
            "BB% (保送率)": f"{walk_rate}%",
            "PA (總打席數)": int(plate_apps),
        }

    except Exception as e:
        return {"Error": f"查詢失敗或解析異常: {e}"}


# --- Main program logic ---
def run_mvp_demo():
    """
    Run the demo end to end: resolve the target player's id, fetch and process
    the target season's stats, then print either the error or the result as JSON.

    Reads TARGET_PLAYER_LAST, TARGET_PLAYER_FIRST and TARGET_SEASON; returns None.
    """
    mlbam_id, position = get_player_id_and_position(TARGET_PLAYER_LAST, TARGET_PLAYER_FIRST)

    # Without an id there is nothing to query, so stop here.
    if mlbam_id is None:
        print("❌ 無法繼續執行數據獲取。請檢查 pybaseball 錯誤。")
        return

    mvp_result = get_player_stats_and_process(mlbam_id, position, TARGET_SEASON)

    print("\n--- 3. 初步結果 (Preliminary Results) - 事實一致性呈現 (Table-to-Text Context) ---")

    # The stats step signals failure by returning a dict that carries an "Error" key.
    fetch_failed = isinstance(mvp_result, dict) and "Error" in mvp_result
    if fetch_failed:
        print(f"❌ 數據獲取失敗: {mvp_result['Error']}")
        return

    print("這就是我們驗證【事實一致性】的結構化數據，並已完成 BB% 特徵計算:")
    # ensure_ascii=False keeps the non-ASCII keys readable in the printed JSON.
    print(json.dumps(mvp_result, indent=4, ensure_ascii=False))
    print("\n結論: 該 MVP 成功展示了**異質檢索**和**特徵工程**的初步能力，並確保數值 **100% 吻合**數據源。")

# Run the demo when this code executes as the top-level module
# (the recorded output below this cell shows that it ran here).
if __name__ == "__main__":
    run_mvp_demo()

--- 1. 執行異質檢索 (Retrieval) - 查找 Shohei Ohtani 的 ID ---
✅ 成功找到 ID (key_mlbam): 660271
✅ 球員位置 (pos): N/A
--- 2. 獲取 2024 賽季的核心統計數據 (使用 statsapi.get) ---

--- 3. 初步結果 (Preliminary Results) - 事實一致性呈現 (Table-to-Text Context) ---
這就是我們驗證【事實一致性】的結構化數據，並已完成 BB% 特徵計算:
{
    "MLB_ID": 660271,
    "賽季": 2024,
    "守備位置": "TWP",
    "OPS (進攻指數)": "1.036",
    "HR (全壘打數)": 54,
    "BB% (保送率)": "11.08%",
    "PA (總打席數)": 731
}

結論: 該 MVP 成功展示了**異質檢索**和**特徵工程**的初步能力，並確保數值 **100% 吻合**數據源。
