# ここでの内容

NCAAトーナメント関連のCSVを読み込み、下位シードが上位シードに勝った試合を抽出してCSVに保存し、全データのEDAレポートをHTMLとして出力する。



In [3]:

import os
import sys
from pathlib import Path
import collections
from datetime import datetime
import shutil
import yaml
from tqdm import tqdm
import time
from loguru import logger

from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import UnivariateSpline

import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

import statsmodels.api as sm
import optuna


START_TIME = time.time()

# Experiment identifier; it is injected into the Hydra override below.
EXP_NUM = "exp011"

# Extend the import path before Hydra resolves config_module="conf".
# NOTE(review): the second entry is a machine-specific absolute Windows path;
# presumably it is the project root on the author's PC -- consider making it configurable.
sys.path.extend(
    [
        os.path.abspath("/workspace"),
        "C:\\Users\\81809\\Documents\\ALL_CODE\\kaggle\\March Machine Learning Mania 2025",
    ]
)

# Initialise Hydra and compose base.yaml with the `exp` entry overridden by EXP_NUM.
with initialize_config_module(version_base=None, config_module="conf"):
    cfg = compose(config_name="base.yaml", overrides=[f"exp={EXP_NUM}"])

    # Resolve interpolations in place, then show the final configuration.
    OmegaConf.resolve(cfg)
    print("Omega conf is below")
    print(OmegaConf.to_yaml(cfg))

Omega conf is below
dir:
  input_dir: ../input
  output_dir: ../output
  src_path: ../src
  conf_path: ../conf
exp:
  name: 3rdsolution_optuna2
  print_name: これはbase(exp003)にOptuna最適化導入したやつ
  tournament:
    target_season: 2023
  features:
    cal_mean_boxscore:
      T1_Score: true
      T2_Score: true
      T1_FGM: true
      T2_FGM: true
      T1_FGA: true
      T2_FGA: true
      T1_FGM3: true
      T2_FGM3: true
      T1_FGA3: true
      T2_FGA3: true
      T1_FTM: true
      T2_FTM: true
      T1_FTA: true
      T2_FTA: true
      T1_OR: true
      T2_OR: true
      T1_DR: true
      T2_DR: true
      T1_Ast: true
      T2_Ast: true
      T1_TO: true
      T2_TO: true
      T1_Stl: true
      T2_Stl: true
      T1_Blk: true
      T2_Blk: true
      T1_PF: true
      T2_PF: true
      PointDiff: true
      T1_EFFG: false
      T2_EFFG: false
      T1_EFFG3: false
      T2_EFFG3: false
      T1_DARE: false
      T2_DARE: false
      T1_TOQUETOQUE: false
      T2_TOQUETOQUE: false
 

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Build the run name from the target season and the experiment name, and log it.
PREDICT_YEAR = cfg.exp.tournament.target_season
name = f"{PREDICT_YEAR}_{cfg.exp.name}"
now = str(datetime.now())
print("########")
print(f"output dir name: {name}")
print(f"Run on {now}")
print(cfg.exp.print_name)
print("########")

# Paths taken from the composed config; SRC_PATH is added to sys.path for later imports.
CONF_PATH = cfg.dir.conf_path
INPUT_PATH = cfg.dir.input_dir
EXP_PATH = os.path.join(CONF_PATH, "exp", f"{EXP_NUM}.yaml")
OUTPUT_PATH = os.path.join(cfg.dir.output_dir, name)
SRC_PATH = cfg.dir.src_path
sys.path.append(SRC_PATH)

os.makedirs(OUTPUT_PATH, exist_ok=True)
# Read the experiment YAML explicitly as UTF-8. The config contains non-ASCII
# (Japanese) text, and the platform default encoding (e.g. cp932 on Windows)
# would otherwise be used and could fail or mis-decode it.
with open(EXP_PATH, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
# Keep a copy of the experiment config next to the run outputs.
shutil.copyfile(EXP_PATH, os.path.join(OUTPUT_PATH, f"{EXP_NUM}.yaml"))

# Shortcuts into the experiment config used by later cells.
y_season = cfg.exp.tournament.target_season
FEATURES = cfg.exp.features
MODEL = cfg.exp.model
# Result record for this run: timestamp, run name, description, and an empty placeholder.
results = [now, name, cfg.exp.print_name, ""]

########
output dir name: 2023_3rdsolution_optuna2
Run on 2025-03-06 08:22:01.358116
これはbase(exp003)にOptuna最適化導入したやつ
########


In [6]:
import glob

# Load every CSV in INPUT_PATH into a dict keyed by file name without extension.
# glob.glob() returns paths in a file-system-dependent order, so the list is
# sorted to make the dict order (and anything iterating over it) reproducible.
csv_files = sorted(glob.glob(os.path.join(INPUT_PATH, "*.csv")))
dataframes = {}
for file in csv_files:
    key = os.path.splitext(os.path.basename(file))[0]
    print(key)

    dataframes[key] = pd.read_csv(file, low_memory=False, encoding="latin-1")

print("fin read all csv files")


Cities
Conferences
MConferenceTourneyGames
MGameCities
MMasseyOrdinals
MNCAATourneyCompactResults
MNCAATourneyDetailedResults
MNCAATourneySeedRoundSlots
MNCAATourneySeeds
MNCAATourneySlots
MRegularSeasonCompactResults
MRegularSeasonDetailedResults
MSeasons
MSecondaryTourneyCompactResults
MSecondaryTourneyTeams
MTeamCoaches
MTeamConferences
MTeams
MTeamSpellings
SampleSubmission2023
SampleSubmission2024
SampleSubmissionStage1
SampleSubmissionStage2
SeedBenchmarkStage1
WConferenceTourneyGames
WGameCities
WNCAATourneyCompactResults
WNCAATourneyDetailedResults
WNCAATourneySeeds
WNCAATourneySlots
WRegularSeasonCompactResults
WRegularSeasonDetailedResults
WSeasons
WSecondaryTourneyCompactResults
WSecondaryTourneyTeams
WTeamConferences
WTeams
WTeamSpellings
fin read all csv files


In [42]:
# Upset extraction: tournament games in which a 16/15/14 seed beat a 1/2/3 seed.
# (Original cell label: "1beats16".)

# Men's and women's detailed tournament results stacked into one frame.
tourney_results = pd.concat(
    [
        pd.read_csv(os.path.join(INPUT_PATH,"MNCAATourneyDetailedResults.csv")),
        pd.read_csv(os.path.join(INPUT_PATH,"WNCAATourneyDetailedResults.csv")),
    ],
    ignore_index=True,
)

# Men's and women's seed tables stacked the same way.
seeds = pd.concat(
    [
        pd.read_csv(os.path.join(INPUT_PATH,"MNCAATourneySeeds.csv")),
        pd.read_csv(os.path.join(INPUT_PATH,"WNCAATourneySeeds.csv")),
    ],
    ignore_index=True,
)

# Attach the winner's seed, then the loser's seed. The seed table's team column
# is renamed before each merge so it lines up with the matching results column;
# note that `seeds` is left with an "LTeamID" column afterwards.
seeds = seeds.rename(columns={"TeamID":"WTeamID"})
tourney_results = (
    tourney_results
    .merge(seeds, on=["WTeamID", "Season"], how="left")
    .rename(columns={"Seed": "WSeed"})
)
seeds = seeds.rename(columns={"WTeamID":"LTeamID"})
tourney_results = (
    tourney_results
    .merge(seeds, on=["LTeamID", "Season"], how="left")
    .rename(columns={"Seed": "LSeed"})
)

# Characters 1-2 of the seed string are converted to an integer seed number.
for seed_col in ("WSeed", "LSeed"):
    tourney_results[seed_col] = tourney_results[seed_col].str.slice(1, 3).astype(int)

# Original author's note, meaning not evident from this cell: (138,2), (133,7), 130
# Select the rows that satisfy one of the upset conditions.
winner_seed = tourney_results["WSeed"]
loser_seed = tourney_results["LSeed"]
is_upset = (
    (winner_seed.eq(16) & loser_seed.eq(1))
    | (winner_seed.eq(15) & loser_seed.eq(2))
    | (winner_seed.eq(14) & loser_seed.eq(3))
)
reversed_matches = tourney_results[is_upset]

# Write the extracted games to CSV in the current working directory.
reversed_matches.to_csv("reversed_matches.csv", index=False)


In [33]:
# Display the games in which a 1 seed beat a 16 seed.
tourney_results.loc[tourney_results["WSeed"].eq(1) & tourney_results["LSeed"].eq(16)]

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,WSeed,LSeed
1,2003,136,1112,80,1436,51,N,0,31,66,...,7,8,26,12,17,10,3,15,1,16
13,2003,136,1328,71,1354,54,N,0,24,52,...,8,7,22,8,18,7,2,23,1,16
21,2003,137,1246,95,1237,64,N,0,40,65,...,22,15,18,10,19,7,1,15,1,16
29,2003,137,1400,82,1421,61,N,0,31,62,...,12,13,16,5,11,9,3,16,1,16
68,2004,136,1181,96,1106,61,N,0,32,66,...,17,9,22,7,21,7,1,20,1,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2167,2023,138,3231,77,3399,47,H,0,29,50,...,6,11,19,9,8,4,1,13,1,16
2225,2024,137,3376,91,3342,39,H,0,36,64,...,4,8,15,10,15,4,0,11,1,16
2227,2024,137,3400,82,3180,42,H,0,31,71,...,6,7,11,11,20,3,3,15,1,16
2233,2024,138,3234,91,3221,65,H,0,31,67,...,20,11,23,15,13,8,3,19,1,16


In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import base64
from io import BytesIO
import io
import plotly.express as px  # Plotlyを使うため
import plotly.io as pio

# Rough per-state (latitude, longitude) pairs keyed by two-letter state code;
# used to place each city at its state's coordinates on the map.
_STATE_COORDS = {
    'AL': (32.806671, -86.791130), 'AK': (61.370716, -152.404419),
    'AZ': (33.729759, -111.431221), 'AR': (34.969704, -92.373123),
    'CA': (36.116203, -119.681564), 'CO': (39.059811, -105.311104),
    'CT': (41.597782, -72.755371), 'DE': (39.318523, -75.507141),
    'FL': (27.766279, -81.686783), 'GA': (33.040619, -83.643074),
    'HI': (21.094318, -157.498337), 'ID': (44.240459, -114.478828),
    'IL': (40.349457, -88.986137), 'IN': (39.849426, -86.258278),
    'IA': (42.011539, -93.210526), 'KS': (38.526600, -96.726486),
    'KY': (37.668140, -84.670067), 'LA': (31.169546, -91.867805),
    'ME': (44.693947, -69.381927), 'MD': (39.063946, -76.802101),
    'MA': (42.230171, -71.530106), 'MI': (43.326618, -84.536095),
    'MN': (45.694454, -93.900192), 'MS': (32.741646, -89.678696),
    'MO': (38.456085, -92.288368), 'MT': (46.921925, -110.454353),
    'NE': (41.125370, -98.268082), 'NV': (38.313515, -117.055374),
    'NH': (43.452492, -71.563896), 'NJ': (40.298904, -74.521011),
    'NM': (34.840515, -106.248482), 'NY': (42.165726, -74.948051),
    'NC': (35.630066, -79.806419), 'ND': (47.528912, -99.784012),
    'OH': (40.388783, -82.764915), 'OK': (35.565342, -96.928917),
    'OR': (44.572021, -122.070938), 'PA': (40.590752, -77.209755),
    'RI': (41.680893, -71.511780), 'SC': (33.856892, -80.945007),
    'SD': (44.299782, -99.438828), 'TN': (35.747845, -86.692345),
    'TX': (31.054487, -97.563461), 'UT': (40.150032, -111.862434),
    'VT': (44.045876, -72.710686), 'VA': (37.769337, -78.169968),
    'WA': (47.400902, -121.490494), 'WV': (38.491226, -80.954453),
    'WI': (44.268543, -89.616508), 'WY': (42.755966, -107.302490)
}


def _figure_to_img_tag(fig):
    """Render a matplotlib figure as an inline base64 PNG <img> tag, then close the figure."""
    png_buffer = BytesIO()
    fig.savefig(png_buffer, format='png')
    plt.close(fig)  # release the figure; many are created per report
    encoded = base64.b64encode(png_buffer.getvalue()).decode('utf-8')
    return f"<img src='data:image/png;base64,{encoded}'/><br>"


def _paired_hist_figure(men_values, women_values, men_title, women_title, men_color, women_color):
    """Build side-by-side men/women histograms (with KDE) sharing the y axis."""
    fig, axes = plt.subplots(1, 2, figsize=(15, 6), sharey=True)
    sns.histplot(men_values, kde=True, ax=axes[0], color=men_color)
    axes[0].set_title(men_title)
    sns.histplot(women_values, kde=True, ax=axes[1], color=women_color)
    axes[1].set_title(women_title)
    return fig


def _single_hist_figure(values, title, color):
    """Build a single histogram (with KDE) on its own figure."""
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(values, kde=True, color=color, ax=ax)
    ax.set_title(title)
    return fig


def _season_average_figure(results, title, win_style=None, lose_style=None):
    """Plot the per-season mean of WScore and LScore; the style dicts are extra plot kwargs."""
    avg_scores = results.groupby('Season')[['WScore', 'LScore']].mean().reset_index()
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(avg_scores['Season'], avg_scores['WScore'], marker='o',
            label='Mean Winning Score', **(win_style or {}))
    ax.plot(avg_scores['Season'], avg_scores['LScore'], marker='s',
            label='Mean Losing Score', **(lose_style or {}))
    ax.set_title(title)
    ax.set_xlabel("Season")
    ax.set_ylabel("Score")
    ax.legend()
    return fig


def _seed_number(seed_str):
    """Return the digits of a seed string such as 'W01a' as an int."""
    return int(''.join(ch for ch in str(seed_str) if ch.isdigit()))


def generate_eda_report(dataframes, output_html="eda_report.html"):
    """
    Write an HTML EDA report for a dict of DataFrames.

    For every DataFrame the report contains its shape, missing-value counts,
    the first three rows, the ``info()`` text, descriptive statistics of the
    numeric columns, and a histogram per numeric column (ID-like columns are
    excluded from the histograms). Further sections are added only when the
    tables they need are present in ``dataframes``.

    Args:
        dataframes: mapping of table name -> pandas DataFrame. The input
            frames are not modified.
        output_html: path of the HTML file to write (UTF-8).

    Returns:
        None. The report is written to ``output_html`` and a confirmation
        message is printed.
    """
    # Local import so the block is self-contained; stdlib only.
    from html import escape

    # Numeric columns that are excluded from the distribution plots.
    excluded_cols = ["Season", "DayNum", "TeamID", "Seed", "CityID", "LTeamID", "WTeamID"]

    html_parts = [
        "<html>",
        "<head><meta charset='utf-8'><title>EDA Report</title></head>",
        "<body>"
    ]

    # ---------- 1. Basic information for each DataFrame ----------
    for key, df in dataframes.items():
        html_parts.append(f"<h1>DataFrame: {escape(str(key), quote=False)}</h1>")

        # -- shape --
        html_parts.append(f"<p><b>Shape:</b> {df.shape}</p>")

        # -- missing values (table) --
        html_parts.append("<h3>Missing Values</h3>")
        html_parts.append("<table border='1'>")
        html_parts.append("<tr><th>Column</th><th>Missing Count</th></tr>")
        for col, val in df.isnull().sum().items():
            html_parts.append(f"<tr><td>{escape(str(col), quote=False)}</td><td>{val}</td></tr>")
        html_parts.append("</table>")

        # -- head (first 3 rows) --
        html_parts.append("<h3>Data Sample (head 3)</h3>")
        html_parts.append(df.head(3).to_html())

        # -- info() output captured as text --
        info_buffer = io.StringIO()
        df.info(buf=info_buffer)
        # The info text starts with "<class '...'>"; without escaping, the
        # browser parses that as a tag and the line is not shown.
        info_str = escape(info_buffer.getvalue(), quote=False)
        html_parts.append("<h3>Data Info</h3>")
        html_parts.append(f"<pre>{info_str}</pre>")

        # -- descriptive statistics of numeric columns --
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            html_parts.append("<h3>Descriptive Statistics (Numeric Columns)</h3>")
            html_parts.append(df[numeric_cols].describe().to_html())

        # -- distribution of numeric columns (histogram + kde), minus excluded_cols --
        distribution_cols = [col for col in numeric_cols if col not in excluded_cols]
        if len(distribution_cols) > 0:
            html_parts.append("<h3>Numeric Columns Distribution</h3>")

            for col in distribution_cols:
                fig, ax = plt.subplots()
                sns.histplot(df[col].dropna(), ax=ax, kde=True)
                ax.set_title(f"Distribution of {col}")

                html_parts.append(f"<h4>{col}</h4>")
                html_parts.append(_figure_to_img_tag(fig))

        html_parts.append("<hr>")

    # ---------- 2. Men/Women detailed regular-season analysis ----------
    if 'MRegularSeasonDetailedResults' in dataframes and 'WRegularSeasonDetailedResults' in dataframes:
        html_parts.append("<h2>Men & Women Detailed Results Analysis</h2>")

        # assign() returns new frames, so the caller's DataFrames do not gain a
        # ScoreDiff column and repeated runs produce the same report.
        df_MReg = dataframes['MRegularSeasonDetailedResults'].assign(
            ScoreDiff=lambda d: d['WScore'] - d['LScore']
        )
        df_WReg = dataframes['WRegularSeasonDetailedResults'].assign(
            ScoreDiff=lambda d: d['WScore'] - d['LScore']
        )

        # (1) Histograms (Winning Scores)
        html_parts.append("<h3>Winning Scores Distribution</h3>")
        fig = _paired_hist_figure(
            df_MReg['WScore'], df_WReg['WScore'],
            "Men's Winning Scores Distribution", "Women's Winning Scores Distribution",
            'blue', 'green',
        )
        html_parts.append(_figure_to_img_tag(fig))

        # (2) Boxplots (ScoreDiff)
        html_parts.append("<h3>Score Difference (Margin) Boxplots</h3>")
        fig, axes = plt.subplots(1, 2, figsize=(15, 6), sharey=True)
        sns.boxplot(y=df_MReg['ScoreDiff'], ax=axes[0], color='lightblue')
        axes[0].set_title("Men's Score Difference")
        sns.boxplot(y=df_WReg['ScoreDiff'], ax=axes[1], color='lightgreen')
        axes[1].set_title("Women's Score Difference")
        html_parts.append(_figure_to_img_tag(fig))

        # (3) Pairplot on a fixed-seed sample of at most 2000 rows
        html_parts.append("<h3>Pairplot Sample (Men's Detailed Regular Season)</h3>")
        selected_features = ['WScore','LScore','WFGM','WFGM3','WOR','ScoreDiff']
        sample_size = min(len(df_MReg), 2000)
        sample_m = df_MReg[selected_features].sample(sample_size, random_state=42)
        g = sns.pairplot(sample_m)
        g.fig.suptitle("Pairplot Sample: Men's Detailed Regular Season Data", y=1.02)
        html_parts.append(_figure_to_img_tag(g.fig))

        # (4) Correlation Heatmaps (Men/Women)
        html_parts.append("<h3>Correlation Heatmaps</h3>")
        cols = ['WScore','LScore','WFGM','WFGA','WFGM3','WFTA','WOR','WTO','WStl','ScoreDiff']
        corr_m = df_MReg[cols].corr()
        corr_w = df_WReg[cols].corr()

        fig, axes = plt.subplots(1, 2, figsize=(18, 6))
        sns.heatmap(corr_m, ax=axes[0], annot=True, cmap="Blues")
        axes[0].set_title("Men's Detailed Regular Season Correlations")
        sns.heatmap(corr_w, ax=axes[1], annot=True, cmap="Greens")
        axes[1].set_title("Women's Detailed Regular Season Correlations")
        html_parts.append(_figure_to_img_tag(fig))

        # (5) Time series (Season trend)
        html_parts.append("<h3>Average Scores by Season</h3>")

        # Men
        if 'Season' in df_MReg.columns:
            fig = _season_average_figure(df_MReg, "Average Scores by Season (Men)")
            html_parts.append(_figure_to_img_tag(fig))

        # Women
        if 'Season' in df_WReg.columns:
            fig = _season_average_figure(
                df_WReg, "Average Scores by Season (Women)",
                win_style={'color': 'green'}, lose_style={'color': 'olive'},
            )
            html_parts.append(_figure_to_img_tag(fig))

        html_parts.append("<p>Men/Women-specific analysis complete.</p>")
    else:
        html_parts.append("<h2>Men & Women Detailed Results Analysis: Skipped</h2>")
        html_parts.append("<p>MRegularSeasonDetailedResults or WRegularSeasonDetailedResults not found.</p>")

    # ---------- 3. Additional visualizations (placed at the end) ----------
    html_parts.append("<h2>Additional Visualizations</h2>")

    # 3-1) Tournament seeds distribution (Men/Women)
    if 'MNCAATourneySeeds' in dataframes and 'WNCAATourneySeeds' in dataframes:
        html_parts.append("<h3>Tournament Seeds Distribution</h3>")

        df_MSeeds = dataframes['MNCAATourneySeeds'].copy()
        df_WSeeds = dataframes['WNCAATourneySeeds'].copy()

        df_MSeeds['SeedNum'] = df_MSeeds['Seed'].apply(_seed_number)
        df_WSeeds['SeedNum'] = df_WSeeds['Seed'].apply(_seed_number)

        fig, axes = plt.subplots(1, 2, figsize=(15,5), sharey=True)
        sns.countplot(x='SeedNum', data=df_MSeeds, ax=axes[0], color='blue')
        axes[0].set_title("Men's Tournament Seeds Distribution")
        sns.countplot(x='SeedNum', data=df_WSeeds, ax=axes[1], color='green')
        axes[1].set_title("Women's Tournament Seeds Distribution")
        html_parts.append(_figure_to_img_tag(fig))

        html_parts.append("<p>Tournament seeds distribution plots generated.</p>")

    # 3-2) Merge team information from MTeamSpellings
    if 'MTeamSpellings' in dataframes and 'MRegularSeasonDetailedResults' in dataframes:
        html_parts.append("<h3>MTeamSpellings Merge Example</h3>")

        df_MTeamSpellings = dataframes['MTeamSpellings'].copy()
        # Lower-case the column names so the join key is 'teamid'.
        df_MTeamSpellings.columns = [col.lower() for col in df_MTeamSpellings.columns]

        # Join the winner's team id against the spellings table.
        # NOTE(review): the rename only takes effect if the lower-cased table has
        # a 'teamname' column -- confirm against the actual file header.
        df_MReg_merged = (
            dataframes['MRegularSeasonDetailedResults']
            .merge(df_MTeamSpellings, left_on='WTeamID', right_on='teamid', how='left')
            .rename(columns={'teamname': 'WTeamName'})
        )

        # Show the first 3 rows.
        html_parts.append("<p>Merged men's detailed results with MTeamSpellings (top 3 rows):</p>")
        html_parts.append(df_MReg_merged.head(3).to_html())

    # 3-3) USA map visualization of game cities with Plotly
    if 'Cities' in dataframes:
        html_parts.append("<h3>USA Map Visualization of Game Cities (Plotly)</h3>")

        df_Cities = dataframes['Cities'].copy()

        # Add lat/lon columns from the state table; unknown states become NaN and are dropped.
        df_Cities['lat'] = df_Cities['State'].apply(lambda x: _STATE_COORDS.get(x, (np.nan, np.nan))[0])
        df_Cities['lon'] = df_Cities['State'].apply(lambda x: _STATE_COORDS.get(x, (np.nan, np.nan))[1])
        df_USCities = df_Cities.dropna(subset=['lat', 'lon'])

        fig_map = px.scatter_geo(
            df_USCities,
            lat='lat',
            lon='lon',
            hover_name='City',
            scope='usa',
            title="Game Cities from the Dataset (USA)",
            color='State',
            size_max=10
        )

        # Embed the Plotly figure as an HTML fragment (plotly.js loaded from the CDN).
        html_parts.append(pio.to_html(fig_map, include_plotlyjs='cdn', full_html=False))
        html_parts.append("<p>USA map visualization of game cities complete.</p>")

    # 3-4) Additional Explorations (RegularSeasonCompact, TourneyCompact, SecondaryTourneyCompact)
    html_parts.append("<h3>Additional Exploratory Visualizations</h3>")

    # A) Regular Season Compact
    if 'MRegularSeasonCompactResults' in dataframes and 'WRegularSeasonCompactResults' in dataframes:
        fig = _paired_hist_figure(
            dataframes['MRegularSeasonCompactResults']['WScore'],
            dataframes['WRegularSeasonCompactResults']['WScore'],
            "Men's Regular Season Compact - Winning Score",
            "Women's Regular Season Compact - Winning Score",
            'navy', 'darkgreen',
        )
        html_parts.append(_figure_to_img_tag(fig))

    # B) NCAA Tourney Compact
    if 'MNCAATourneyCompactResults' in dataframes and 'WNCAATourneyCompactResults' in dataframes:
        fig = _paired_hist_figure(
            dataframes['MNCAATourneyCompactResults']['WScore'],
            dataframes['WNCAATourneyCompactResults']['WScore'],
            "Men's NCAA Tourney Compact - Winning Score",
            "Women's NCAA Tourney Compact - Winning Score",
            'purple', 'orange',
        )
        html_parts.append(_figure_to_img_tag(fig))

    # C) Secondary Tourney Compact (men and women handled independently)
    if 'MSecondaryTourneyCompactResults' in dataframes:
        fig = _single_hist_figure(
            dataframes['MSecondaryTourneyCompactResults']['WScore'],
            "Men's Secondary Tournament Compact - Winning Score",
            'teal',
        )
        html_parts.append(_figure_to_img_tag(fig))

    if 'WSecondaryTourneyCompactResults' in dataframes:
        fig = _single_hist_figure(
            dataframes['WSecondaryTourneyCompactResults']['WScore'],
            "Women's Secondary Tournament Compact - Winning Score",
            'coral',
        )
        html_parts.append(_figure_to_img_tag(fig))

    html_parts.append("<p>Additional exploratory visualizations complete.</p>")

    # ---------- closing tags ----------
    html_parts.append("</body></html>")

    # ---------- write the file ----------
    with open(output_html, "w", encoding="utf-8") as f:
        f.write("\n".join(html_parts))

    print(f"レポートを {output_html} に保存しました。")


In [29]:
# Build the EDA report for every loaded CSV; the file name is the function's default.
generate_eda_report(dataframes, output_html="eda_report.html")


use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.


use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.


use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.


use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.


use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.


use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.


use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.


use_inf_as_na option is deprecated and will be removed in a future version. Convert inf v

レポートを eda_report.html に保存しました。
