<a href="https://colab.research.google.com/github/maho1224/med/blob/main/bert_bmi_analysis_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# -*- coding: utf-8 -*-
"""
Medical data analysis notebook (revised version)
 - Uses a medical-domain BERT (jmedroberta)
 - Multiple-testing correction (Benjamini-Hochberg)
 - Introduces LOOCV (leave-one-out cross-validation)
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.multitest import multipletests

# === Load data ===
# Both inputs live on the mounted Google Drive; the paths are named here so
# they are easy to spot and change.
SUMMARY_XLSX = '/content/drive/MyDrive/summary_with_scores.xlsx'
BMI_CSV = '/content/drive/MyDrive/体重改/merged_bmi_data.csv'

df = pd.read_excel(SUMMARY_XLSX)   # per-patient summary sheet (joined on 'ID' in the next cell)
bmi_df = pd.read_csv(BMI_CSV)      # BMI table (joined on 'ID' in the next cell)



KeyboardInterrupt: 

In [None]:
# === Normalize IDs and merge the BMI table ===
# Final BMI at or above this value is labelled 'Good', otherwise 'Poor'.
GOOD_BMI_THRESHOLD = 14
BMI_COLS = ['Initial BMI', 'Final BMI']


def _normalize_id(ids):
    """Return IDs as strings with anything after the first '.' removed ('12.0' -> '12')."""
    return ids.astype(str).str.split('.').str[0]


# Apply the SAME normalization to both tables: if either ID column was parsed
# as float, its values stringify as '12.0' and would otherwise never match.
df['ID'] = _normalize_id(df['ID'])
bmi_df['ID'] = _normalize_id(bmi_df['ID'])

# Drop BMI columns left over from a previous run of this cell. Without this,
# re-running the cell merges them a second time, pandas suffixes them as
# '_x'/'_y', and the 'Initial BMI' lookup below raises KeyError.
df = df.drop(columns=BMI_COLS, errors='ignore')
df = df.merge(bmi_df[['ID'] + BMI_COLS], on='ID', how='left')
# NOTE(review): if bmi_df contains duplicate IDs this merge duplicates patient
# rows and inflates sample counts -- confirm IDs are unique in the CSV.

# Keep only patients with both BMI measurements.
df = df.dropna(subset=BMI_COLS).copy()
df['Delta_BMI'] = df['Final BMI'] - df['Initial BMI']
df['Good_Poor'] = np.where(df['Final BMI'] >= GOOD_BMI_THRESHOLD, 'Good', 'Poor')

# === Embedding model (medical-domain BERT) ===
# The checkpoint name is kept in one place so it is easy to swap.
MODEL_NAME = 'alabnii/jmedroberta-base-manbyo-wordpiece'
model = SentenceTransformer(MODEL_NAME)

# === Section map ===
# Maps each Japanese column name in the summary sheet to the English label
# used in the result tables. Insertion order is the processing order.
section_map = dict([
    ('入院_主訴', 'Chief Complaint'),
    ('入院_家族背景', 'Family History'),
    ('入院_社会背景', 'Social Background'),
    ('入院_現病歴', 'Present Illness'),
    ('入院_精神医学的現症', 'Mental State'),
    ('退院_検査所見', 'Test Results'),
    ('退院_入院後経過', 'Hospital Course'),
])



In [None]:
# === Regression + multiple-testing correction ===
# For each text section: embed the text, reduce the embeddings with PCA, and
# regress Final BMI on the principal components with OLS. The per-component
# p-values are then Benjamini-Hochberg corrected within that section's model
# (the correction is NOT applied across sections).
MAX_COMPONENTS = 10    # upper bound on principal components per section
MIN_SAMPLES_OLS = 3    # with fewer rows the fit has 0 components or 0 residual dof
FDR_ALPHA = 0.05
PCA_SEED = 0           # fixes the result if sklearn selects the randomized SVD solver

results_bmi = []
for col_jp, col_en in section_map.items():
    section_rows = df[[col_jp, 'Final BMI']].dropna()
    n_rows = len(section_rows)
    if n_rows < MIN_SAMPLES_OLS:
        # Too few rows for a meaningful fit: n_rows // 2 would give zero
        # components (n_rows == 1) or leave no residual degrees of freedom
        # (n_rows == 2), so the p-values would be undefined.
        print(f"Skipping {col_en}: only {n_rows} usable rows")
        continue

    X = model.encode(section_rows[col_jp].astype(str).tolist())
    y = section_rows['Final BMI'].values

    n_components = min(MAX_COMPONENTS, X.shape[0] // 2)
    pca = PCA(n_components=n_components, random_state=PCA_SEED)
    X_pca = pca.fit_transform(X)
    X_df = pd.DataFrame(X_pca)

    # add_constant prepends the intercept column named 'const'.
    X_const = sm.add_constant(X_df)
    model_ols = sm.OLS(y, X_const).fit()

    # FDR correction (Benjamini-Hochberg) over the component p-values only;
    # the intercept is excluded.
    raw_pvals = model_ols.pvalues.drop('const')
    _, fdr_corrected, _, _ = multipletests(raw_pvals, alpha=FDR_ALPHA, method='fdr_bh')
    sig_pvals = fdr_corrected[fdr_corrected < FDR_ALPHA]

    results_bmi.append({
        'Section': col_en,
        'Num_Significant_PC (FDR)': len(sig_pvals),
        # Smallest corrected p-value among the significant components;
        # None when no component survives the correction.
        'Min_FDR_p_value': sig_pvals.min() if len(sig_pvals) > 0 else None,
        'R_squared': model_ols.rsquared
    })





In [None]:
# === Classification performance (LOOCV) ===
# For each text section: embed the text and estimate Good/Poor classification
# accuracy with leave-one-out cross-validation. PCA lives INSIDE the pipeline
# so it is refit on each training fold; fitting it once on all rows before
# cross-validation would let the held-out sample shape its own features
# (data leakage) and bias the accuracy estimate.
MAX_COMPONENTS = 10   # upper bound on principal components per section
MIN_PER_CLASS = 2     # each class must survive having one member held out
PCA_SEED = 0          # fixes the result if sklearn selects the randomized SVD solver

results_clf = []
label_enc = LabelEncoder()
df['Label'] = label_enc.fit_transform(df['Good_Poor'])

for col_jp, col_en in section_map.items():
    section_rows = df[[col_jp, 'Label']].dropna()
    if section_rows.empty:
        continue
    y = section_rows['Label'].values

    # With a single class, or a class with one member, at least one LOOCV
    # training fold contains only one class and the classifier cannot be fit.
    classes, counts = np.unique(y, return_counts=True)
    if len(classes) < 2 or counts.min() < MIN_PER_CLASS:
        print(f"Skipping {col_en}: class counts {dict(zip(classes.tolist(), counts.tolist()))}")
        continue

    X = model.encode(section_rows[col_jp].astype(str).tolist())

    # Component count is derived from the full sample size; it never exceeds
    # the n - 1 rows available in each training fold.
    n_components = min(MAX_COMPONENTS, X.shape[0] // 2)
    clf = make_pipeline(
        PCA(n_components=n_components, random_state=PCA_SEED),
        LogisticRegression(max_iter=1000),
    )
    scores = cross_val_score(clf, X, y, cv=LeaveOneOut(), scoring='accuracy')
    results_clf.append({
        'Section': col_en,
        'LOOCV_Accuracy': scores.mean(),
        'Samples': len(y)
    })

# === Show results ===
# Regression table: smallest FDR-corrected p-value first (missing values last).
bmi_table = pd.DataFrame(results_bmi)
results_bmi_df = bmi_table.sort_values(by='Min_FDR_p_value')

# Classification table: best LOOCV accuracy first.
clf_table = pd.DataFrame(results_clf)
results_clf_df = clf_table.sort_values(by='LOOCV_Accuracy', ascending=False)

for heading, table in (
    ("\n📊 BERT埋め込み + PCA + 回帰（FDR補正）:", results_bmi_df),
    ("\n📈 LOOCVによる分類精度:", results_clf_df),
):
    print(heading)
    display(table)

In [None]:
# === Save ===
# Write each result table to its own Excel file on Drive, without the index.
for frame, xlsx_path in (
    (results_bmi_df, '/content/drive/MyDrive/bert_bmi_regression_fdr.xlsx'),
    (results_clf_df, '/content/drive/MyDrive/bert_bmi_classification_loocv.xlsx'),
):
    frame.to_excel(xlsx_path, index=False)
print("✅ 分析結果を保存しました")
