<a href="https://colab.research.google.com/github/maho1224/med/blob/main/untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from transformers import AutoTokenizer, AutoModel
import glob


In [None]:
# === データ読み込み ===
df = pd.read_excel('/content/drive/MyDrive/summary_processed.xlsx')
df['退院日'] = pd.to_datetime(df['退院日'], errors='coerce')
df['入院日'] = pd.to_datetime(df['入院日'], errors='coerce')
df['ID'] = df['ID'].astype(str)

# 体重データ読み込み
bmi_data = []
weight_files_path = '/content/drive/MyDrive/体重改/*_standardized.xlsx'
weight_files = glob.glob(weight_files_path)

for filepath in weight_files:
    filename = filepath.split('/')[-1]
    patient_id = filename.split('_')[0]
    patient_id = str(patient_id)
    if patient_id not in df['ID'].values:
        continue

    df_weight = pd.read_excel(filepath)
    df_weight.dropna(inplace=True)
    df_weight.reset_index(drop=True, inplace=True)
    df_weight['Date'] = pd.to_datetime(df_weight['Date'], errors='coerce')

    row = df[df['ID'] == patient_id].iloc[0]
    admission_date = row['入院日']
    discharge_date = row['退院日']

    initial_bmi_row = df_weight[df_weight['Date'] == admission_date]
    initial_bmi = initial_bmi_row['BMI'].values[0] if not initial_bmi_row.empty else np.nan

    final_bmi_row = df_weight[df_weight['Date'] == discharge_date]
    if not final_bmi_row.empty:
        final_bmi = final_bmi_row['BMI'].values[0]
    else:
        df_before_discharge = df_weight[df_weight['Date'] <= discharge_date]
        if not df_before_discharge.empty:
            df_before_discharge = df_before_discharge.copy()
            df_before_discharge['DateDiff'] = (discharge_date - df_before_discharge['Date']).abs()
            nearest_row = df_before_discharge.loc[df_before_discharge['DateDiff'].idxmin()]
            final_bmi = nearest_row['BMI']
        else:
            final_bmi = np.nan

    bmi_data.append([patient_id, initial_bmi, final_bmi])
    print(f"{patient_id}: Initial BMI = {initial_bmi}, Final BMI = {final_bmi}")

5479391: Initial BMI = 13.77046290788111, Final BMI = 14.73439531143279
5465819: Initial BMI = nan, Final BMI = 16.33866932836337
5479173: Initial BMI = 10.98363901883616, Final BMI = 13.7682235588228
5492262: Initial BMI = 12.73116652849753, Final BMI = 14.92
5201440: Initial BMI = 14.42815405712013, Final BMI = 16.28724393562861
5454831: Initial BMI = 11.01655095872175, Final BMI = 13.47606000997126
5474976: Initial BMI = 19.80435654223565, Final BMI = 18.38232743850244
5353008: Initial BMI = 18.40044254228899, Final BMI = 15.8383556060209
4937786: Initial BMI = 10.79, Final BMI = 16.03
5398821: Initial BMI = 14.11626311145966, Final BMI = 17.34034789617576
5329960: Initial BMI = 10.71158442443151, Final BMI = 16.02884575742269
3266654: Initial BMI = 11.49292726054506, Final BMI = 15.76286898581701
5418998: Initial BMI = 13.22568866891152, Final BMI = 16.06556046898454
4611383: Initial BMI = 13.7231833099314, Final BMI = 16.52639453482593
5396186: Initial BMI = 11.51644854824754, Fin

In [None]:
bmi_df = pd.DataFrame(bmi_data, columns=['ID', 'Initial BMI', 'Final BMI'])
df = df.merge(bmi_df, on='ID', how='left')
df = df[df['Initial BMI'].notna() & df['Final BMI'].notna()].copy()
df['Delta_BMI'] = df['Final BMI'] - df['Initial BMI']
df['Good_Outcome'] = df['Final BMI'].apply(lambda x: 1 if x >= 16 else 0)

#Good_Outcomeは何人
print(df['Good_Outcome'].value_counts())

Good_Outcome
1    35
0    13
Name: count, dtype: int64


In [None]:
# === セクション定義 ===
section_map = {
    '入院_主訴': 'Chief Complaint',
    '入院_家族背景': 'Family History',
    '入院_社会背景': 'Social Background',
    '入院_現病歴': 'Present Illness',
    '入院_精神医学的現症': 'Mental State',
    '退院_検査所見': 'Test Results',
    '退院_入院後経過': 'Hospital Course'
}


In [None]:
# === TF-IDF（LOOCV）===
tfidf_results = []
for col_jp, col_en in section_map.items():
    print(f"[TF-IDF] {col_en}")
    try:
        temp = df[[col_jp, 'Good_Outcome']].dropna()
        if temp.shape[0] < 5: continue
        texts = temp[col_jp].astype(str).tolist()
        y = temp['Good_Outcome'].values
        vectorizer = TfidfVectorizer(max_features=1000)
        X = vectorizer.fit_transform(texts).toarray()
        loo = LeaveOneOut()
        preds, probs, trues = [], [], []
        for train_idx, test_idx in loo.split(X):
            clf = LogisticRegression(max_iter=1000)
            clf.fit(X[train_idx], y[train_idx])
            preds.append(clf.predict(X[test_idx])[0])
            probs.append(clf.predict_proba(X[test_idx])[0][1])
            trues.append(y[test_idx][0])
        tfidf_results.append({
            'Section': col_en,
            'AUC': roc_auc_score(trues, probs),
            'Accuracy': accuracy_score(trues, preds),
            'Confusion Matrix': confusion_matrix(trues, preds).tolist(),
            'Error': None
        })
    except Exception as e:
        tfidf_results.append({
            'Section': col_en,
            'AUC': None, 'Accuracy': None, 'Confusion Matrix': None, 'Error': str(e)
        })
tfidf_df = pd.DataFrame(tfidf_results).sort_values('AUC', ascending=False)
print("\n=== TF-IDF結果 ===")
print(tfidf_df)


[TF-IDF] Chief Complaint
[TF-IDF] Family History
[TF-IDF] Social Background
[TF-IDF] Present Illness
[TF-IDF] Mental State
[TF-IDF] Test Results
[TF-IDF] Hospital Course

=== TF-IDF結果 ===
             Section       AUC  Accuracy    Confusion Matrix Error
1     Family History  0.672619  0.700000  [[0, 12], [0, 28]]  None
3    Present Illness  0.501241  0.704545  [[0, 13], [0, 31]]  None
5    Hospital Course  0.432292  0.727273  [[0, 12], [0, 32]]  None
0    Chief Complaint  0.323077  0.697674  [[0, 13], [0, 30]]  None
4       Mental State  0.284615  0.697674  [[0, 13], [0, 30]]  None
2  Social Background  0.280556  0.714286  [[0, 12], [0, 30]]  None


🔍 表の意味
Section	AUC	Accuracy	Confusion Matrix	Error
例: "Family History"	0.67	0.70	[[0, 12], [0, 28]]	None

Section：文章の種類（例：主訴、家族歴、社会的背景など）

AUC（Area Under the ROC Curve）：

モデルが「予後がGoodかPoorか」をどれくらい判別できているかの指標（0.5がランダム、1.0が完全）

高いほど良い（一般に0.7以上で意味あり）

Accuracy：

正しく分類できた割合（例：70%）

ただしクラス不均衡に弱い（例：Goodが多ければ全部Goodと予測しても高く見える）

Confusion Matrix：

[[TN, FP], [FN, TP]] の形式（例： [[0,12], [0,28]] は全てをPositive（Good）に予測してる）

偏った予測傾向がないか確認できる

🧠 どう解釈する？
AUCが0.5前後 → 予測できていない

例：「Mental State」「Test Results」はAUC 0.28〜0.33 → 無作為と変わらない

AUCが0.67などそこそこ高い → 有望なセクション

「Family History」がAUC 0.67 → 家族歴の情報が予後とやや関係していそう

Confusion Matrixの偏り注意

例：[[0, 12], [0, 28]] → 全部を「Good」と予測している（TN, FNゼロ）

AUCが高くても、偏った予測の可能性があるので注意（たとえば陽性率が高すぎるなど）

✅ 解釈のポイント
AUC優先で比較するのが安全（Accuracyはクラス不均衡に影響されやすい）

Confusion Matrixで予測の偏りを確認する

セクションごとのAUCを並べて、「どの記述が有用か」ランキングとして見るのもOK


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut

# ==== SentencePieceモデル（語彙32k） ====
tokenizer = AutoTokenizer.from_pretrained("alabnii/jmedroberta-base-sentencepiece")
bert_model = AutoModel.from_pretrained("alabnii/jmedroberta-base-sentencepiece")

def get_bert_embeddings(texts):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
            cls = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
            embeddings.append(cls)
    return np.array(embeddings)

# ==== BERT + PCA + LOOCV ====
bert_results = {}
for col_jp, col_en in section_map.items():
    print(f"\n🔍 処理中: {col_jp} ({col_en})")
    try:
        temp = df[[col_jp, 'Good_Outcome']].dropna()
        if temp.shape[0] < 3:
            print("⚠ データが少ないためスキップ")
            continue

        texts = temp[col_jp].astype(str).tolist()
        y = temp["Good_Outcome"].values

        X_bert = get_bert_embeddings(texts)
        pca = PCA(n_components=min(10, X_bert.shape[1]))  # 過学習防止で次元を制限
        X = pca.fit_transform(X_bert)

        loo = LeaveOneOut()
        preds, trues, probs = [], [], []
        for train_idx, test_idx in loo.split(X):
            clf = LogisticRegression(max_iter=1000)
            clf.fit(X[train_idx], y[train_idx])
            preds.append(clf.predict(X[test_idx])[0])
            probs.append(clf.predict_proba(X[test_idx])[0][1])
            trues.append(y[test_idx][0])

        # 必要なら結果格納
        bert_results[col_en] = {"y_true": trues, "y_pred": preds, "prob": probs}

    except Exception as e:
        print(f"❌ エラー: {e}")


Some weights of BertModel were not initialized from the model checkpoint at alabnii/jmedroberta-base-sentencepiece and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🔍 処理中: 入院_主訴 (Chief Complaint)

🔍 処理中: 入院_家族背景 (Family History)

🔍 処理中: 入院_社会背景 (Social Background)

🔍 処理中: 入院_現病歴 (Present Illness)

🔍 処理中: 入院_精神医学的現症 (Mental State)

🔍 処理中: 退院_検査所見 (Test Results)
⚠ データが少ないためスキップ

🔍 処理中: 退院_入院後経過 (Hospital Course)


In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
import pandas as pd

bert_summary = []

for section, res in bert_results.items():
    try:
        y_true = res['y_true']
        y_pred = res['y_pred']
        prob = res['prob']
        auc = roc_auc_score(y_true, prob)
        acc = accuracy_score(y_true, y_pred)
        cm = confusion_matrix(y_true, y_pred).tolist()

        bert_summary.append({
            "Section": section,
            "AUC": auc,
            "Accuracy": acc,
            "Confusion Matrix": cm,
            "Error": None
        })
    except Exception as e:
        bert_summary.append({
            "Section": section,
            "AUC": None,
            "Accuracy": None,
            "Confusion Matrix": None,
            "Error": str(e)
        })

bert_df = pd.DataFrame(bert_summary).sort_values("AUC", ascending=False)
print("\n=== BERT結果 ===")
print(bert_df)



=== BERT結果 ===
             Section       AUC  Accuracy    Confusion Matrix Error
1     Family History  0.776786  0.775000   [[8, 4], [5, 23]]  None
0    Chief Complaint  0.617949  0.720930   [[6, 7], [5, 25]]  None
4       Mental State  0.574359  0.604651  [[3, 10], [7, 23]]  None
2  Social Background  0.569444  0.690476   [[4, 8], [5, 25]]  None
3    Present Illness  0.454094  0.568182  [[2, 11], [8, 23]]  None
5    Hospital Course  0.453125  0.590909  [[1, 11], [7, 25]]  None


In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
import statsmodels.api as sm
import pandas as pd

bert_summary = []

for section, result in bert_results.items():
    try:
        y_true = result['y_true']
        y_pred = result['y_pred']
        prob = result['prob']

        acc = accuracy_score(y_true, y_pred)
        auc = roc_auc_score(y_true, prob)
        cm = confusion_matrix(y_true, y_pred).tolist()

        # p値のためのロジスティック回帰（確率で予測）
        X = sm.add_constant(prob)
        model = sm.Logit(y_true, X).fit(disp=0)
        p_val = model.pvalues[1]

        bert_summary.append({
            'Section': section,
            'AUC': auc,
            'Accuracy': acc,
            'Confusion Matrix': cm,
            'p-value': p_val
        })
    except Exception as e:
        bert_summary.append({
            'Section': section,
            'AUC': None,
            'Accuracy': None,
            'Confusion Matrix': None,
            'p-value': None,
            'Error': str(e)
        })

bert_df = pd.DataFrame(bert_summary).sort_values('AUC', ascending=False)
print("\n=== 🧠 BERT分類結果 ===")
display(bert_df)



=== 🧠 BERT分類結果 ===


Unnamed: 0,Section,AUC,Accuracy,Confusion Matrix,p-value
1,Family History,0.776786,0.775,"[[8, 4], [5, 23]]",0.00516
0,Chief Complaint,0.617949,0.72093,"[[6, 7], [5, 25]]",0.107259
4,Mental State,0.574359,0.604651,"[[3, 10], [7, 23]]",0.727477
2,Social Background,0.569444,0.690476,"[[4, 8], [5, 25]]",0.283417
3,Present Illness,0.454094,0.568182,"[[2, 11], [8, 23]]",0.627735
5,Hospital Course,0.453125,0.590909,"[[1, 11], [7, 25]]",0.379166


In [24]:
import statsmodels.api as sm

significant_pcs = {}

for col_jp, col_en in section_map.items():
    try:
        temp = df[[col_jp, 'Final BMI', 'Initial BMI']].dropna()
        if temp.shape[0] < 5:
            continue

        texts = temp[col_jp].astype(str).tolist()
        delta_bmi = (temp['Final BMI'] - temp['Initial BMI']).values

        X_bert = get_bert_embeddings(texts)
        X_pca = PCA(n_components=min(10, X_bert.shape[1]))
        X_pcs = X_pca.fit_transform(X_bert)

        X_const = sm.add_constant(X_pcs)
        ols_result = sm.OLS(delta_bmi, X_const).fit()

        # ここが重要：p値は .pvalues としてアクセス
        pvals = model.pvalues[1:]  # 定数項（intercept）を除く
        sig_indices = [i for i, p in enumerate(pvals) if p < 0.05]

        significant_pcs[col_en] = {
            'p-values': pvals.tolist(),
            'significant_PC_indices': sig_indices,
        }

    except Exception as e:
        significant_pcs[col_en] = {'Error': str(e)}

# 出力部分はそのままでOK
print("=== 有意な主成分 ===")
for sec, res in significant_pcs.items():
    print(f"\n【{sec}】")
    if 'Error' in res:
        print(f"❌ Error: {res['Error']}")
    else:
        print(f"p-values: {res['p-values']}")
        print(f"有意 (p<0.05) なPCの番号: {res['significant_PC_indices']}")


=== 有意な主成分 ===

【Chief Complaint】
❌ Error: 'BinaryResultsWrapper' object is not callable

【Family History】
❌ Error: 'BinaryResultsWrapper' object is not callable

【Social Background】
❌ Error: 'BinaryResultsWrapper' object is not callable

【Present Illness】
❌ Error: 'BinaryResultsWrapper' object is not callable

【Mental State】
❌ Error: 'BinaryResultsWrapper' object is not callable

【Hospital Course】
❌ Error: 'BinaryResultsWrapper' object is not callable


In [31]:
bert_model = AutoModel.from_pretrained(...)  # BERT用
...
ols_result = sm.OLS(...).fit()  # 回帰分析結果


OSError: Ellipsis is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`