In [None]:
import re

def text_fix(df, use_col):
    """Normalize free-text columns of ``df`` in place and return ``df``.

    For every column named in ``use_col``, each string cell is rewritten by:
      1. replacing newlines with the Japanese full stop '。',
      2. collapsing runs of '。' into a single '。',
      3. removing full-width (ideographic) spaces.

    Cells that are not ``str`` (e.g. NaN / None for missing text) are left
    untouched instead of raising ``AttributeError``.

    Args:
        df: DataFrame holding the text columns; it is modified in place.
        use_col: Iterable of column names to clean.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Compile once instead of looking the pattern up for every row.
    repeated_period = re.compile(r'。+')

    def _normalize(text):
        # Missing values and other non-string cells are passed through as-is.
        if not isinstance(text, str):
            return text
        text = text.replace('\n', '。')
        text = repeated_period.sub('。', text)
        return text.replace('　', '')

    for col in use_col:
        # One pass per column instead of three separate apply() calls.
        df[col] = df[col].apply(_normalize)
    return df

In [None]:
import torch
import torch.nn as nn
from transformers.trainer_utils import set_seed
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import pandas as pd
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from pprint import pprint
from datasets import Dataset
from typing import Union
from transformers import BatchEncoding, EarlyStoppingCallback
from collections import Counter
import os
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import japanize_matplotlib
import shap
from transformers import TrainerCallback, TrainingArguments, Trainer

class CustomTrainer(Trainer):
    """Trainer that computes the loss with a caller-supplied criterion.

    The criterion is applied to the class-1 logit only, so it is expected to
    be a binary loss that takes ``(logits, float_targets)``.
    """

    def __init__(self, *args, criterion=None, **kwargs):
        """Forward all arguments to ``Trainer`` and store ``criterion``.

        Args:
            criterion: Callable ``(logits, targets) -> loss`` used by
                ``compute_loss``.
        """
        super().__init__(*args, **kwargs)
        self.criterion = criterion

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the criterion loss on the positive-class logit.

        Args:
            model: Model called as ``model(**inputs)``.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted (and ignored) so the override stays
                call-compatible with Trainer versions that pass this keyword.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        # Keep only the logit of the positive class (column 1).
        logits = outputs.get("logits")[:, 1]
        loss = self.criterion(logits, labels.float())
        return (loss, outputs) if return_outputs else loss

    # Override save_model so parameters are made contiguous before saving.
    def save_model(self, output_dir=None, _internal_call=False):
        """Make every parameter contiguous, then run the normal save."""
        for name, param in self.model.named_parameters():
            if not param.is_contiguous():
                # Replace the non-contiguous storage with a contiguous copy.
                param.data = param.contiguous()

        # Delegate to the standard save logic.
        super().save_model(output_dir, _internal_call=_internal_call)

def set_random_seed(seed: int = 42):
    """Seed every random number generator used by the pipeline.

    Seeds transformers (``set_seed``), Python's ``random``, NumPy and PyTorch
    (CPU and all CUDA devices), and switches cuDNN to deterministic mode.

    Args:
        seed: Seed value applied to all generators.
    """
    # Local import: the file's import block does not bring `random` into
    # scope, so the original call raised NameError.
    import random

    set_seed(seed)
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so this
    # assignment presumably only affects child processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Settings for reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("乱数シード設定完了")

def report_memory():
    """Print the CUDA memory currently allocated and reserved, in MiB."""
    bytes_per_mib = 1024 ** 2
    allocated_mib = torch.cuda.memory_allocated() / bytes_per_mib
    print(f"Allocated: {allocated_mib:.2f} MiB")
    reserved_mib = torch.cuda.memory_reserved() / bytes_per_mib
    print(f"Cached: {reserved_mib:.2f} MiB")

def cleanup_gpu_memory():
    """Run garbage collection, empty the CUDA cache and print memory usage."""
    # Collect unreachable Python objects first, then empty the CUDA cache.
    gc.collect()
    torch.cuda.empty_cache()

    print("After cleanup:")
    report_memory()

class ContiguousCallback(TrainerCallback):
    """Callback that makes all model parameters contiguous on save events."""

    def on_save(self, args, state, control, **kwargs):
        """Rewrite every non-contiguous parameter of ``kwargs['model']``.

        Returns:
            The ``control`` object, unchanged.
        """
        for _, weight in kwargs['model'].named_parameters():
            if weight.is_contiguous():
                continue
            weight.data = weight.contiguous()
        return control

# def load_data(train_path: str, valid_path: str):
def load_data(original_train_df, valid_df):
    """Convert the train/validation DataFrames into ``datasets.Dataset`` objects.

    The first training record is pretty-printed as a quick sanity check.

    Returns:
        Tuple ``(train_dataset, valid_dataset)``.
    """
    train_dataset, valid_dataset = (
        Dataset.from_pandas(frame) for frame in (original_train_df, valid_df)
    )
    pprint(train_dataset[0])
    return train_dataset, valid_dataset

def tokenize_data(train_dataset, valid_dataset, col, model_name: str):
    """Tokenize the ``col`` text of both datasets with ``model_name``'s tokenizer.

    Each example is encoded on its own (truncated to the tokenizer's
    ``model_max_length``) and its ``"obj"`` value is stored as ``"labels"``.
    All original columns are dropped from the encoded datasets.

    Args:
        train_dataset: Training dataset with ``col`` and ``"obj"`` fields.
        valid_dataset: Validation dataset with the same fields.
        col: Name of the text column to tokenize.
        model_name: Name or path given to ``AutoTokenizer.from_pretrained``.

    Returns:
        Tuple ``(encoded_train_dataset, encoded_valid_dataset, tokenizer)``.

    Raises:
        Exception: Any error from tokenization/mapping is printed and re-raised.
    """
    set_random_seed(42)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Print the tokens of the first training text as a sanity check.
    first_text_tokens = tokenizer.tokenize(train_dataset[0][col])
    print(first_text_tokens)

    def preprocess_text_classification(example: dict[str, Union[str, int]]) -> BatchEncoding:
        # Use the tokenizer's maximum sequence length as the truncation limit.
        limit = tokenizer.model_max_length
        try:
            encoding = tokenizer(example[col], max_length=limit, truncation=True, padding='longest')
            encoding["labels"] = example["obj"]
        except Exception as e:
            print(f"Error processing example: {example}")
            print(f"Error message: {e}")
            raise e
        return encoding

    try:
        # The generator is consumed in order: train first, then validation.
        encoded_train_dataset, encoded_valid_dataset = (
            split.map(preprocess_text_classification, remove_columns=split.column_names)
            for split in (train_dataset, valid_dataset)
        )
    except Exception as e:
        print(f"Error during dataset mapping: {e}")
        raise e

    print(encoded_train_dataset[0])
    return encoded_train_dataset, encoded_valid_dataset, tokenizer

# Custom save helper that applies .contiguous() to the parameters before saving
def save_model_with_contiguous(model, save_directory):
    """Make every parameter of ``model`` contiguous, then call ``save_pretrained``.

    Args:
        model: Object exposing ``named_parameters()`` and ``save_pretrained()``.
        save_directory: Destination passed straight to ``save_pretrained``.
    """
    for _, weight in model.named_parameters():
        if weight.is_contiguous():
            continue
        # Swap the non-contiguous storage for a contiguous copy.
        weight.data = weight.contiguous()
    # Regular save.
    model.save_pretrained(save_directory)

def prepare_model(model_name: str, num_labels: int):
    """Load a sequence-classification model and move it to CUDA if available.

    Args:
        model_name: Name or path given to ``from_pretrained``.
        num_labels: Number of output labels for the classification head.

    Returns:
        The loaded model, placed on "cuda" when available, otherwise "cpu".
    """
    target = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(target)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    return model.to(device)

def prepare_trainer(model, encoded_train_dataset, encoded_valid_dataset, tokenizer, epoch, samples_per_class, num_labels, ratio, output_dir: str):
    """Build a ``CustomTrainer`` with a positively weighted BCE-with-logits loss.

    Args:
        model: Sequence-classification model to train.
        encoded_train_dataset: Tokenized training dataset.
        encoded_valid_dataset: Tokenized validation dataset.
        tokenizer: Tokenizer used for dynamic padding and saved with the model.
        epoch: Number of training epochs.
        samples_per_class: Unused; kept for interface compatibility.
        num_labels: Unused; kept for interface compatibility.
        ratio: Value used as ``pos_weight`` of ``nn.BCEWithLogitsLoss``.
        output_dir: Directory for Trainer checkpoints.

    Returns:
        The configured ``CustomTrainer`` (with early stopping, patience 3).
    """
    set_random_seed(42)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # Build the weight explicitly as float32: a numpy float64 `ratio` would
    # otherwise yield a float64 tensor and promote the loss to float64.
    pos_weight_tensor = torch.tensor([float(ratio)], dtype=torch.float32, device=device)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        num_train_epochs=epoch,
        save_strategy="epoch",  # save a checkpoint every epoch
        save_total_limit=1,  # keep only the most recent checkpoint
        logging_strategy="epoch",  # log every epoch
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="ROC_AUC",
        # Mixed precision only when a CUDA device is present, matching the
        # CPU fallback used for `device` above.
        fp16=use_cuda,
    )

    def compute_metrics(eval_pred):
        """Return accuracy, precision/recall/F1 (macro and weighted) and ROC AUC."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=1)
        # ROC AUC needs a continuous score; hard labels collapse the curve to
        # a single operating point. The class-1 minus class-0 logit margin
        # ranks examples exactly like the softmax probability of class 1 and
        # is consistent with the argmax decision above.
        # NOTE(review): CustomTrainer's loss trains only the class-1 logit;
        # confirm that argmax/softmax over both logits is the intended rule.
        positive_scores = logits[:, 1] - logits[:, 0]
        accuracy = accuracy_score(labels, predictions)
        precision_weighted = precision_score(labels, predictions, average='weighted')
        recall_weighted = recall_score(labels, predictions, average='weighted')
        f1_weighted = f1_score(labels, predictions, average='weighted')
        precision_macro = precision_score(labels, predictions, average='macro')
        recall_macro = recall_score(labels, predictions, average='macro')
        f1_macro = f1_score(labels, predictions, average='macro')
        roc_auc = roc_auc_score(labels, positive_scores)
        return {"accuracy": accuracy, "precision_macro": precision_macro, "recall_macro": recall_macro, "f1_macro": f1_macro, "precision_weighted": precision_weighted, "recall_weighted": recall_weighted, "f1_weighted": f1_weighted, 'ROC_AUC': roc_auc}

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=encoded_train_dataset,
        eval_dataset=encoded_valid_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # early stopping
        criterion=nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor),
    )

    return trainer

def train_and_evaluate(trainer):
    """Reseed the RNGs, run ``trainer.train()`` and return the same trainer."""
    set_random_seed(42)
    trainer.train()
    return trainer

def save_predictions(trainer, encoded_valid_dataset, valid_dataset, col, output_file: str):
    """Predict on the encoded validation set and write the results to CSV.

    The CSV has the columns ``obj`` (label ids returned by the trainer),
    ``predicted_label`` (argmax over the prediction columns) and ``post``
    (the ``col`` values of ``valid_dataset``).

    Args:
        trainer: Trainer whose ``predict`` method is used.
        encoded_valid_dataset: Tokenized dataset passed to ``predict``.
        valid_dataset: Un-encoded dataset providing the ``col`` text.
        col: Name of the text column.
        output_file: Destination CSV path.
    """
    set_random_seed(42)
    result = trainer.predict(encoded_valid_dataset)
    table = {
        'obj': result.label_ids,
        'predicted_label': result.predictions.argmax(axis=1),
        'post': valid_dataset[col],
    }
    pd.DataFrame(table).to_csv(output_file, index=False)

def evaluate_predictions(predictions_df, output_file: str):
    """Write the confusion matrix to CSV and print summary metrics.

    Args:
        predictions_df: DataFrame with ``obj`` (true label) and
            ``predicted_label`` columns.
        output_file: Destination CSV path for the confusion matrix.
    """
    y_true = predictions_df['obj']
    y_pred = predictions_df['predicted_label']

    matrix = confusion_matrix(y_true, y_pred)
    # Row/column labels: every label seen in either the truth or the predictions.
    unique_labels = sorted(set(y_true.unique()) | set(y_pred.unique()))
    pd.DataFrame(matrix, columns=unique_labels, index=unique_labels).to_csv(output_file)

    # All metrics are computed before anything is printed.
    report = (
        ("Accuracy:", accuracy_score(y_true, y_pred)),
        ("Precision:", precision_score(y_true, y_pred, average='macro')),
        ("Recall:", recall_score(y_true, y_pred, average='macro')),
        ("ROC_AUC:", roc_auc_score(y_true, y_pred)),
    )
    for caption, value in report:
        print(caption, value)

def llm_classification(MODEL_NAME, original_train_df, valid_df, epoch, model_dir, device, type, col):
    """Fine-tune a classifier on ``col`` and save predictions, metrics and model.

    Steps: build datasets, tokenize, train with a class-weighted loss, write
    validation predictions and the confusion matrix under
    ``results/{type}/{col}``, plot the loss curves, and save the model and
    tokenizer to ``model_dir``.

    Args:
        MODEL_NAME: Pretrained model name or path.
        original_train_df: Training DataFrame with ``col`` and ``obj`` columns.
        valid_df: Validation DataFrame with the same columns.
        epoch: Number of training epochs.
        model_dir: Directory where the model and tokenizer are saved.
        device: Unused here; kept for interface compatibility.
        type: Outcome identifier used in output paths.
        col: Name of the text column to train on.

    Raises:
        ValueError: If the training data contains no sample with ``obj == 1``,
            since the positive-class weight would be undefined.
    """
    set_random_seed(42)

    print("Initial memory usage:")
    report_memory()

    # exist_ok avoids the check-then-create race; the local is not called
    # `dir` so the builtin is not shadowed.
    result_dir = f"results/{type}/{col}"
    os.makedirs(result_dir, exist_ok=True)

    original_train_df = original_train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    train_dataset, valid_dataset = load_data(original_train_df, valid_df)
    encoded_train_dataset, encoded_valid_dataset, tokenizer = tokenize_data(train_dataset, valid_dataset, col, MODEL_NAME)

    # Count the samples per class with a single pass over the labels.
    labels = list(train_dataset["obj"])
    label_counts = Counter(labels)
    num_classes = len(label_counts)
    print('num_classes:', num_classes)
    samples_per_class = [label_counts[i] for i in range(num_classes)]
    print('samples_per_class')
    print(samples_per_class)

    num_labels = int(np.max(labels)) + 1
    print('num_labels:', num_labels)

    model = prepare_model(MODEL_NAME, num_labels)
    # Make every parameter tensor contiguous.
    for param in model.parameters():
        if not param.is_contiguous():
            param.data = param.data.contiguous()

    # Positive-class weight = (#negatives / #positives).
    obj_values = original_train_df['obj'].values
    n_negative = np.sum(obj_values == 0)
    n_positive = np.sum(obj_values == 1)
    if n_positive == 0:
        raise ValueError("training data contains no positive (obj == 1) samples; cannot compute pos_weight")
    ratio = n_negative / n_positive

    trainer = prepare_trainer(model, encoded_train_dataset, encoded_valid_dataset, tokenizer, epoch, samples_per_class, num_labels, ratio, "output_wrime")
    trainer = train_and_evaluate(trainer)

    predictions_path = f"{result_dir}/results_lmm_{type}_{col}.csv"
    save_predictions(trainer, encoded_valid_dataset, valid_dataset, col, predictions_path)

    predictions_df = pd.read_csv(predictions_path)
    evaluate_predictions(predictions_df, f"{result_dir}/confusion_matrix_llm_{type}_{col}.csv")

    # Snapshot the per-epoch logs BEFORE the final evaluate() call, which
    # appends one more eval entry that does not correspond to a training epoch.
    logs = list(trainer.state.log_history)

    # Final evaluation of the model.
    trainer.evaluate()

    ## Learning curve ##
    train_loss = [log['loss'] for log in logs if 'loss' in log]
    eval_loss = [log['eval_loss'] for log in logs if 'eval_loss' in log]

    # Fresh figure per call so curves from successive runs do not stack.
    plt.figure()
    plt.plot(train_loss, label='Train Loss')
    plt.plot(eval_loss, label='Eval Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Training and Evaluation Loss')

    # Show the figure, then release it.
    plt.show()
    plt.close()

    # Save the model and the tokenizer.
    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)

    # Drop the references and free cached GPU memory.
    del model, tokenizer, trainer
    cleanup_gpu_memory()

def make_vector_data(model, tokenizer, df, device, dir, kind, type, col):
    """Run inference on ``df[col]`` and save embeddings plus predictions to CSV.

    For every row the output contains the predicted class (``obj_pred``), the
    softmax probabilities (``Pred_class_0``, ``Pred_class_1``), the mean of
    the last hidden layer over dim 1 (``dim0`` ... ``dimN``) and the metadata
    columns copied from ``df``. The file is written to
    ``{dir}/BERT_vector_{kind}_{type}_{col}.csv``.

    Args:
        model: Classification model; called with ``output_hidden_states=True``.
        tokenizer: Tokenizer matching ``model``.
        df: DataFrame with ``col`` plus the columns ``pid``,
            ``hospitalization_date``, ``discharge_data``,
            ``train_validation_test`` and ``obj``.
        device: Device the tokenized inputs are moved to.
        dir: Output directory.
        kind: Split name used in log messages and the file name.
        type: Outcome identifier used in the file name.
        col: Name of the text column.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    # Fail with a clear message; previously an empty frame surfaced as
    # "range() arg 3 must not be zero".
    if len(df) == 0:
        raise ValueError(f"make_vector_data received an empty DataFrame (kind={kind!r})")

    # The chunk size equals the number of rows, i.e. the whole frame is
    # processed as a single chunk.
    num_rows_per_df = len(df)
    dfs = [df.iloc[i:i + num_rows_per_df] for i in range(0, len(df), num_rows_per_df)]

    print('推論開始')

    # Per-chunk result frames, concatenated at the end.
    submit_dfs = []

    max_length = tokenizer.model_max_length

    print('model_max_length:', max_length)

    meta_columns = ('pid', 'hospitalization_date', 'discharge_data', 'train_validation_test', 'obj')

    for df_part in dfs:
        set_random_seed(42)

        df_part = df_part.reset_index(drop=True)

        data_list = df_part[col].values.tolist()
        inputs = tokenizer(data_list, return_tensors='pt', max_length=max_length, truncation=True, padding='longest')

        # Move the inputs to the requested device.
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            set_random_seed(42)
            outputs = model(**inputs, output_hidden_states=True)
            final_layer_vectors = outputs["hidden_states"][-1]  # last-layer hidden states
            print(f"Shape of final_layer_vectors_{kind}: {final_layer_vectors.shape}")

        # Average over dim 1 (presumably the token axis - TODO confirm).
        # NOTE(review): no attention mask is applied, so padded positions are
        # included in the mean.
        final_layer_vectors = final_layer_vectors.mean(dim=1).cpu().numpy()
        column_names = [f'dim{i}' for i in range(final_layer_vectors.shape[1])]
        df_vec = pd.DataFrame(final_layer_vectors, columns=column_names).reset_index(drop=True)

        for meta_col in meta_columns:
            df_vec[meta_col] = df_part[meta_col]

        del final_layer_vectors
        cleanup_gpu_memory()

        # Class probabilities and predicted class.
        logits = outputs.logits
        pred = F.softmax(logits, dim=-1)
        pred_values = pred.cpu().numpy()  # copy to host once, reuse below
        df_pred = pd.DataFrame(pred_values, columns=['Pred_class_0', 'Pred_class_1']).reset_index(drop=True)

        result_df = pd.DataFrame(pred_values.argmax(axis=1), columns=['obj_pred']).reset_index(drop=True)

        df_merged = pd.concat([result_df, df_pred], axis=1)
        df_merged = pd.concat([df_merged, df_vec], axis=1)
        submit_dfs.append(df_merged)

        del logits, outputs, pred, pred_values, df_pred, result_df, df_part, df_merged, df_vec, inputs, data_list
        cleanup_gpu_memory()

    df_submit = pd.concat(submit_dfs, ignore_index=True)
    display(df_submit.head())
    df_submit.to_csv(f'{dir}/BERT_vector_{kind}_{type}_{col}.csv', index=False, float_format='%.30f')
    print(f'{dir}/BERT_vector_{kind}_{type}_{col}.csvの保存が完了しました')

    del submit_dfs, df_submit, dfs
    cleanup_gpu_memory()

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

MODEL_NAME = 'tohoku-nlp/bert-base-japanese-whole-word-masking'

set_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', device)

# Text columns; one model is trained per (outcome type, column) pair.
use_col = ['初診時記録_身体所見', 'ER現病歴_身体所見']

# Split each outcome DataFrame by its 'train_validation_test' marker.
train_mRS3_5 = df_mRS3_5[df_mRS3_5['train_validation_test']=='train']
valid_mRS3_5 = df_mRS3_5[df_mRS3_5['train_validation_test']=='valid']
test_mRS3_5 = df_mRS3_5[df_mRS3_5['train_validation_test']=='test']

train_mRS3_6 = df_mRS3_6[df_mRS3_6['train_validation_test']=='train']
valid_mRS3_6 = df_mRS3_6[df_mRS3_6['train_validation_test']=='valid']
test_mRS3_6 = df_mRS3_6[df_mRS3_6['train_validation_test']=='test']

train_mRS6 = df_mRS6[df_mRS6['train_validation_test']=='train']
valid_mRS6 = df_mRS6[df_mRS6['train_validation_test']=='valid']
test_mRS6 = df_mRS6[df_mRS6['train_validation_test']=='test']

print('データ準備完了')

# Number of epochs
epoch = 10

type_list = ['mRS3_5', 'mRS3_6', 'mRS6']

# (train, valid, test) splits per outcome type.
splits_by_type = {
    'mRS3_5': (train_mRS3_5, valid_mRS3_5, test_mRS3_5),
    'mRS3_6': (train_mRS3_6, valid_mRS3_6, test_mRS3_6),
    'mRS6': (train_mRS6, valid_mRS6, test_mRS6),
}

# Loop variables are deliberately not named `type` / `dir`, so the builtins
# stay usable in the rest of the notebook.
for target_type in type_list:
    train_split, valid_split, test_split = splits_by_type[target_type]
    for col in use_col:
        print('type:', target_type)
        print('use_text:', col)
        model_dir = f"models/{target_type}/{col}/BERT_{target_type}_{col}"

        train_data = train_split.copy()
        val_data = valid_split.copy()
        test_data = test_split.copy()

        # The training entry point defined above is `llm_classification`.
        llm_classification(MODEL_NAME, train_data, val_data, epoch, model_dir, device, target_type, col)

        num_labels = 2
        result_dir = f"results/{target_type}/{col}"

        # Reload the fine-tuned model and tokenizer saved by the training step.
        model = (AutoModelForSequenceClassification
              .from_pretrained(model_dir, num_labels=num_labels)
              .to(device))
        tokenizer = AutoTokenizer.from_pretrained(model_dir)

        # Run inference on the train, validation and test splits.
        model.eval()
        set_random_seed(42)
        make_vector_data(model, tokenizer, train_data, device, result_dir, 'train', target_type, col)
        make_vector_data(model, tokenizer, val_data, device, result_dir, 'valid', target_type, col)
        make_vector_data(model, tokenizer, test_data, device, result_dir, 'test', target_type, col)

        # Drop the references and free cached GPU memory before the next run.
        del model, tokenizer
        cleanup_gpu_memory()