<a href="https://colab.research.google.com/github/mitsuo/juntendo-hds/blob/main/ECG_torch_residual_%E6%94%B9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ライブラリ読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, glob, pickle, time, gc, copy, sys
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100) # maximum number of DataFrame columns shown when displaying a table

# 心電図データのダウンロード
今回は 全国医療AIコンテスト 2021 (https://www.kaggle.com/competitions/ai-medical-contest-2021/ ) の心電図データをダウンロードします。このデータは心電図から心筋梗塞かどうかを判定するタスクのためのものです。

まずデータをダウンロードしましょう。下のセルでは `ai-medical-contest-2021` というディレクトリをデフォルトのディレクトリ `/content` の下に作成し、そこに関連データをダウンロードしています。

In [None]:
# A leading "!" applies only temporarily: for example, a working-directory change
# made with it does not carry over to the commands that follow.
# Create the ai-medical-contest-2021 directory (removing any existing copy first).
!rm -rf /content/ai-medical-contest-2021
!mkdir /content/ai-medical-contest-2021

# A leading "%" applies persistently: a working-directory change made with it
# stays in effect for the commands that follow.
# Move into the ai-medical-contest-2021 directory.
%cd /content/ai-medical-contest-2021
!pwd
!ls

# Download and unpack the ECG data.
!wget http://mitsuo.nishizawa.com/juntendo/ai-medical-contest-2021.zip
!unzip ai-medical-contest-2021.zip
!ls

In [None]:
# Load the train table and report its shape as (rows, columns).
df_train = pd.read_csv("/content/ai-medical-contest-2021/train.csv")
print("df_train.shape", df_train.shape)

# Load the test table.
df_test = pd.read_csv("/content/ai-medical-contest-2021//test.csv")
print("df_test.shape", df_test.shape)

# Load the sample submission table.
df_sub = pd.read_csv("/content/ai-medical-contest-2021//sample_submission.csv")
print("df_sub.shape", df_sub.shape)

# Add a 'path' column pointing at the .npy ECG file of every record.
ecg_path_template = "/content/ai-medical-contest-2021/ecg/{}.npy"
df_train['path'] = df_train['Id'].map(ecg_path_template.format)
df_test['path'] = df_test['Id'].map(ecg_path_template.format)
print(df_train.loc[0, 'path']) # path of the first train record

# Stack train and test into one table with a fresh 0..n-1 row index.
df_traintest = pd.concat([df_train, df_test], ignore_index=True)
print(df_traintest.shape)

col_target = 'target' # name of the label column
col_index = 'Id' # name of the record-id column
positive_rate = df_train[col_target].mean() # fraction of train rows whose target is 1
print(f"rate of positive: {positive_rate:.6f}")

# Print basic information about every column of the combined train+test table:
# position, name, dtype, number of distinct values, number of missing values,
# and a preview of the first few distinct values.
df_tmp = df_traintest  # DataFrame to summarise
for i, col in enumerate(df_tmp.columns):
    unique_values = df_tmp[col].unique()  # computed once and reused below
    col_name = col.ljust(22)  # pad so the printed columns line up
    type_name = str(df_tmp[col].dtype).ljust(8)
    num_unique = len(unique_values)
    num_nan = df_tmp[col].isna().sum()
    col_head = str(unique_values[:5].tolist())[:40]  # preview, cut to 40 characters
    print("{:4d}: {} dtype: {} unique: {:8d}, nan: {:6d}, 実際の値: {}".format(
        i, col_name, type_name, num_unique, num_nan, col_head))

# Label-encode the categorical columns (replace category names with integers).
# An explicit mapping is used rather than chained `replace` calls, so the result
# does not depend on pandas silently downcasting an object column.
df_traintest['sex'] = df_traintest['sex'].map({'female': 0, 'male': 1}).astype(int)
df_traintest['label_type'] = df_traintest['label_type'].map({'human': 0, 'auto': 1}).astype(int)

# Split the combined table back into train and test.
n_train = len(df_train)  # captured before df_train is rebound on the next line
df_train = df_traintest.iloc[:n_train]
df_test = df_traintest.iloc[n_train:].reset_index(drop=True)

def _load_ecg_array(paths, n_steps=800, n_leads=12):
    """Load one .npy ECG recording per path into a single float32 array.

    Args:
        paths: Iterable (with a length) of .npy file paths, one per recording.
        n_steps: Number of time steps stored per recording.
        n_leads: Number of leads stored per recording.

    Returns:
        np.ndarray of shape (len(paths), n_steps, n_leads) and dtype float32,
        with recordings in the same order as ``paths``.
    """
    ecg = np.zeros([len(paths), n_steps, n_leads], np.float32)
    # Iterate positionally so the result does not depend on the index labels.
    for i, path in enumerate(paths):
        ecg[i] = np.load(path)
    return ecg


# Load every ECG recording. shape = (records, time steps, 12 leads)
ecg_train = _load_ecg_array(df_train['path'])
ecg_test = _load_ecg_array(df_test['path'])
print("ecg_train.shape: {}".format(ecg_train.shape))
print("ecg_test.shape: {}".format(ecg_test.shape))

# Pull the labels out of the DataFrame as an integer NumPy array
# (pandas.Series -> np.ndarray).
target_train = df_train[col_target].to_numpy().astype(int)
print(f"target_train.shape: {target_train.shape}")

In [None]:
# Split the training data into 5 folds for cross-validation: each round trains
# on 4 folds and validates on the remaining one, repeated 5 times.
# Stratification keeps the label distribution aligned across folds.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds = list(splitter.split(np.arange(len(df_train)), y=df_train[col_target]))

# Materialise the train/validation split of fold 0.
fold = 0
train_idx, valid_idx = folds[fold]
X_train, y_train = ecg_train[train_idx], target_train[train_idx] # training inputs / labels
X_valid, y_valid = ecg_train[valid_idx], target_train[valid_idx] # validation inputs / labels
print(f"X_train.shape: {X_train.shape}, X_valid.shape: {X_valid.shape}")
print(f"y_train.shape: {y_train.shape}, y_valid.shape: {y_valid.shape}")

## PyTorch

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.metrics import roc_auc_score

from tqdm import tqdm_notebook as tqdm


# Use the first CUDA GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
input_size = 700  # number of time steps in every crop returned by MyDataset


class MyDataset(torch.utils.data.Dataset):
    """Dataset returning a random fixed-length temporal crop of each recording.

    Args:
        data: Indexable collection of recordings exposing ``shape``; ``data[i]``
            must be 2-D with the time axis first.
        label: Indexable collection of targets aligned with ``data``.
    """

    def __init__(self, data, label):
        super().__init__()

        self.data = data
        self.label = label
        self.len = data.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        """Return ``(crop, label)`` where ``crop`` spans ``input_size`` time steps.

        Raises:
            ValueError: If the recording has fewer than ``input_size`` time steps.
        """
        record = self.data[index]
        n_steps = record.shape[0]
        if n_steps < input_size:
            raise ValueError(
                "recording {} has {} time steps, fewer than input_size={}".format(
                    index, n_steps, input_size))

        # randint's upper bound is exclusive, so "+ 1" keeps the last valid start
        # offset reachable; using the record's own length (not a hard-coded 800)
        # guarantees the crop is always exactly input_size long.
        start_idx = np.random.randint(0, n_steps - input_size + 1)
        out_data = record[start_idx:start_idx + input_size, :]
        out_label = self.label[index]

        return out_data, out_label

In [None]:
class Net1D(nn.Module):
    """Three-stage 1D CNN that outputs one logit per input sequence.

    Input is expected as (batch, time, 12): the last axis is moved to the
    channel position before the first convolution, which takes 12 channels.
    The output is flattened to 1-D, i.e. shape (batch,).
    """

    def __init__(self):
        super().__init__()
        # Stage 1
        self.conv1 = nn.Conv1d(in_channels=12, out_channels=64, kernel_size=7, stride=1)
        self.bn1 = nn.BatchNorm1d(num_features=64)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool1d(kernel_size=2)

        # Stage 2
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=2)
        self.bn2 = nn.BatchNorm1d(num_features=128)

        # Stage 3 and classification head
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=2)
        self.bn3 = nn.BatchNorm1d(num_features=256)
        self.gap = nn.AdaptiveAvgPool1d(output_size=1)
        self.fc = nn.Linear(in_features=256, out_features=1)

    def forward(self, x):
        # (batch, time, channels) -> (batch, channels, time) for Conv1d.
        h = x.permute(0, 2, 1)

        # conv -> batch norm -> ReLU, with max pooling after the first two stages.
        h = self.maxpool(self.relu(self.bn1(self.conv1(h))))
        h = self.maxpool(self.relu(self.bn2(self.conv2(h))))
        h = self.relu(self.bn3(self.conv3(h)))

        # Global average pooling collapses the time axis; then one linear unit.
        pooled = self.gap(h).flatten(1)
        logits = self.fc(pooled)
        return logits.view(-1)

In [None]:
# https://github.com/eddymina/ECG_Classification_Pytorch/blob/master/ECG_notebook.ipynb

import torch.nn as nn
import torch.nn.functional as F

# Print a short note on when a 1D CNN is an effective choice.
print("""\nA 1D CNN is very effective when you expect to derive interesting features from shorter
(fixed-length) segments of the overall data set and where the location of the feature
within the segment is not of high relevance.\n""")

class Anomaly_Classifier(nn.Module):
    """Residual 1D CNN followed by a small MLP head.

    An unpadded convolution lifts the 12 input channels to 32, then four
    residual blocks are applied, each followed by max pooling. Every residual
    block reuses the same ``conv_pad`` layer, so its weights are shared.
    ``dense1`` takes 1280 flattened features, which corresponds to
    32 channels x 40 time steps for an input with 700 time steps.

    Args:
        num_classes: Number of output units of the final linear layer.
    """

    N_RESIDUAL_BLOCKS = 4

    def __init__(self, num_classes):
        super().__init__()

        self.conv = nn.Conv1d(12, 32, kernel_size=5, stride=1)
        self.conv_pad = nn.Conv1d(32, 32, kernel_size=5, stride=1, padding=2)
        self.drop_50 = nn.Dropout(p=0.5)  # registered but not applied in forward()
        self.maxpool = nn.MaxPool1d(kernel_size=5, stride=2)

        self.dense1 = nn.Linear(1280, 32)
        self.dense2 = nn.Linear(32, 32)
        self.dense_final = nn.Linear(32, num_classes)

    def _residual_block(self, skip):
        """Apply conv-ReLU-conv, add the skip input, then ReLU and max-pool."""
        h = F.relu(self.conv_pad(skip))
        h = self.conv_pad(h)
        return self.maxpool(F.relu(h + skip))

    def forward(self, x):
        """Map a (batch, time, 12) input to a flattened 1-D tensor of outputs."""
        # (batch, time, channels) -> (batch, channels, time) for Conv1d.
        h = self.conv(x.permute(0, 2, 1))

        for _ in range(self.N_RESIDUAL_BLOCKS):
            h = self._residual_block(h)

        # MLP head on the flattened (channels * time) features.
        h = h.view(h.size(0), -1)
        h = F.relu(self.dense1(h))
        h = self.dense_final(self.dense2(h))
        return h.view(-1)

In [None]:
# Build the Net1D model, show its layers, and report its total parameter count.
model = Net1D()
print(model)

total_params = 0
for param in model.parameters():
    total_params += param.numel()
print(total_params)

In [None]:
# Cross-validation training: train one Anomaly_Classifier per fold, keep the
# weights and validation predictions of the epoch with the best validation AUC,
# then score the concatenated out-of-fold (OOF) predictions.
cv = 0
n_splits = 5
optimizer_name = 'Adam'
lr = 0.001
EPOCHS=30

list_weights = []      # per fold: state_dict snapshot taken at the best validation AUC
best_preds_list = []   # per fold: validation predictions from the best epoch
valid_label_list = []  # per fold: validation labels, in the same order as the predictions
for fold in range(n_splits):
    train_idx, valid_idx = folds[fold]

    # The whole fold is moved to the device up front; batches are sliced from it.
    X_train = torch.FloatTensor(ecg_train[train_idx]).to(device)   # training inputs
    y_train = torch.FloatTensor(target_train[train_idx]).to(device)  # training labels
    X_valid = torch.FloatTensor(ecg_train[valid_idx]).to(device)   # validation inputs
    y_valid = torch.FloatTensor(target_train[valid_idx]).to(device)  # validation labels

    dataset = MyDataset(X_train, y_train)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

    # NOTE: MyDataset crops at a random offset, so validation inputs (and hence
    # validation metrics) are not identical from one epoch or run to the next.
    dataset_val = MyDataset(X_valid, y_valid)
    dataloader_val = torch.utils.data.DataLoader(dataset_val, batch_size=64, shuffle=False)

    model = Anomaly_Classifier(num_classes= 1).to(device)
    optimizer = getattr(torch.optim, optimizer_name)(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()  # the model outputs raw logits

    # Start below any possible AUC so the first epoch always records a result.
    best_auc = -np.inf
    best_preds = None
    best_weight = None
    for e in range(EPOCHS):
        # ---- training ----
        avg_loss = 0
        model.train()
        for i, (data, y_target) in enumerate(dataloader):
            optimizer.zero_grad()
            y_pred = model(data)
            loss = criterion(y_pred, y_target)
            loss.backward()
            optimizer.step()

            avg_loss += loss.item() / len(dataloader)

        # ---- validation ----
        model.eval()
        avg_val_loss = 0.
        valid_labels = []
        preds = []
        with torch.no_grad():
            for i, (data, y_target) in enumerate(dataloader_val):
                y_pred = model(data)
                valloss = criterion(y_pred, y_target)
                avg_val_loss += valloss.item() / len(dataloader_val)
                valid_labels.append(y_target.to('cpu').numpy())
                preds.append(F.sigmoid(y_pred).cpu().numpy())
        preds = np.concatenate(preds)
        valid_labels = np.concatenate(valid_labels)
        val_auc = roc_auc_score(valid_labels,preds[:])

        print('E {}: train loss: {} val loss: {} val AUC: {}'.format(
            e, avg_loss, avg_val_loss, val_auc))

        if best_auc < val_auc:
            best_auc = val_auc
            best_preds = preds
            print(f'  Epoch {e} - Save Best AUC: {best_auc:.4f}')
            # state_dict() returns references to the live parameter tensors, which
            # later optimizer steps keep modifying; deep-copy to freeze a snapshot.
            best_weight = copy.deepcopy(model.state_dict())

    list_weights.append(best_weight)
    best_preds_list.append(best_preds)
    # The validation loader is not shuffled, so labels line up with best_preds.
    valid_label_list.append(valid_labels)

## calc oof
# Score all folds together: every training sample appears in exactly one
# validation fold, so the concatenation covers the whole training set.
best_preds_list = np.concatenate(best_preds_list)
valid_label_list = np.concatenate(valid_label_list)
oof_auc = roc_auc_score(valid_label_list, best_preds_list)
print(f"OOF_AUC{oof_auc}")

In [None]:
# Predict the test set with the best model of every fold.
preds_test = np.zeros([n_splits, len(df_test)], np.float32) # destination: one row of predictions per fold

# The test tensor, dataset and loader are the same for every fold, so build them once.
X_test = torch.FloatTensor(ecg_test).to(device)
dataset_test = MyDataset(X_test, np.zeros(X_test.shape[0]))  # labels are unused placeholders
dataloader_test = torch.utils.data.DataLoader(dataset_test, batch_size=64, shuffle=False)

for fold, w in tqdm(enumerate(list_weights), total=len(list_weights)):
    # Rebuild the architecture and restore this fold's saved weights.
    model = Anomaly_Classifier(num_classes= 1).to(device)
    model.load_state_dict(w)
    model.eval()
    l_p = []
    with torch.no_grad():
        for data, _ in dataloader_test:
            y_p = model(data)
            y_p = F.sigmoid(y_p).cpu().numpy()  # logits -> probabilities
            l_p.append(y_p)
    y_pred = np.concatenate(l_p)
    preds_test[fold] = y_pred

In [None]:
import seaborn as sns
%matplotlib inline

In [None]:
# Heatmap of the test predictions (preds_test has one row per fold and one column per test record).
sns.heatmap(preds_test)

In [None]:
### Create the submission file
# The final prediction is the mean of the per-fold model predictions.
preds_test_mean = preds_test.mean(axis=0)
print("preds_test_mean.shape: {}".format(preds_test_mean.shape))
df_sub[col_target] = preds_test_mean # store the predictions (reuses the mean computed above)
df_sub.to_csv("submission.torch.csv", index=False) # write the submission file without the row index
df_sub.head() # show the first 5 rows