data

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
import matplotlib.pyplot as plt



In [22]:
# Load dataset
file_path = "D:/★낭파일/FINAL_DATASET_USE.csv"
df = pd.read_csv(file_path, encoding='utf-8')

# Display dataset information
print(df.head())
print(f"Data Shape: {df.shape}")

   PERSON_ID  RECU_FR_DT       KEY_SEQ                      sick_all  \
0   10037780    20060126  200602575138                           E11   
1   10037780    20060408  200606628983      M255-K580-K297-J302-J440   
2   10037780    20060520  200608497156                           R81   
3   10037780    20060214  200604344934  E11-K580-M255-J302-J440-K297   
4   10037780    20051101  200513215703      M255-K580-K297-J302-J440   

          drug                  Treatment  Cardiovascular_Disease  Neuropathy  \
0  AA254-AL801            KK010-A01401221                       0           0   
1  AA254-AL801  KK010-A17600771-A03502651                       0           0   
2  AA254-AL801            KK010-A03502651                       0           0   
3  AA254-AL801            KK010-A03502651                       0           0   
4  AA254-AL801  KK010-A20750491-A01401221                       0           0   

   Peripheral_Vascular_Disease  Cerebrovascular_Disease  Eye_Disease  \
0       

In [23]:
# Preprocess Data for Multilabel Binary Classification
# Convert RECU_FR_DT to datetime
df['timestamp'] = pd.to_datetime(df['RECU_FR_DT'], format='%Y%m%d')
df = df.sort_values(by=['PERSON_ID', 'timestamp'])

# Define complications
complication_columns = [
    'Cardiovascular_Disease', 'Neuropathy', 'Peripheral_Vascular_Disease',
    'Cerebrovascular_Disease', 'Eye_Disease', 'Kidney_Disease'
]

In [24]:
# Filter data: Use only records before the first occurrence of each complication and adjust complication variables
def filter_and_adjust_data(df, complication_columns):
    filtered_data = []
    for person_id, group in df.groupby('PERSON_ID'):
        group = group.copy()
        for col in complication_columns:
            if group[col].sum() > 0:  # Check if the complication occurs
                try:
                    first_occurrence_idx = group[group[col] == 1].index[0]
                    group = group.loc[:first_occurrence_idx - 1]  # Keep records before the first occurrence
                    group[col] = 1  # Set all rows for this complication to 1 for this patient
                except KeyError:
                    print(f"KeyError for PERSON_ID={person_id}, column={col}")
                    continue
            else:
                group[col] = 0  # If complication never occurred, set all rows to 0
        filtered_data.append(group)
    return pd.concat(filtered_data, ignore_index=True)

df = filter_and_adjust_data(df, complication_columns)

KeyError for PERSON_ID=11041441, column=Eye_Disease
KeyError for PERSON_ID=12578132, column=Cerebrovascular_Disease
KeyError for PERSON_ID=13174014, column=Peripheral_Vascular_Disease
KeyError for PERSON_ID=14836440, column=Peripheral_Vascular_Disease
KeyError for PERSON_ID=15619211, column=Eye_Disease
KeyError for PERSON_ID=15946398, column=Neuropathy
KeyError for PERSON_ID=16240534, column=Cerebrovascular_Disease
KeyError for PERSON_ID=16990477, column=Eye_Disease
KeyError for PERSON_ID=17649842, column=Peripheral_Vascular_Disease
KeyError for PERSON_ID=18565289, column=Peripheral_Vascular_Disease
KeyError for PERSON_ID=19150771, column=Eye_Disease
KeyError for PERSON_ID=19481243, column=Peripheral_Vascular_Disease
KeyError for PERSON_ID=19805932, column=Eye_Disease
KeyError for PERSON_ID=20758506, column=Kidney_Disease
KeyError for PERSON_ID=20778497, column=Peripheral_Vascular_Disease
KeyError for PERSON_ID=21839282, column=Peripheral_Vascular_Disease
KeyError for PERSON_ID=2303363

In [25]:
# Display dataset information
print(df.head())
print(f"Data Shape: {df.shape}")

   PERSON_ID  RECU_FR_DT       KEY_SEQ             sick_all  \
0   10037780    20050429  200506040901        M255-E11-K297   
1   10037780    20050520  200506848618  K297-M255-J302-J440   
2   10037780    20050607  200508169905  K297-M255-J302-J440   
3   10037780    20050706  200509032268  K297-J302-M255-J440   
4   10037780    20050725  200509032268                 K580   

                drug                  Treatment  Cardiovascular_Disease  \
0  AA154-AA254-AL801  KK010-A03005211-A01401221                       1   
1        AA254-AL801            KK010-A01401221                       1   
2        AA254-AL801            KK010-A01401221                       1   
3        AA254-AL801            KK010-A01401221                       1   
4        AA254-AL801            KK010-A01401221                       1   

   Neuropathy  Peripheral_Vascular_Disease  Cerebrovascular_Disease  \
0           1                            0                        0   
1           1               

In [26]:
# Word2Vec Embedding
columns_to_encode = ['sick_all', 'drug', 'Treatment']
combined_codes = [
    code.split('-') for col in columns_to_encode for code in df[col].dropna()
]
word2vec_model = Word2Vec(sentences=combined_codes, vector_size=50, window=5, min_count=1, sg=1, epochs=50)

In [27]:
# Encode vectors
def encode_vectors(df, word2vec_model, column_name):
    vectors = []
    for codes in df[column_name]:
        vector = np.mean(
            [word2vec_model.wv[code] for code in codes.split('-') if code in word2vec_model.wv],
            axis=0
        )
        vectors.append(vector)
    return torch.tensor(vectors, dtype=torch.float32)

encoded_sick_all = encode_vectors(df, word2vec_model, 'sick_all')
encoded_drug = encode_vectors(df, word2vec_model, 'drug')
encoded_treatment = encode_vectors(df, word2vec_model, 'Treatment')
encoded_data = torch.cat((encoded_sick_all, encoded_drug, encoded_treatment), dim=1)

y = torch.tensor(df[complication_columns].values, dtype=torch.float32)

  return torch.tensor(vectors, dtype=torch.float32)


In [34]:
# Patient2Vec Model for Multilabel Binary Classification
class Patient2Vec(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers, att_dim, output_size, hops, bi=True, dropout_p=0.5):
        super(Patient2Vec, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.hops = hops
        self.bi = bi
        self.rnn = nn.GRU(input_size, hidden_size, n_layers, bidirectional=bi, batch_first=True, dropout=dropout_p)
        self.att_w = nn.Linear(hidden_size * (2 if bi else 1), att_dim)
        self.att_u = nn.Linear(att_dim, hops)
        self.linear = nn.Linear(hidden_size * (2 if bi else 1) * hops, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, inputs):
        batch_size, seq_len, _ = inputs.size()
        rnn_out, _ = self.rnn(inputs)  # Shape: (batch_size, seq_len, hidden_size * (2 if bi else 1))
        att_w = torch.tanh(self.att_w(rnn_out))  # Shape: (batch_size, seq_len, att_dim)
        att_u = self.att_u(att_w).transpose(1, 2)  # Shape: (batch_size, hops, seq_len)
        att_weights = torch.softmax(att_u, dim=2)  # Normalize over time steps

        # Adjust dimensions for context vector calculation
        context = torch.bmm(att_weights, rnn_out)  # Shape: (batch_size, hops, hidden_size * (2 if bi else 1))
        context = context.view(batch_size, -1)  # Flatten for fully connected layer

        output = torch.sigmoid(self.linear(context))  # Final output
        return output, att_weights

In [66]:
# Model initialization
input_size = encoded_data.size(1)
hidden_size = 32
n_layers = 2
att_dim = 32
output_size = y.size(1)  # Number of complications
hops = 4

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Patient2Vec(input_size, hidden_size, n_layers, att_dim, output_size, hops).to(device)

# Training setup
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Create DataLoader
data_loader = DataLoader(list(zip(encoded_data, y)), batch_size=64, shuffle=True)

In [67]:
# Training loop
for epoch in range(20):  # Example epoch count
    model.train()
    epoch_loss = 0
    for inputs, labels in data_loader:
        inputs = inputs.view(inputs.size(0), -1, input_size).to(device)  # Ensure 3D input (batch_size, seq_len, input_size)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs, _ = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

# Evaluation metrics
def evaluate_model(model, data_loader):
    model.eval()
    all_outputs, all_labels = [], []
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs = inputs.view(inputs.size(0), -1, input_size).to(device)  # Ensure 3D input
            labels = labels.to(device)
            outputs, _ = model(inputs)
            all_outputs.append(outputs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
    y_pred = np.vstack(all_outputs)
    y_true = np.vstack(all_labels)
    return {
        "AUC": roc_auc_score(y_true, y_pred, average='macro'),
        "F1": f1_score(y_true, (y_pred > 0.5).astype(int), average='macro'),
        "Precision": precision_score(y_true, (y_pred > 0.5).astype(int), average='macro'),
        "Recall": recall_score(y_true, (y_pred > 0.5).astype(int), average='macro')
    }

Epoch 1, Loss: 1459.1831
Epoch 2, Loss: 1433.1174
Epoch 3, Loss: 1424.2299
Epoch 4, Loss: 1418.8243
Epoch 5, Loss: 1414.6155
Epoch 6, Loss: 1409.8528
Epoch 7, Loss: 1404.8722
Epoch 8, Loss: 1401.0655
Epoch 9, Loss: 1396.4886
Epoch 10, Loss: 1392.1849
Epoch 11, Loss: 1388.4572
Epoch 12, Loss: 1384.8552
Epoch 13, Loss: 1381.5531
Epoch 14, Loss: 1378.5787
Epoch 15, Loss: 1375.1725
Epoch 16, Loss: 1373.0102
Epoch 17, Loss: 1370.4746
Epoch 18, Loss: 1367.9911
Epoch 19, Loss: 1365.6926
Epoch 20, Loss: 1364.3132


In [68]:
# Evaluate and visualize
metrics = evaluate_model(model, data_loader)
print(metrics)

{'AUC': 0.6861693426521399, 'F1': 0.08983804811093822, 'Precision': 0.7833090475860077, 'Recall': 0.050560659466917755}


NEW MODEL TEST

In [69]:
from pytorch_tabnet.tab_model import TabNetClassifier

In [108]:
# Combine embeddings with complications
df_encoded = pd.DataFrame(
    encoded_data.view(num_samples, -1).cpu().numpy(),
    columns=[f"embedding_{i}" for i in range(encoded_data.size(1) * encoded_data.size(2))]
)
df_encoded['PERSON_ID'] = df['PERSON_ID'].values

# Add complications
for col in complication_columns:
    df_encoded[col] = df[col].values

print(df_encoded.head())
print(f"Final Encoded Data Shape with Complications: {df_encoded.shape}")


   embedding_0  embedding_1  embedding_2  embedding_3  embedding_4  \
0     0.362635     0.739401     0.424431     0.637805     0.443612   
1     0.279705     0.633608     0.362313     0.494067     0.401619   
2     0.279705     0.633608     0.362313     0.494067     0.401619   
3     0.279705     0.633608     0.362313     0.494067     0.401619   
4     0.337995     0.641400     0.413080     0.680473     0.522285   

   embedding_5  embedding_6  embedding_7  embedding_8  embedding_9  ...  \
0     0.383196     0.410649     0.650798     0.446770     0.450827  ...   
1     0.328526     0.480369     0.508390     0.517885     0.450338  ...   
2     0.328526     0.480369     0.508390     0.517885     0.450338  ...   
3     0.328526     0.480369     0.508390     0.517885     0.450338  ...   
4     0.546734     0.325973     0.392710     0.540406     0.387972  ...   

   embedding_147  embedding_148  embedding_149  PERSON_ID  \
0       0.368398       0.481946       0.672105   10037780   
1     

In [109]:
# Load dataset
df_new = pd.read_csv("D:/★낭파일/QUERY_FOR_FINAL_DM_COMPLICATIONS.csv", encoding='euc-kr') 


# Display dataset information
print(df_new.head())
print(f"Data Shape: {df_new.shape}")

   PERSON_ID  age_group_new  SEX residential_area incom_quantiles  \
0   10037780            6.8    1          2.rural           2.4-6   
1   10085147            5.7    1          1.urban          3.7-10   
2   10108000            3.5    2          2.rural          3.7-10   
3   10145452            3.5    2          2.rural          3.7-10   
4   10176539            4.6    2          2.rural          3.7-10   

  Duration_of_DM_diagnosis Medication_possession_rate           bmi  \
0                    2.4~7                     2.>=80  3. 23.0~24.9   
1                    2.4~7                     2.>=80        4.>=25   
2                    2.4~7                     2.>=80  3. 23.0~24.9   
3                   4.>=10                     2.>=80        4.>=25   
4                   3.7~10                     2.>=80        4.>=25   

  Fasting_blood_sugar Alcohol_consumption Physical_activity  \
0             2.>=100   2.1~4 times/month            1.None   
1             2.>=100      1.1 t

In [117]:
# Merge new data with existing data on PERSON_ID
df_combined = pd.merge(df_encoded, df_new, on='PERSON_ID', how='inner')

# Display dataset information
print(df_combined.head())
print(f"Data Shape: {df_combined.shape}")

# Display all column names and their respective data types
print("\nColumn Names and Data Types:")
for column in df_combined.columns:
    print(f"{column}: {df_combined[column].dtype}")

# Count the number of columns
print(f"\nTotal number of columns: {len(df_combined.columns)}")

   embedding_0  embedding_1  embedding_2  embedding_3  embedding_4  \
0     0.362635     0.739401     0.424431     0.637805     0.443612   
1     0.279705     0.633608     0.362313     0.494067     0.401619   
2     0.279705     0.633608     0.362313     0.494067     0.401619   
3     0.279705     0.633608     0.362313     0.494067     0.401619   
4     0.337995     0.641400     0.413080     0.680473     0.522285   

   embedding_5  embedding_6  embedding_7  embedding_8  embedding_9  ...  \
0     0.383196     0.410649     0.650798     0.446770     0.450827  ...   
1     0.328526     0.480369     0.508390     0.517885     0.450338  ...   
2     0.328526     0.480369     0.508390     0.517885     0.450338  ...   
3     0.328526     0.480369     0.508390     0.517885     0.450338  ...   
4     0.546734     0.325973     0.392710     0.540406     0.387972  ...   

   Medication_possession_rate           bmi  Fasting_blood_sugar  \
0                      2.>=80  3. 23.0~24.9              2.>

In [122]:
# Target 설정
target_columns = [
    'Cardiovascular_Disease', 'Neuropathy', 'Peripheral_Vascular_Disease',
    'Cerebrovascular_Disease', 'Eye_Disease', 'Kidney_Disease'
]

In [123]:
# Feature와 Target 분리
features = [col for col in df_combined.columns if col.startswith('embedding_')]  # Embedding 변수들
X = df_combined[features].values
y = df_combined[target_columns].values

In [125]:
from sklearn.preprocessing import LabelEncoder
# Categorical 변수 인코딩
categorical_columns = [
    'age_group_new', 'SEX', 'residential_area', 'incom_quantiles',
    'Medication_possession_rate', 'Alcohol_consumption', 'Physical_activity',
    'Systolic_blood_pressure', 'Diastolic_blood_pressure', 'Total_cholesterol',
    'Proteinuria', 'Smoking'
]
for col in categorical_columns:
    if col in df_combined.columns:
        le = LabelEncoder()
        df_combined[col] = le.fit_transform(df_combined[col].astype(str))

In [126]:
# Categorical Features 추가
categorical_features = df_combined[categorical_columns].values
X = np.hstack([X, categorical_features])  # 임베딩 + 추가 Feature 결합

In [131]:
print("Predicted class distribution in training data:")
print(np.unique(np.argmax(y_multiclass, axis=1), return_counts=True))


Predicted class distribution in training data:
(array([0], dtype=int64), array([220572], dtype=int64))


In [133]:
# 각 레이블에 대해 독립적인 TabNet 모델 학습
from sklearn.metrics import accuracy_score
models = {}
predictions = {}

for i, label in enumerate(target_columns):
    print(f"Training TabNet for label: {label}")
    
    # TabNetClassifier 초기화
    model = TabNetClassifier()
    model.fit(
        X_train=X,
        y_train=y_multiclass[:, i],  # 각 레이블의 값
        max_epochs=10,
        batch_size=1024,
        virtual_batch_size=128,
        num_workers=0,
        drop_last=False
    )
    # 모델 저장
    models[label] = model
    
    # 예측값 저장
    y_pred = model.predict(X)
    predictions[label] = y_pred
    print(f"Accuracy for {label}: {accuracy_score(y_multiclass[:, i], y_pred)}")

Training TabNet for label: Cardiovascular_Disease




epoch 0  | loss: 0.00015 |  0:00:11s
epoch 1  | loss: 2e-05   |  0:00:22s
epoch 2  | loss: 0.0     |  0:00:34s
epoch 3  | loss: 0.0     |  0:00:45s
epoch 4  | loss: 0.0     |  0:01:01s
epoch 5  | loss: 0.0     |  0:01:13s
epoch 6  | loss: 0.0     |  0:01:24s
epoch 7  | loss: 0.0     |  0:01:35s
epoch 8  | loss: 0.0     |  0:01:46s
epoch 9  | loss: 0.0     |  0:01:58s
Accuracy for Cardiovascular_Disease: 1.0
Training TabNet for label: Neuropathy




epoch 0  | loss: 0.59515 |  0:00:12s
epoch 1  | loss: 0.55435 |  0:00:24s
epoch 2  | loss: 0.54098 |  0:00:36s
epoch 3  | loss: 0.52846 |  0:00:48s
epoch 4  | loss: 0.50863 |  0:01:00s
epoch 5  | loss: 0.48959 |  0:01:12s
epoch 6  | loss: 0.47149 |  0:01:24s
epoch 7  | loss: 0.45193 |  0:01:36s
epoch 8  | loss: 0.42058 |  0:01:49s
epoch 9  | loss: 0.3884  |  0:02:02s
Accuracy for Neuropathy: 0.8421830513392452
Training TabNet for label: Peripheral_Vascular_Disease




IndexError: index 2 is out of bounds for axis 1 with size 2

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# 모델 평가 함수 정의
def evaluate_multiclass_to_multilabel(model, X, y_true_multilabel, mlb):
    """
    멀티클래스 모델을 평가하고 멀티라벨로 변환하여 성능을 평가합니다.
    Args:
        model: 학습된 TabNet 모델
        X: Feature 데이터
        y_true_multilabel: 실제 멀티라벨 데이터 (원본 형식)
        mlb: MultiLabelBinarizer 객체
    Returns:
        metrics: 전체 멀티라벨 평가 결과 딕셔너리
    """
    # 멀티클래스 예측
    y_pred_multiclass = model.predict(X)
    
    # 멀티라벨로 변환
    y_pred_multilabel = mlb.inverse_transform(y_pred_multiclass)
    y_pred_multilabel_binary = mlb.transform(y_pred_multilabel)
    y_true_binary = mlb.transform(y_true_multilabel)

    # 평가 지표 계산
    metrics = {
        "Accuracy": accuracy_score(y_true_binary, y_pred_multilabel_binary),
        "AUC": roc_auc_score(y_true_binary, model.predict_proba(X), average="macro"),
        "F1": f1_score(y_true_binary, y_pred_multilabel_binary, average="macro"),
        "Precision": precision_score(y_true_binary, y_pred_multilabel_binary, average="macro"),
        "Recall": recall_score(y_true_binary, y_pred_multilabel_binary, average="macro"),
    }
    return metrics

# 모델 성능 평가 실행
metrics = evaluate_multiclass_to_multilabel(tabnet_model, X, df_combined[target_columns].values.tolist(), mlb)
print("Model Evaluation Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")