## **food-drug interaction text 분류 nlp 모델 탐색**

##### 환경설정 및 기본 데이터 처리

In [1]:
!pip install PyTDC transformers torch pandas scikit-learn matplotlib seaborn
!pip install sentence-transformers plotly numpy



In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# TDC 라이브러리 import
try:
    from tdc.multi_pred import DDI
    from tdc.utils import get_label_map
    print("TDC library loaded successfully")
except ImportError:
    print("Installing TDC...")
    !pip install PyTDC
    from tdc.multi_pred import DDI
    from tdc.utils import get_label_map

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

TDC library loaded successfully
Using device: cuda


###### DrugBank와 TWOSIDES 라벨 정보 가져오기

In [3]:
print("Loading actual DrugBank and TWOSIDES data and labels from TDC...")

try:
    # DrugBank 데이터 로드
    print("Loading DrugBank dataset...")
    drugbank_data = DDI(name='DrugBank')
    drugbank_split = drugbank_data.get_split()

    # DrugBank 라벨 정보
    drugbank_labels = get_label_map(name='DrugBank', task='DDI')
    print(f"✓ DrugBank loaded: {len(drugbank_labels)} interaction types")

    # TWOSIDES 데이터 로드
    print("Loading TWOSIDES dataset...")
    twosides_data = DDI(name='TWOSIDES')
    twosides_split = twosides_data.get_split()

    # TWOSIDES 라벨 정보
    twosides_labels = get_label_map(name='TWOSIDES', task='DDI', name_column='Side Effect Name')
    print(f"✓ TWOSIDES loaded: {len(twosides_labels)} side effect types")

    tdc_loaded = True

except Exception as e:
    print(f"TDC loading failed: {e}")
    print("Falling back to manual label definition...")
    tdc_loaded = False

if tdc_loaded:
    print("\nDrugBank Labels (first 10):")
    for idx, (key, label) in enumerate(drugbank_labels.items()):
        if idx >= 10:
            break
        print(f"{idx}: {key} -> {label}")

    print(f"\nTWOSIDES Labels (first 10):")
    for idx, (key, label) in enumerate(twosides_labels.items()):
        if idx >= 10:
            break
        print(f"{idx}: {key} -> {label}")

    # DrugBank 데이터 구조 확인
    print(f"\nDrugBank data structure:")
    print(f"Train set: {len(drugbank_split['train'])}")
    print(f"Valid set: {len(drugbank_split['valid'])}")
    print(f"Test set: {len(drugbank_split['test'])}")
    print(f"Columns: {drugbank_split['train'].columns.tolist()}")

    # TWOSIDES 데이터 구조 확인
    print(f"\nTWOSIDES data structure:")
    print(f"Train set: {len(twosides_split['train'])}")
    print(f"Valid set: {len(twosides_split['valid'])}")
    print(f"Test set: {len(twosides_split['test'])}")
    print(f"Columns: {twosides_split['train'].columns.tolist()}")

    # 샘플 데이터 확인
    print(f"\nDrugBank sample data:")
    print(drugbank_split['train'].head(3))

    print(f"\nTWOSIDES sample data:")
    print(twosides_split['train'].head(3))

# else:
#     # Fallback: 수동으로 정의된 라벨 사용
#     print("Using manually defined labels...")
#     # 여기에 이전에 작성한 라벨 정의 코드 삽입

Downloading...


Loading actual DrugBank and TWOSIDES data and labels from TDC...
Loading DrugBank dataset...


100%|██████████| 44.4M/44.4M [00:01<00:00, 40.5MiB/s]
Loading...
Done!
Downloading...


✓ DrugBank loaded: 86 interaction types
Loading TWOSIDES dataset...


100%|██████████| 677M/677M [00:23<00:00, 28.7MiB/s]
Loading...
Done!


✓ TWOSIDES loaded: 1317 side effect types

DrugBank Labels (first 10):
0: 1 -> #Drug1 may increase the photosensitizing activities of #Drug2.
1: 2 -> #Drug1 may increase the anticholinergic activities of #Drug2.
2: 3 -> The bioavailability of #Drug2 can be decreased when combined with #Drug1.
3: 4 -> The metabolism of #Drug2 can be increased when combined with #Drug1.
4: 5 -> #Drug1 may decrease the vasoconstricting activities of #Drug2.
5: 6 -> #Drug1 may increase the anticoagulant activities of #Drug2.
6: 7 -> #Drug1 may increase the ototoxic activities of #Drug2.
7: 8 -> The therapeutic efficacy of #Drug2 can be increased when used in combination with #Drug1.
8: 9 -> #Drug1 may increase the hypoglycemic activities of #Drug2.
9: 10 -> #Drug1 may increase the antihypertensive activities of #Drug2.

TWOSIDES Labels (first 10):
0: 1024 -> hypermagnesemia
1: 767 -> retinopathy of prematurity
2: 79 -> atelectasis
3: 25 -> alkalosis
4: 85 -> Back Ache
5: 735 -> lung edema
6: 959 -> agitate

##### food-drug interaction 정보 테이블 불러오기

In [None]:
df = pd.read_csv('duplicate_interaction_text.csv')

print("Dataset Info:")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst 10 rows:")
print(df.head(10))

print("\nSample interaction texts:")
for i, text in enumerate(df['interaction_text'].head(5)):
    print(f"{i+1}. {text}")

Dataset Info:
Shape: (9682, 2)
Columns: ['interaction_text', 'duplicate_count']

First 10 rows:
                                    interaction_text  duplicate_count
0                                                NaN            10214
1   The potential adverse interactions included d...               64
2  The amino acid umbilical arteriovenous differe...               45
3   Pre-clinical and clinical studies showed that...               39
4   Urinary endogenous metabolites of trimethylam...               31
5   Heptanal, 3-methyl-2-butanone, dimethyl disul...               31
6   In FSGS patients, urinary levels of glucose, ...               28
7   For 9 nutrients, intake quintiles 4 or 5 (vs....               23
8   Patients receiving the high protein diet had ...               22
9   Compared with milk intake, cheese consumption...               21

Sample interaction texts:
1. nan
2.  The potential adverse interactions included decreased drug bioavailability (apple juice-fexofena

In [None]:
# 텍스트 전처리 (기본)
import re

def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = str(text).strip()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

df['cleaned_text'] = df['interaction_text'].apply(preprocess_text)
df = df[df['cleaned_text'].str.len() > 0].reset_index(drop=True)

print(f"After preprocessing: {df.shape[0]} interactions")

After preprocessing: 9681 interactions


##### interaction 정보 분류하기

In [4]:
df = pd.read_csv('fdi20250627.csv')

print("Dataset Info:")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst 10 rows:")
print(df.head(10))

print("\nSample interaction texts:")
for i, text in enumerate(df['interaction_text'].head(5)):
    print(f"{i+1}. {text}")

Dataset Info:
Shape: (66464, 19)
Columns: ['food', 'drug', 'interaction_text', 'drug_top1_similar_food', 'drug_top1_similar_food_score', 'drug_top1_similar_drug', 'drug_top1_similar_drug_score', 'food_top1_similar_food', 'food_top1_similar_food_score', 'food_top1_similar_drug', 'food_top1_similar_drug_score', 'food_real_type', 'food_score', 'drug_real_type', 'drug_score', 'food_valid', 'drug_valid', 'food_final_label', 'drug_final_label']

First 10 rows:
           food               drug  \
0  carbohydrate        abemaciclib   
1    grapefruit        abemaciclib   
2    grapefruit           fentanyl   
3    grapefruit           fentanyl   
4    grapefruit           fentanyl   
5    grapefruit          docetaxel   
6    grapefruit         paclitaxel   
7       ethanol  opioid analgesics   
8       ethanol   dextromethorphan   
9       calcium       tetracycline   

                                    interaction_text drug_top1_similar_food  \
0  a high-fat, high-calorie meal (800 to 10

In [7]:
# 텍스트 전처리 (기본)
import re

def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = str(text).strip()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

df['cleaned_text'] = df['interaction_text'].apply(preprocess_text)
df = df[df['cleaned_text'].str.len() > 0].reset_index(drop=True)

print(f"After preprocessing: {df.shape[0]} interactions")

After preprocessing: 66464 interactions


#### **1. BioBERT 모델 (+ cosine similarity)**

In [None]:
# 모델 로드
from transformers import AutoTokenizer, AutoModel
import torch

# BioBERT 모델 및 토크나이저 로딩
biobert_model_name = "dmis-lab/biobert-base-cased-v1.1"
biobert_tokenizer = AutoTokenizer.from_pretrained(biobert_model_name)
biobert_model = AutoModel.from_pretrained(biobert_model_name)
biobert_model.to(device)
biobert_model.eval()

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

##### 텍스트 → BioBERT 임베딩 벡터 변환

In [None]:
@torch.no_grad()
def get_biobert_embedding(text):
    encoded = biobert_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    output = biobert_model(input_ids=input_ids, attention_mask=attention_mask)
    # [CLS] token의 벡터 사용
    cls_embedding = output.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    return cls_embedding

In [None]:
# interaction_text
df['embedding'] = df['cleaned_text'].apply(get_biobert_embedding)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
# DrugBank 라벨 설명 임베딩
drugbank_label_texts = list(drugbank_labels.values())
drugbank_label_embeddings = [get_biobert_embedding(label) for label in drugbank_label_texts]

# TWOSIDES 라벨 설명 임베딩
twosides_label_texts = list(twosides_labels.values())
twosides_label_embeddings = [get_biobert_embedding(label) for label in twosides_label_texts]

In [None]:
for label_text, emb in zip(drugbank_label_texts[:3], drugbank_label_embeddings[:3]):
    print("Label:", label_text)
    print("Embedding preview:", emb[:10])  # 앞 10개 값만 출력
    print()

Label: #Drug1 may increase the photosensitizing activities of #Drug2.
Embedding preview: [ 0.33447364 -0.22339256 -0.1443544   0.13961108 -0.7427634   0.23301537
  0.23236266  0.1978987   0.37149185 -0.16987966]

Label: #Drug1 may increase the anticholinergic activities of #Drug2.
Embedding preview: [ 0.34164205 -0.24816418  0.1090633   0.16471739 -0.7009473  -0.09101138
  0.23194544 -0.06688289  0.35020083 -0.0451881 ]

Label: The bioavailability of #Drug2 can be decreased when combined with #Drug1.
Embedding preview: [ 0.34516507 -0.19422856 -0.07730879  0.198312   -0.64462763  0.15240484
 -0.15785773  0.12342464  0.25853068 -0.18494326]



##### 유사도 기반 라벨 할당 (cosine similarity)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def assign_label(text_embedding, label_embeddings, label_names):
    sims = cosine_similarity([text_embedding], label_embeddings)[0]
    top_idx = np.argmax(sims)
    return label_names[top_idx], sims[top_idx]

# DrugBank 라벨 할당
df['drugbank_label'], df['drugbank_similarity'] = zip(*df['embedding'].apply(
    lambda emb: assign_label(emb, drugbank_label_embeddings, drugbank_label_texts)))

# TWOSIDES 라벨 할당
df['twosides_label'], df['twosides_similarity'] = zip(*df['embedding'].apply(
    lambda emb: assign_label(emb, twosides_label_embeddings, twosides_label_texts)))

In [None]:
## 예시 결과 출력 ##
print("\n=== DrugBank Label Assignment ===")
print(df[['interaction_text', 'drugbank_label', 'drugbank_similarity']].head())

print("\n=== TWOSIDES Label Assignment ===")
print(df[['interaction_text', 'twosides_label', 'twosides_similarity']].head())


=== DrugBank Label Assignment ===
                                    interaction_text  \
0   The potential adverse interactions included d...   
1  The amino acid umbilical arteriovenous differe...   
2   Pre-clinical and clinical studies showed that...   
3   Urinary endogenous metabolites of trimethylam...   
4   Heptanal, 3-methyl-2-butanone, dimethyl disul...   

                                      drugbank_label  drugbank_similarity  
0  #Drug1 can cause an increase in the absorption...             0.893428  
1  The risk or severity of hyperkalemia can be in...             0.864807  
2  #Drug1 can cause an increase in the absorption...             0.889673  
3  The serum concentration of the active metaboli...             0.895592  
4  #Drug1 can cause an increase in the absorption...             0.886825  

=== TWOSIDES Label Assignment ===
                                    interaction_text  \
0   The potential adverse interactions included d...   
1  The amino acid umbilic

In [None]:
df_to_save = df[['interaction_text',
                 'drugbank_label', 'drugbank_similarity',
                 'twosides_label', 'twosides_similarity']]

output_path = 'BioBERT_labeling.csv'
df_to_save.to_csv(output_path, index=False)

#### **2. Sentence-BERT 생의학 모델 (+cosine similarity)**

In [5]:
from sentence_transformers import SentenceTransformer
import torch
model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')

# 기존 함수를 이렇게만 수정
@torch.no_grad()
def get_sentence_embedding(text):
    return model.encode(text)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/388 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
df = pd.read_csv('fdi20250627.csv')
df['cleaned_text'] = df['interaction_text'].apply(preprocess_text)
df = df[df['cleaned_text'].str.len() > 0].reset_index(drop=True)

In [9]:
# interaction_text
df['embedding'] = df['cleaned_text'].apply(get_sentence_embedding)

In [10]:
# DrugBank 라벨 설명 임베딩
drugbank_label_texts = list(drugbank_labels.values())
drugbank_label_embeddings = [get_sentence_embedding(label) for label in drugbank_label_texts]

In [11]:
for label_text, emb in zip(drugbank_label_texts[:3], drugbank_label_embeddings[:3]):
    print("Label:", label_text)
    print("Embedding preview:", emb[:10])  # 앞 10개 값만 출력
    print()

Label: #Drug1 may increase the photosensitizing activities of #Drug2.
Embedding preview: [-0.5471126  -0.3281939  -0.16475488 -0.8327715  -0.0886354   0.06130435
 -0.59558153  0.5961103   0.8648376   0.03469231]

Label: #Drug1 may increase the anticholinergic activities of #Drug2.
Embedding preview: [-0.45517865 -0.15426844 -0.04830934 -0.81305027 -0.27434093  0.30267838
 -0.78655225  0.45710558  0.7148244   0.01056521]

Label: The bioavailability of #Drug2 can be decreased when combined with #Drug1.
Embedding preview: [-6.8829590e-01 -1.5069562e-01  2.9387219e-02 -7.9448247e-01
  8.2549185e-02 -3.6148063e-04 -6.0688841e-01  4.2983505e-01
  5.0721312e-01  1.0511621e-01]



In [12]:
from sklearn.metrics.pairwise import cosine_similarity

def assign_label(text_embedding, label_embeddings, label_names):
    sims = cosine_similarity([text_embedding], label_embeddings)[0]
    top_idx = np.argmax(sims)
    return label_names[top_idx], sims[top_idx]

# DrugBank 라벨 할당
df['interaction_label'], df['interaction_similarity'] = zip(*df['embedding'].apply(
    lambda emb: assign_label(emb, drugbank_label_embeddings, drugbank_label_texts)))

df['interaction_label_id'] = df['interaction_label'].map({label_text: label_id for label_id, label_text in drugbank_labels.items()})


In [13]:
## 예시 결과 출력 ##
print("\n=== DrugBank Label Assignment ===")
print(df.head())


=== DrugBank Label Assignment ===
           food         drug  \
0  carbohydrate  abemaciclib   
1    grapefruit  abemaciclib   
2    grapefruit     fentanyl   
3    grapefruit     fentanyl   
4    grapefruit     fentanyl   

                                    interaction_text drug_top1_similar_food  \
0  a high-fat, high-calorie meal (800 to 1000 cal...             ['Endive']   
1  patients should avoid consumption of grapefrui...             ['Endive']   
2  generally avoid:  consumption of grapefruit ju...               ['Teff']   
3  certain compounds present in grapefruit are kn...               ['Teff']   
4  due to a high degree of interpatient variabili...               ['Teff']   

   drug_top1_similar_food_score drug_top1_similar_drug  \
0                      0.510642        ['Abemaciclib']   
1                      0.510642        ['Abemaciclib']   
2                      0.495075           ['Fentanyl']   
3                      0.495075           ['Fentanyl']   
4      

In [17]:
df_to_save = df.drop(columns=['cleaned_text', 'embedding'])
output_path = 'fdi_labeling_final.csv'
df_to_save.to_csv(output_path, index=False)

#### **3. ClinicalBERT (+cosine similarity)**

In [None]:
# 모델 로드
from transformers import AutoTokenizer, AutoModel
import torch

# 모델 및 토크나이저 로딩
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(device)
model.eval()

@torch.no_grad()
def get_embedding(text):
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    output = model(input_ids=input_ids, attention_mask=attention_mask)
    # [CLS] token의 벡터 사용
    cls_embedding = output.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    return cls_embedding

# interaction_text
df['embedding'] = df['cleaned_text'].apply(get_biobert_embedding)

# DrugBank 라벨 설명 임베딩
drugbank_label_texts = list(drugbank_labels.values())
drugbank_label_embeddings = [get_embedding(label) for label in drugbank_label_texts]

# TWOSIDES 라벨 설명 임베딩
twosides_label_texts = list(twosides_labels.values())
twosides_label_embeddings = [get_embedding(label) for label in twosides_label_texts]

for label_text, emb in zip(drugbank_label_texts[:3], drugbank_label_embeddings[:3]):
    print("Label:", label_text)
    print("Embedding preview:", emb[:10])  # 앞 10개 값만 출력
    print()

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Label: #Drug1 may increase the photosensitizing activities of #Drug2.
Embedding preview: [ 0.2607462   0.33125526 -0.26625833  0.25910857 -0.2790016  -0.0742676
  0.34899     0.31482977  0.59320915 -0.06175745]

Label: #Drug1 may increase the anticholinergic activities of #Drug2.
Embedding preview: [ 0.17830761  0.37681386 -0.1837425   0.28004545 -0.26192224 -0.2732479
  0.3436645   0.20051081  0.52868694 -0.10334631]

Label: The bioavailability of #Drug2 can be decreased when combined with #Drug1.
Embedding preview: [ 0.3713369   0.15402023 -0.25674215  0.28145298 -0.22235376 -0.14847584
  0.36200842  0.04496166  0.58611244  0.01631697]



In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def assign_label(text_embedding, label_embeddings, label_names):
    sims = cosine_similarity([text_embedding], label_embeddings)[0]
    top_idx = np.argmax(sims)
    return label_names[top_idx], sims[top_idx]

# DrugBank 라벨 할당
df['drugbank_label'], df['drugbank_similarity'] = zip(*df['embedding'].apply(
    lambda emb: assign_label(emb, drugbank_label_embeddings, drugbank_label_texts)))

# TWOSIDES 라벨 할당
df['twosides_label'], df['twosides_similarity'] = zip(*df['embedding'].apply(
    lambda emb: assign_label(emb, twosides_label_embeddings, twosides_label_texts)))

## 예시 결과 출력 ##
print("\n=== DrugBank Label Assignment ===")
print(df[['interaction_text', 'drugbank_label', 'drugbank_similarity']].head())

print("\n=== TWOSIDES Label Assignment ===")
print(df[['interaction_text', 'twosides_label', 'twosides_similarity']].head())

df_to_save = df[['interaction_text',
                 'drugbank_label', 'drugbank_similarity',
                 'twosides_label', 'twosides_similarity']]

output_path = 'ClinicalBERT_labeling.csv'
df_to_save.to_csv(output_path, index=False)


=== DrugBank Label Assignment ===
                                    interaction_text  \
0   The potential adverse interactions included d...   
1  The amino acid umbilical arteriovenous differe...   
2   Pre-clinical and clinical studies showed that...   
3   Urinary endogenous metabolites of trimethylam...   
4   Heptanal, 3-methyl-2-butanone, dimethyl disul...   

                                      drugbank_label  drugbank_similarity  
0  #Drug1 may increase the QTc-prolonging activit...             0.818306  
1  #Drug1 may increase the hyperkalemic activitie...             0.823723  
2  #Drug1 may increase the antihypertensive activ...             0.830057  
3  #Drug1 may increase the QTc-prolonging activit...             0.818823  
4  #Drug1 may increase the cardiotoxic activities...             0.812907  

=== TWOSIDES Label Assignment ===
                                    interaction_text           twosides_label  \
0   The potential adverse interactions included d...    

#### **4. PubMedBERT (+cosine similarity)**

In [None]:
# 모델 로드
from transformers import AutoTokenizer, AutoModel
import torch

# 모델 및 토크나이저 로딩
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(device)
model.eval()

@torch.no_grad()
def get_embedding(text):
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    output = model(input_ids=input_ids, attention_mask=attention_mask)
    # [CLS] token의 벡터 사용
    cls_embedding = output.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    return cls_embedding

# interaction_text
df['embedding'] = df['cleaned_text'].apply(get_biobert_embedding)

# DrugBank 라벨 설명 임베딩
drugbank_label_texts = list(drugbank_labels.values())
drugbank_label_embeddings = [get_embedding(label) for label in drugbank_label_texts]

# TWOSIDES 라벨 설명 임베딩
twosides_label_texts = list(twosides_labels.values())
twosides_label_embeddings = [get_embedding(label) for label in twosides_label_texts]

for label_text, emb in zip(drugbank_label_texts[:3], drugbank_label_embeddings[:3]):
    print("Label:", label_text)
    print("Embedding preview:", emb[:10])  # 앞 10개 값만 출력
    print()

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Label: #Drug1 may increase the photosensitizing activities of #Drug2.
Embedding preview: [-0.3186272   0.21600634  0.2626788   0.07240072 -0.12932958  0.39236966
 -0.3744542   0.07788904  0.3333059   0.257255  ]

Label: #Drug1 may increase the anticholinergic activities of #Drug2.
Embedding preview: [-0.32251152  0.30217293  0.22749284  0.02609112 -0.015716    0.49862027
 -0.42958838 -0.09925223  0.29201743  0.01394986]

Label: The bioavailability of #Drug2 can be decreased when combined with #Drug1.
Embedding preview: [-0.21109666  0.10353931  0.16873378  0.04902624 -0.0933973   0.21762674
 -0.22808051 -0.04737537  0.09764458  0.03450186]



In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def assign_label(text_embedding, label_embeddings, label_names):
    sims = cosine_similarity([text_embedding], label_embeddings)[0]
    top_idx = np.argmax(sims)
    return label_names[top_idx], sims[top_idx]

# DrugBank 라벨 할당
df['drugbank_label'], df['drugbank_similarity'] = zip(*df['embedding'].apply(
    lambda emb: assign_label(emb, drugbank_label_embeddings, drugbank_label_texts)))

# TWOSIDES 라벨 할당
df['twosides_label'], df['twosides_similarity'] = zip(*df['embedding'].apply(
    lambda emb: assign_label(emb, twosides_label_embeddings, twosides_label_texts)))

## 예시 결과 출력 ##
print("\n=== DrugBank Label Assignment ===")
print(df[['interaction_text', 'drugbank_label', 'drugbank_similarity']].head())

print("\n=== TWOSIDES Label Assignment ===")
print(df[['interaction_text', 'twosides_label', 'twosides_similarity']].head())

df_to_save = df[['interaction_text',
                 'drugbank_label', 'drugbank_similarity',
                 'twosides_label', 'twosides_similarity']]

output_path = 'PubMedBERT_labeling.csv'
df_to_save.to_csv(output_path, index=False)


=== DrugBank Label Assignment ===
                                    interaction_text  \
0   The potential adverse interactions included d...   
1  The amino acid umbilical arteriovenous differe...   
2   Pre-clinical and clinical studies showed that...   
3   Urinary endogenous metabolites of trimethylam...   
4   Heptanal, 3-methyl-2-butanone, dimethyl disul...   

                                      drugbank_label  drugbank_similarity  
0  #Drug1 may increase the nephrotoxic activities...            -0.004716  
1  #Drug1 may increase the nephrotoxic activities...             0.034324  
2  #Drug1 may increase the nephrotoxic activities...             0.013782  
3  #Drug1 may increase the nephrotoxic activities...             0.012956  
4  #Drug1 may increase the nephrotoxic activities...             0.028282  

=== TWOSIDES Label Assignment ===
                                    interaction_text        twosides_label  \
0   The potential adverse interactions included d...       

#### **5. BlueBERT (+cosine similarity)**

In [None]:
# 모델 로드
from transformers import AutoTokenizer, AutoModel
import torch

# 모델 및 토크나이저 로딩
model_name = "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(device)
model.eval()

@torch.no_grad()
def get_embedding(text):
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    output = model(input_ids=input_ids, attention_mask=attention_mask)
    # [CLS] token의 벡터 사용
    cls_embedding = output.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    return cls_embedding

# interaction_text
df['embedding'] = df['cleaned_text'].apply(get_biobert_embedding)

# DrugBank 라벨 설명 임베딩
drugbank_label_texts = list(drugbank_labels.values())
drugbank_label_embeddings = [get_embedding(label) for label in drugbank_label_texts]

# TWOSIDES 라벨 설명 임베딩
twosides_label_texts = list(twosides_labels.values())
twosides_label_embeddings = [get_embedding(label) for label in twosides_label_texts]

for label_text, emb in zip(drugbank_label_texts[:3], drugbank_label_embeddings[:3]):
    print("Label:", label_text)
    print("Embedding preview:", emb[:10])  # 앞 10개 값만 출력
    print()

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/441M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Label: #Drug1 may increase the photosensitizing activities of #Drug2.
Embedding preview: [ 0.07619784  0.0434857   0.10484701 -0.05733407 -0.20592695 -0.1261569
  0.09511004 -0.12450962  0.25185153 -0.0672551 ]

Label: #Drug1 may increase the anticholinergic activities of #Drug2.
Embedding preview: [ 0.1127034   0.01084386  0.07241394 -0.06721445 -0.14801283 -0.135088
  0.15414937  0.10174806  0.33611625 -0.03462356]

Label: The bioavailability of #Drug2 can be decreased when combined with #Drug1.
Embedding preview: [ 0.07064604  0.02008527  0.04434816  0.13209298  0.06606337  0.22357737
  0.06452335 -0.03771364  0.23963027 -0.1461406 ]



In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def assign_label(text_embedding, label_embeddings, label_names):
    sims = cosine_similarity([text_embedding], label_embeddings)[0]
    top_idx = np.argmax(sims)
    return label_names[top_idx], sims[top_idx]

# DrugBank 라벨 할당
df['drugbank_label'], df['drugbank_similarity'] = zip(*df['embedding'].apply(
    lambda emb: assign_label(emb, drugbank_label_embeddings, drugbank_label_texts)))

# TWOSIDES 라벨 할당
df['twosides_label'], df['twosides_similarity'] = zip(*df['embedding'].apply(
    lambda emb: assign_label(emb, twosides_label_embeddings, twosides_label_texts)))

## 예시 결과 출력 ##
print("\n=== DrugBank Label Assignment ===")
print(df[['interaction_text', 'drugbank_label', 'drugbank_similarity']].head())

print("\n=== TWOSIDES Label Assignment ===")
print(df[['interaction_text', 'twosides_label', 'twosides_similarity']].head())

df_to_save = df[['interaction_text',
                 'drugbank_label', 'drugbank_similarity',
                 'twosides_label', 'twosides_similarity']]

output_path = 'BlueBERT_labeling.csv'
df_to_save.to_csv(output_path, index=False)


=== DrugBank Label Assignment ===
                                    interaction_text  \
0   The potential adverse interactions included d...   
1  The amino acid umbilical arteriovenous differe...   
2   Pre-clinical and clinical studies showed that...   
3   Urinary endogenous metabolites of trimethylam...   
4   Heptanal, 3-methyl-2-butanone, dimethyl disul...   

                                      drugbank_label  drugbank_similarity  
0  The serum concentration of the active metaboli...             0.019123  
1  #Drug1 may increase the respiratory depressant...             0.011852  
2  #Drug1 may increase the respiratory depressant...             0.014564  
3  #Drug1 may increase the respiratory depressant...             0.009273  
4  #Drug1 may increase the respiratory depressant...             0.019189  

=== TWOSIDES Label Assignment ===
                                    interaction_text      twosides_label  \
0   The potential adverse interactions included d...     brea

In [None]:
import pandas as pd

# CSV 파일 불러오기
df = pd.read_csv('SentenceBERT_labeling.csv')

# 'drugbank_similarity' 컬럼에서 0.90 이상인 값들의 개수 세기
count_high_similarity = (df['drugbank_similarity'] >= 0.85).sum()

# 결과 출력
print(f"drugbank_similarity 값이 0.85 이상인 행의 개수: {count_high_similarity}")

drugbank_similarity 값이 0.85 이상인 행의 개수: 9675


In [None]:
df.shape[0]

9681