In [1]:
import torch
import pandas as pd
import numpy as np
import glob
import os
import gc
from torch_geometric.data import HeteroData
from sklearn.preprocessing import LabelEncoder

# --- C·∫§U H√åNH ---
input_dir = r"C:\Users\dotie\OneDrive\Documents\FAERS\Processed_Chunks"
all_files = glob.glob(os.path.join(input_dir, "processed_final_*.csv"))

# --- 1. X√ÇY D·ª∞NG T·ª™ ƒêI·ªÇN ID (MAPPING) ---
# ƒê·ªÉ ti·∫øt ki·ªám RAM, ta ch·ªâ qu√©t t√™n c·ªôt (v·ªën ch·ª©a t√™n thu·ªëc/b·ªánh) ch·ª© kh√¥ng ƒë·ªçc to√†n b·ªô d·ªØ li·ªáu
print("--- B∆∞·ªõc 1: Qu√©t t√™n thu·ªëc v√† ph·∫£n ·ª©ng ƒë·ªÉ t·∫°o ID ---")

# ƒê·ªçc file ƒë·∫ßu ti√™n ƒë·ªÉ l·∫•y danh s√°ch c·ªôt (V√¨ c√°c file ƒë√£ ƒë·ªìng b·ªô c·ªôt)
df_cols = pd.read_csv(all_files[0], nrows=0) 
drug_names = [c for c in df_cols.columns if c.startswith('x ') and c not in ['x Age', 'x Gender', 'x AgeGroup']]
reaction_names = [c for c in df_cols.columns if c.startswith('y ')]

print(f"S·ªë l∆∞·ª£ng thu·ªëc: {len(drug_names)}")
print(f"S·ªë l∆∞·ª£ng ph·∫£n ·ª©ng: {len(reaction_names)}")

# --- 2. X√ÇY D·ª∞NG DANH S√ÅCH C·∫†NH (EDGE LIST) ---
print("--- B∆∞·ªõc 2: ƒê·ªçc d·ªØ li·ªáu v√† t·∫°o danh s√°ch c·∫°nh ---")

# Kh·ªüi t·∫°o list ch·ª©a c·∫°nh
src_patient_drug = []
dst_patient_drug = []
src_patient_react = []
dst_patient_react = []

patient_features_list = []
global_patient_id = 0 # ID b·ªánh nh√¢n tƒÉng d·∫ßn

for f in all_files:
    # ƒê·ªçc t·ª´ng chunk
    print(f"Processing: {os.path.basename(f)}")
    df = pd.read_csv(f)
    
    num_rows = len(df)
    local_patient_ids = np.arange(num_rows) + global_patient_id
    
    # 1. L·∫•y ƒë·∫∑c tr∆∞ng b·ªánh nh√¢n (Age, Gender)
    # Normalize Age: chia 100
    p_feats = df[['x Age', 'x Gender']].values
    p_feats[:, 0] = p_feats[:, 0] / 100.0 
    patient_features_list.append(p_feats)
    
    # 2. T·∫°o c·∫°nh Patient -> Drug
    # L·∫•y ma tr·∫≠n con c·ªßa thu·ªëc (d·∫°ng dense)
    # L∆∞u √Ω: N·∫øu m√°y y·∫øu, ƒëo·∫°n n√†y c√≥ th·ªÉ l√†m t·ª´ng d√≤ng, nh∆∞ng numpy vectorization nhanh h∆°n
    drug_matrix = df[drug_names].values
    # T√¨m c√°c v·ªã tr√≠ thu·ªëc > 0
    row_indices, col_indices = np.where(drug_matrix > 0)
    
    # Map row index local -> global patient ID
    src_patient_drug.append(local_patient_ids[row_indices])
    dst_patient_drug.append(col_indices) # col_indices ch√≠nh l√† ID c·ªßa thu·ªëc (do th·ª© t·ª± c·ªôt c·ªë ƒë·ªãnh)
    
    # 3. T·∫°o c·∫°nh Patient -> Reaction
    react_matrix = df[reaction_names].values
    row_indices_r, col_indices_r = np.where(react_matrix > 0)
    
    src_patient_react.append(local_patient_ids[row_indices_r])
    dst_patient_react.append(col_indices_r)
    
    global_patient_id += num_rows
    
    # D·ªçn RAM
    del df, drug_matrix, react_matrix
    gc.collect()

# G·ªôp c√°c list l·∫°i th√†nh numpy array l·ªõn
print("ƒêang gh√©p n·ªëi d·ªØ li·ªáu...")
patient_features = np.vstack(patient_features_list)
edge_index_drug = np.vstack([np.concatenate(src_patient_drug), np.concatenate(dst_patient_drug)])
edge_index_react = np.vstack([np.concatenate(src_patient_react), np.concatenate(dst_patient_react)])

# --- 3. T·∫†O HETERODATA ---
print("--- B∆∞·ªõc 3: T·∫°o Graph Object ---")
data = HeteroData()

# Nodes
data['patient'].x = torch.from_numpy(patient_features).float()
data['patient'].num_nodes = global_patient_id
# Drug & Reaction kh√¥ng c·∫ßn features input, ta s·∫Ω d√πng Embedding Layer trong model
data['drug'].num_nodes = len(drug_names)
data['reaction'].num_nodes = len(reaction_names)

# Edges (Chuy·ªÉn sang LongTensor)
data['patient', 'takes', 'drug'].edge_index = torch.from_numpy(edge_index_drug).long()
data['patient', 'has_reaction', 'reaction'].edge_index = torch.from_numpy(edge_index_react).long()

# T·∫°o c·∫°nh ng∆∞·ª£c (Reverse edges) cho GNN
data['drug', 'taken_by', 'patient'].edge_index = torch.flip(data['patient', 'takes', 'drug'].edge_index, [0])
data['reaction', 'reaction_in', 'patient'].edge_index = torch.flip(data['patient', 'has_reaction', 'reaction'].edge_index, [0])

print("Th√¥ng tin ƒë·ªì th·ªã:", data)

# L∆∞u xu·ªëng ·ªï c·ª©ng ƒë·ªÉ d√πng cho file train
torch.save(data, "hetero_graph_data.pt")
print("‚úÖ ƒê√£ l∆∞u file 'hetero_graph_data.pt'. Xong ph·∫ßn chu·∫©n b·ªã!")

  from .autonotebook import tqdm as notebook_tqdm


--- B∆∞·ªõc 1: Qu√©t t√™n thu·ªëc v√† ph·∫£n ·ª©ng ƒë·ªÉ t·∫°o ID ---
S·ªë l∆∞·ª£ng thu·ªëc: 5259
S·ªë l∆∞·ª£ng ph·∫£n ·ª©ng: 10488
--- B∆∞·ªõc 2: ƒê·ªçc d·ªØ li·ªáu v√† t·∫°o danh s√°ch c·∫°nh ---
Processing: processed_final_1.csv
Processing: processed_final_10.csv
Processing: processed_final_100.csv
Processing: processed_final_101.csv
Processing: processed_final_102.csv
Processing: processed_final_103.csv
Processing: processed_final_104.csv
Processing: processed_final_105.csv
Processing: processed_final_106.csv
Processing: processed_final_107.csv
Processing: processed_final_108.csv
Processing: processed_final_109.csv
Processing: processed_final_11.csv
Processing: processed_final_110.csv
Processing: processed_final_111.csv
Processing: processed_final_112.csv
Processing: processed_final_113.csv
Processing: processed_final_114.csv
Processing: processed_final_115.csv
Processing: processed_final_116.csv
Processing: processed_final_117.csv
Processing: processed_final_118.csv
Processing: pr

In [1]:
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
import glob
import os
import gc
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv, to_hetero, Linear

# ---------------------------------------------------------
# 1. C·∫§U H√åNH & CHU·∫®N B·ªä
# ---------------------------------------------------------
input_dir = r"C:\Users\dotie\OneDrive\Documents\FAERS\Processed_Chunks"
all_files = glob.glob(os.path.join(input_dir, "processed_final_*.csv"))

print(f"T√¨m th·∫•y {len(all_files)} file d·ªØ li·ªáu.")

# ƒê·ªçc file ƒë·∫ßu ti√™n ƒë·ªÉ l·∫•y th√¥ng tin k√≠ch th∆∞·ªõc (S·ªë l∆∞·ª£ng thu·ªëc/ph·∫£n ·ª©ng to√†n c·ª•c)
df_temp = pd.read_csv(all_files[0], nrows=1)
drug_cols = [c for c in df_temp.columns if c.startswith('x ') and c not in ['x Age', 'x Gender', 'x AgeGroup']]
reaction_cols = [c for c in df_temp.columns if c.startswith('y ')]
num_drugs = len(drug_cols)
num_reactions = len(reaction_cols)

print(f"S·ªë l∆∞·ª£ng thu·ªëc (Input Nodes): {num_drugs}")
print(f"S·ªë l∆∞·ª£ng ph·∫£n ·ª©ng (Output Nodes): {num_reactions}")

# ---------------------------------------------------------
# 2. ƒê·ªäNH NGHƒ®A MODEL
# ---------------------------------------------------------
class HeteroSage(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, num_layers=2):
        super().__init__()
        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = SAGEConv((-1, -1), hidden_channels)
            self.convs.append(conv)

    def forward(self, x, edge_index):
        for conv in self.convs:
            x = conv(x, edge_index).relu()
        return x

# Metadata gi·∫£ l·∫≠p ƒë·ªÉ kh·ªüi t·∫°o model (c·∫•u tr√∫c ƒë·ªì th·ªã)
# Ta c·∫ßn khai b√°o c√°c lo·∫°i node v√† edge s·∫Ω xu·∫•t hi·ªán
metadata = (
    ['patient', 'drug', 'reaction'],
    [('patient', 'takes', 'drug'), 
     ('drug', 'taken_by', 'patient'),
     ('patient', 'has_reaction', 'reaction'),
     ('reaction', 'reaction_in', 'patient')]
)

model = HeteroSage(hidden_channels=64, out_channels=64, num_layers=2)
model = to_hetero(model, metadata, aggr='sum')

# Embedding layers cho Drug v√† Reaction
# +1 ƒë·ªÉ d·ª± ph√≤ng index
drug_emb = torch.nn.Embedding(num_drugs + 1, 64)
react_emb = torch.nn.Embedding(num_reactions + 1, 64)
# Linear layer cho Patient (Age, Gender)
patient_lin = Linear(2, 64)

optimizer = torch.optim.Adam(
    list(model.parameters()) + list(drug_emb.parameters()) + list(react_emb.parameters()) + list(patient_lin.parameters()), 
    lr=0.01
)

# ---------------------------------------------------------
# 3. H√ÄM CHUY·ªÇN ƒê·ªîI DATAFRAME -> GRAPH (Mini-Graph)
# ---------------------------------------------------------
def create_mini_graph(df):
    # 1. Patient Features
    p_feats = df[['x Age', 'x Gender']].values
    p_feats[:, 0] = p_feats[:, 0] / 100.0 
    x_patient = torch.tensor(p_feats, dtype=torch.float)
    
    # 2. Edges: Patient -> Drug
    # D√πng numpy where ƒë·ªÉ t√¨m c√°c √¥ c√≥ gi√° tr·ªã > 0 (ng∆∞·ªùi d√πng thu·ªëc)
    # drug_cols ƒë√£ ƒë∆∞·ª£c sort v√† c·ªë ƒë·ªãnh v·ªã tr√≠ ·ªü b∆∞·ªõc chu·∫©n b·ªã
    drug_matrix = df[drug_cols].values
    rows, cols = np.where(drug_matrix > 0)
    edge_index_drug = torch.tensor([rows, cols], dtype=torch.long)
    
    # 3. Edges: Patient -> Reaction
    react_matrix = df[reaction_cols].values
    rows_r, cols_r = np.where(react_matrix > 0)
    edge_index_react = torch.tensor([rows_r, cols_r], dtype=torch.long)
    
    # T·∫°o HeteroData
    data = HeteroData()
    data['patient'].x = x_patient
    data['patient'].num_nodes = len(df)
    
    # G√°n Embedding (ID) cho thu·ªëc/ph·∫£n ·ª©ng
    # ·ªû ƒë√¢y ta truy·ªÅn v√†o indices t·ª´ 0 -> num_drugs ƒë·ªÉ Embedding layer tra c·ª©u
    data['drug'].x = torch.arange(num_drugs)
    data['reaction'].x = torch.arange(num_reactions)
    
    data['patient', 'takes', 'drug'].edge_index = edge_index_drug
    data['patient', 'has_reaction', 'reaction'].edge_index = edge_index_react
    
    # C·∫°nh ng∆∞·ª£c
    data['drug', 'taken_by', 'patient'].edge_index = torch.flip(edge_index_drug, [0])
    data['reaction', 'reaction_in', 'patient'].edge_index = torch.flip(edge_index_react, [0])
    
    return data

# ---------------------------------------------------------
# 4. TRAINING LOOP (Chunk-based)
# ---------------------------------------------------------
print("B·∫Øt ƒë·∫ßu training (File-by-File)...")

def train_one_epoch():
    model.train()
    total_loss = 0
    total_examples = 0
    
    # Bi·∫øn ƒë·ªÉ t√≠nh metrics to√†n c·ª•c cho Epoch
    total_tp = 0 # True Positives
    total_fp = 0 # False Positives
    total_fn = 0 # False Negatives
    
    # Duy·ªát qua t·ª´ng file CSV
    for i, f_path in enumerate(all_files):
        try:
            df_chunk = pd.read_csv(f_path)
        except:
            continue
            
        batch = create_mini_graph(df_chunk)
        optimizer.zero_grad()
        
        # --- Forward Pass ---
        x_dict = {
            'patient': patient_lin(batch['patient'].x),
            'drug': drug_emb(batch['drug'].x),
            'reaction': react_emb(batch['reaction'].x)
        }
        
        out = model(x_dict, batch.edge_index_dict)
        
        edge_index = batch['patient', 'has_reaction', 'reaction'].edge_index
        if edge_index.numel() == 0: continue

        src_emb = out['patient'][edge_index[0]]
        dst_emb = out['reaction'][edge_index[1]]
        
        # --- T√≠nh ƒëi·ªÉm (Logits) ---
        pos_score = (src_emb * dst_emb).sum(dim=-1) # Logits cho m·∫´u D∆∞∆°ng (C√≥ th·∫≠t)
        
        neg_dst_idx = torch.randint(0, num_reactions, (len(pos_score),))
        neg_dst_emb = out['reaction'][neg_dst_idx]
        neg_score = (src_emb * neg_dst_emb).sum(dim=-1) # Logits cho m·∫´u √Çm (Gi·∫£)
        
        # --- T√≠nh Loss ---
        scores = torch.cat([pos_score, neg_score])
        labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
        
        loss = F.binary_cross_entropy_with_logits(scores, labels)
        loss.backward()
        optimizer.step()
        
        # --- T√çNH TO√ÅN METRICS CHO BATCH N√ÄY ---
        # Ng∆∞·ª°ng ph√¢n lo·∫°i l√† 0 (v√¨ Sigmoid(0) = 0.5)
        # S·ª≠ d·ª•ng torch.no_grad() ƒë·ªÉ kh√¥ng ·∫£nh h∆∞·ªüng ƒë·∫øn gradient
        with torch.no_grad():
            # TP: M·∫´u d∆∞∆°ng c√≥ ƒëi·ªÉm d·ª± ƒëo√°n > 0
            tp = (pos_score > 0).sum().item()
            # FN: M·∫´u d∆∞∆°ng c√≥ ƒëi·ªÉm d·ª± ƒëo√°n <= 0
            fn = (pos_score <= 0).sum().item()
            # FP: M·∫´u √¢m c√≥ ƒëi·ªÉm d·ª± ƒëo√°n > 0
            fp = (neg_score > 0).sum().item()
            
            total_tp += tp
            total_fp += fp
            total_fn += fn
        
        batch_size = len(df_chunk)
        total_loss += loss.item() * batch_size
        total_examples += batch_size
        
        del df_chunk, batch
        
        if i % 10 == 0:
            print(f"  Processed {i}/{len(all_files)} files. Loss: {loss.item():.4f}")

    # --- T√çNH METRICS T·ªîNG H·ª¢P CU·ªêI EPOCH ---
    epsilon = 1e-9 # S·ªë nh·ªè ƒë·ªÉ tr√°nh l·ªói chia cho 0
    
    # 1. Precision = TP / (TP + FP)
    # (Trong s·ªë nh·ªØng c√°i m√°y ƒëo√°n l√† c√≥ ph·∫£n ·ª©ng, bao nhi√™u % l√† ƒë√∫ng?)
    precision = total_tp / (total_tp + total_fp + epsilon)
    
    # 2. Recall = TP / (TP + FN)
    # (Trong s·ªë c√°c ph·∫£n ·ª©ng th·ª±c t·∫ø, m√°y b·∫Øt ƒë∆∞·ª£c bao nhi√™u %?)
    recall = total_tp / (total_tp + total_fn + epsilon)
    
    # 3. F1-Score = 2 * (P * R) / (P + R)
    f1 = 2 * (precision * recall) / (precision + recall + epsilon)
    
    avg_loss = total_loss / (total_examples + epsilon)
    
    return avg_loss, precision, recall, f1

    # Ch·∫°y Training
for epoch in range(1, 2): # V√≠ d·ª• ch·∫°y 10 Epochs
    print(f"\n================ EPOCH {epoch} ================")
    
    # G·ªçi h√†m train v√† nh·∫≠n v·ªÅ 4 gi√° tr·ªã
    avg_loss, prec, rec, f1 = train_one_epoch()
    
    print(f"‚úÖ K·∫æT TH√öC EPOCH {epoch}")
    print(f"   - Average Loss: {avg_loss:.4f}")
    print(f"   - Precision:    {prec:.4f}")
    print(f"   - Recall:       {rec:.4f}")
    print(f"   - F1-Score:     {f1:.4f}")

print("\nüéâ ƒê√£ train xong! (Model saved)")

# ---------------------------------------------------------
# 5. L∆ØU MODEL & EMBEDDINGS
# ---------------------------------------------------------
torch.save(model.state_dict(), "HGNN_Model.pth")
torch.save(drug_emb.state_dict(), "Drug_Embeddings.pth")
torch.save(react_emb.state_dict(), "Reaction_Embeddings.pth")
torch.save(patient_lin.state_dict(), "Patient_Encoder.pth")
# L∆∞u danh s√°ch t√™n thu·ªëc/ph·∫£n ·ª©ng ƒë·ªÉ map l·∫°i sau n√†y
import joblib
joblib.dump(drug_cols, "Drug_Names.pkl")
joblib.dump(reaction_cols, "Reaction_Names.pkl")

  from .autonotebook import tqdm as notebook_tqdm


T√¨m th·∫•y 133 file d·ªØ li·ªáu.
S·ªë l∆∞·ª£ng thu·ªëc (Input Nodes): 5259
S·ªë l∆∞·ª£ng ph·∫£n ·ª©ng (Output Nodes): 10488
B·∫Øt ƒë·∫ßu training (File-by-File)...



  edge_index_drug = torch.tensor([rows, cols], dtype=torch.long)


  Processed 0/133 files. Loss: 0.8604
  Processed 10/133 files. Loss: 0.4947
  Processed 20/133 files. Loss: 0.4974
  Processed 30/133 files. Loss: 0.5369
  Processed 40/133 files. Loss: 0.4246
  Processed 50/133 files. Loss: 0.4147
  Processed 60/133 files. Loss: 0.3669
  Processed 70/133 files. Loss: 0.4295
  Processed 80/133 files. Loss: 0.4584
  Processed 90/133 files. Loss: 0.4431
  Processed 100/133 files. Loss: 0.3799
  Processed 110/133 files. Loss: 0.4427
  Processed 120/133 files. Loss: 0.4152
  Processed 130/133 files. Loss: 0.4122
‚úÖ K·∫æT TH√öC EPOCH 1
   - Average Loss: 0.4446
   - Precision:    0.9187
   - Recall:       0.9732
   - F1-Score:     0.9451

üéâ ƒê√£ train xong! (Model saved)


['Reaction_Names.pkl']