In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric_temporal.nn import TGCN
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import dask.dataframe as dd
from pathlib import Path
from tqdm import tqdm
import gc
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from torch.cuda.amp import autocast, GradScaler
import networkx as nx
from torch_geometric.utils import to_networkx
from sklearn.metrics import roc_curve, auc, roc_auc_score # ROC-AUC için gerekli kütüphaneler

In [3]:
# Set device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print(f"Using CUDA device: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available, using CPU")

Using CUDA device: NVIDIA GeForce GTX 1050 Ti


In [4]:
# Define data paths
edges_train_path = "edges_train_A.csv"
node_features_path = "node_features_sampled.csv"
edge_type_features_path = "edge_type_features.csv"
input_a_path = "input_A_initial.csv"

In [5]:
# --- Optimized Data Loading with Dask ---
print("Loading edges_train_df with Dask...")
edges_train_ddf = dd.read_csv(
    edges_train_path,
    header=None,
    skiprows=1,
    sep="\t",
    blocksize="16MB",
    assume_missing=True,
)

Loading edges_train_df with Dask...


In [6]:
# --- Create Node Mapping from Training Data Only ---
print("\nCreating node mapping from training data...")
train_nodes = pd.concat([
    edges_train_ddf[0].str.split(",", n=3, expand=True)[0].compute().astype("int32"),
    edges_train_ddf[0].str.split(",", n=3, expand=True)[1].compute().astype("int32")
]).unique()
num_nodes = len(train_nodes)
print(f"Total number of unique nodes in training data: {num_nodes}")

node_mapping = {node_id: idx for idx, node_id in enumerate(train_nodes)}


Creating node mapping from training data...
Total number of unique nodes in training data: 19442


In [7]:
def process_edges_partition(partition):
    try:
        split_df = partition.iloc[:, 0].str.split(",", n=3, expand=True)
        if split_df.shape[1] == 4:
            split_df.columns = ["src_id", "dst_id", "edge_type", "timestamp"]
            for col in ["src_id", "dst_id", "edge_type", "timestamp"]:
                split_df[col] = pd.to_numeric(split_df[col], errors="coerce").fillna(-1).astype("int32")
            split_df["src_id"] = split_df["src_id"].map(node_mapping).fillna(-1).astype("int32")
            split_df["dst_id"] = split_df["dst_id"].map(node_mapping).fillna(-1).astype("int32")
            split_df = split_df[
                (split_df["src_id"] >= 0) & 
                (split_df["dst_id"] >= 0) & 
                (split_df["src_id"] < num_nodes) & 
                (split_df["dst_id"] < num_nodes)
            ]
            if split_df.empty:
                print("Warning: Partition is empty after filtering.")
            return split_df
        else:
            print(f"Warning: Partition has {split_df.shape[1]} columns, expected 4.")
            return pd.DataFrame(columns=["src_id", "dst_id", "edge_type", "timestamp"], dtype="int32")
    except Exception as e:
        print(f"Error processing partition: {e}")
        return pd.DataFrame(columns=["src_id", "dst_id", "edge_type", "timestamp"], dtype="int32")

In [8]:
meta_edges = {"src_id": "int32", "dst_id": "int32", "edge_type": "int32", "timestamp": "int32"}
edges_train_ddf = edges_train_ddf.map_partitions(process_edges_partition, meta=meta_edges)
edges_train_ddf = edges_train_ddf[["src_id", "dst_id", "edge_type", "timestamp"]].persist()
print("edges_train_ddf partitions:", edges_train_ddf.npartitions)

edges_train_ddf partitions: 44


In [9]:
max_src = edges_train_ddf["src_id"].max().compute()
max_dst = edges_train_ddf["dst_id"].max().compute()
print(f"After mapping - Max src_id (train): {max_src}, Max dst_id (train): {max_dst}")
gc.collect()

After mapping - Max src_id (train): 19439, Max dst_id (train): 19441


311

In [10]:
# --- Load Node Features in Chunks ---
print("\nLoading node_features_df with Pandas (chunked)...")
def load_node_features_chunked(file_path, chunksize=10000):
    node_features_list = []
    for chunk in pd.read_csv(file_path, header=None, skiprows=1, sep="\t", chunksize=chunksize):
        split_df = chunk.iloc[:, 0].str.split(",", expand=True)
        split_df.columns = ["node_id"] + [f"feature_{i}" for i in range(split_df.shape[1] - 1)]
        split_df["node_id"] = pd.to_numeric(split_df["node_id"], errors="coerce").fillna(-1).astype("int32")
        for col in split_df.columns[1:]:
            split_df[col] = pd.to_numeric(split_df[col], errors="coerce").fillna(0).astype("float32")
        node_features_list.append(split_df)
        gc.collect()
    return pd.concat(node_features_list, axis=0)


Loading node_features_df with Pandas (chunked)...


In [11]:
node_features_df = load_node_features_chunked(node_features_path)
print("node_features_df shape:", node_features_df.shape)

node_features_df shape: (19441, 9)


In [12]:
node_features_df["node_id"] = node_features_df["node_id"].map(node_mapping).fillna(-1).astype("int32")
node_features_df = node_features_df[node_features_df["node_id"] >= 0]
print("node_features_df shape after mapping:", node_features_df.shape)

node_features_df shape after mapping: (19441, 9)


In [13]:
node_features_tensor = torch.zeros((num_nodes, node_features_df.shape[1] - 1), dtype=torch.float32)
known_nodes = node_features_df["node_id"].dropna().astype(int)
node_features_values = node_features_df.drop(columns=["node_id"]).values
node_features_values = (node_features_values - node_features_values.mean(axis=0)) / (node_features_values.std(axis=0) + 1e-8)
for idx, node_id in enumerate(known_nodes):
    if 0 <= node_id < num_nodes:
        node_features_tensor[node_id] = torch.tensor(node_features_values[idx], dtype=torch.float32)
print(f"Node features tensor shape: {node_features_tensor.shape}")
gc.collect()

Node features tensor shape: torch.Size([19442, 8])


0

In [14]:
# --- Load Edge Type Features in Chunks ---
print("\nLoading edge_type_features_df with Pandas (chunked)...")
def load_edge_type_features_chunked(file_path, chunksize=10000):
    edge_type_features_list = []
    for chunk in pd.read_csv(file_path, header=None, skiprows=1, sep="\t", chunksize=chunksize):
        split_df = chunk.iloc[:, 0].str.split(",", expand=True)
        split_df.columns = ["edge_type"] + [f"feature_{i}" for i in range(split_df.shape[1] - 1)]
        split_df["edge_type"] = pd.to_numeric(split_df["edge_type"], errors="coerce").fillna(-1).astype("int32")
        for col in split_df.columns[1:]:
            split_df[col] = pd.to_numeric(split_df[col], errors="coerce").fillna(0).astype("float32")
        edge_type_features_list.append(split_df)
        gc.collect()
    return pd.concat(edge_type_features_list, axis=0)


Loading edge_type_features_df with Pandas (chunked)...


In [15]:
edge_type_features_df = load_edge_type_features_chunked(edge_type_features_path)
print("edge_type_features_df shape:", edge_type_features_df.shape)
edge_features_values = edge_type_features_df.drop(columns=["edge_type"]).values
edge_features_values = (edge_features_values - edge_features_values.mean(axis=0)) / (edge_features_values.std(axis=0) + 1e-8)
edge_type_features_df.iloc[:, 1:] = edge_features_values
gc.collect()

edge_type_features_df shape: (247, 4)


0

In [16]:
# --- Load Input Data ---
print("\nLoading input_a_df with Pandas...")
def process_input_a(df):
    split_df = df.iloc[:, 0].str.split(",", expand=True)
    split_df.columns = ["src_id", "dst_id", "edge_type", "start_time", "end_time", "label"]  # 6 sütun
    for col in split_df.columns:
        split_df[col] = pd.to_numeric(split_df[col], errors="coerce").fillna(-1).astype("int32")
    return split_df


Loading input_a_df with Pandas...


In [17]:
input_a_df = pd.read_csv(input_a_path, header=None, skiprows=1, sep="\t")
input_a_df = process_input_a(input_a_df)
print("input_a_df shape:", input_a_df.shape)
input_a_df["src_id"] = input_a_df["src_id"].map(node_mapping).fillna(-1).astype("int32")
input_a_df["dst_id"] = input_a_df["dst_id"].map(node_mapping).fillna(-1).astype("int32")
gc.collect()

input_a_df shape: (8196, 6)


22

In [18]:
# --- Temporal Graph Creation ---
def create_temporal_graph_dask(edges_ddf, node_features_tensor, edge_type_features_df):
    temporal_data = []
    edge_type_features_tensor = torch.tensor(
        edge_type_features_df.drop(columns=["edge_type"]).values, dtype=torch.float32
    ).to(device)
    edge_type_to_idx = {et: idx for idx, et in enumerate(edge_type_features_df["edge_type"])}

    timestamp_groups = edges_ddf.groupby("timestamp").size().compute().index.sort_values()
    sampled_timestamps = timestamp_groups[::200]  # 10'da bir örnekleme, yaklaşık 2251 snapshot
    print(f"Total timestamps: {len(timestamp_groups)}, Sampled timestamps: {len(sampled_timestamps)}")

    for timestamp in tqdm(sampled_timestamps, desc="Creating snapshots"):
        group = edges_ddf[edges_ddf["timestamp"] == timestamp][["src_id", "dst_id", "edge_type"]].compute()
        if not group.empty:
            group = group[(group["src_id"] >= 0) & (group["dst_id"] >= 0) & (group["src_id"] < num_nodes) & (group["dst_id"] < num_nodes)]
            if not group.empty:
                edge_index = torch.tensor(group[["src_id", "dst_id"]].values, dtype=torch.long).t().to(device)
                max_index = edge_index.max().item()
                if max_index >= num_nodes:
                    print(f"Error: Snapshot at timestamp {timestamp} has max index {max_index}, expected < {num_nodes}")
                    continue
                edge_types = group["edge_type"].values
                edge_attr = torch.zeros((edge_index.shape[1], edge_type_features_tensor.shape[1]), dtype=torch.float32, device=device)
                for i, et in enumerate(edge_types):
                    if et in edge_type_to_idx:
                        edge_attr[i] = edge_type_features_tensor[edge_type_to_idx[et]]
                data = Data(
                    x=node_features_tensor.to(device),
                    edge_index=edge_index,
                    edge_attr=edge_attr,
                    t=int(timestamp),
                    num_nodes=num_nodes
                )
                temporal_data.append(data)
            else:
                print(f"Warning: Snapshot at timestamp {timestamp} is empty after filtering.")
        del group
        gc.collect()
    
    #del edge_type_features_tensor
    gc.collect()
    return temporal_data

In [19]:
print("\nCreating temporal graph with Dask...")
temporal_graph_sequence = create_temporal_graph_dask(
    edges_train_ddf, node_features_tensor, edge_type_features_df
)
print(f"NUMBER OF TEMPORAL SNAPSHOTS: {len(temporal_graph_sequence)}")


Creating temporal graph with Dask...
Total timestamps: 22510, Sampled timestamps: 113


Creating snapshots: 100%|██████████| 113/113 [01:20<00:00,  1.40it/s]

NUMBER OF TEMPORAL SNAPSHOTS: 113





In [20]:
#del edges_train_ddf
gc.collect()

0

In [21]:
# --- Pre-generate Negative Samples for Efficiency ---
def pre_generate_negative_samples(temporal_data, num_nodes, num_samples_per_snapshot=1000):
    neg_samples_cache = []
    for current_graph in tqdm(temporal_data, desc="Pre-generating negative samples"):
        edge_index = current_graph.edge_index.cpu()
        neg_edges = []
        attempts = 0
        max_attempts = num_samples_per_snapshot * 10
        while len(neg_edges) < num_samples_per_snapshot and attempts < max_attempts:
            src = np.random.randint(0, num_nodes)
            dst = np.random.randint(0, num_nodes)
            if src != dst and not torch.any((edge_index[0] == src) & (edge_index[1] == dst)):
                neg_edges.append([src, dst])
            attempts += 1
        neg_edges_tensor = torch.tensor(neg_edges, dtype=torch.long).t().to(device)
        neg_samples_cache.append(neg_edges_tensor)
    return neg_samples_cache

In [22]:
# --- Improved Model with Dropout ---
class TemporalLinkPredictor(torch.nn.Module):
    def __init__(self, num_node_features, num_edge_features, hidden_channels=64, dropout_rate=0.7):
        super(TemporalLinkPredictor, self).__init__()
        self.initial_fc = torch.nn.Linear(num_node_features, hidden_channels)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.tgcn = TGCN(in_channels=hidden_channels, out_channels=hidden_channels)
        self.edge_fc = torch.nn.Linear(num_edge_features, hidden_channels)
        self.linear = torch.nn.Linear(hidden_channels * 2, 1)

    def forward(self, x, edge_index, edge_attr, src_ids, dst_ids):
        h = F.relu(self.initial_fc(x))
        h = self.dropout(h)
        h = F.relu(self.tgcn(h, edge_index))
        edge_h = F.relu(self.edge_fc(edge_attr))
        edge_h_agg = torch.zeros((num_nodes, self.tgcn.out_channels), device=x.device, dtype=torch.float16)
        edge_h_agg.index_add_(0, src_ids, edge_h.to(torch.float16))
        edge_h_agg.index_add_(0, dst_ids, edge_h.to(torch.float16))
        degree = torch.bincount(edge_index[0], minlength=num_nodes).clamp(min=1).float().to(x.device)
        edge_h_agg = edge_h_agg.to(torch.float32) / degree.view(-1, 1)
        h = torch.cat([h, edge_h_agg], dim=-1)
        out = self.linear(h)  # Logit çıktılar
        out_pairs = out[src_ids]
        return out_pairs.squeeze(-1)

In [56]:
# Initialize model
num_node_features = temporal_graph_sequence[0].x.shape[1] if temporal_graph_sequence else 0
num_edge_features = temporal_graph_sequence[0].edge_attr.shape[1] if temporal_graph_sequence else 0
model = TemporalLinkPredictor(num_node_features, num_edge_features, dropout_rate=0.3).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
pos_weight = torch.tensor([0.1], device=device)
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
scaler = GradScaler()

In [57]:
# Pre-generate negative samples
print("\nPre-generating negative samples...")
neg_samples_cache = pre_generate_negative_samples(temporal_graph_sequence, num_nodes)


Pre-generating negative samples...


Pre-generating negative samples: 100%|██████████| 113/113 [00:04<00:00, 26.49it/s]


In [25]:
# --- Training with Positive and Negative Samples ---
def train(model, temporal_data, neg_samples_cache, optimizer, criterion, batch_size=64):
    model.train()
    total_loss = 0
    num_nodes = temporal_data[0].x.shape[0]
    
    for i in range(0, len(temporal_data) - 1, batch_size):
        batch_data = temporal_data[i:i + batch_size]
        batch_neg_samples = neg_samples_cache[i:i + batch_size]
        optimizer.zero_grad()
        batch_loss = 0
        for idx, current_graph in enumerate(batch_data):
            if current_graph.edge_index.numel() > 0 and current_graph.x is not None:
                x = current_graph.x.to(device)
                edge_index = current_graph.edge_index.to(device)
                edge_attr = current_graph.edge_attr.to(device)
                
                pos_edges = edge_index.t()[:min(200, edge_index.shape[1])]
                if pos_edges.shape[0] == 0:
                    continue
                pos_src = pos_edges[:, 0]
                pos_dst = pos_edges[:, 1]
                pos_labels = torch.ones(pos_src.shape[0], device=device)
                
                neg_edges = batch_neg_samples[idx]
                if neg_edges.shape[1] == 0:
                    continue
                neg_edges = neg_edges[:, :pos_src.shape[0]]  # Negatif örnek sayısını pozitifle eşitle
                neg_src = neg_edges[0]
                neg_dst = neg_edges[1]
                neg_labels = torch.zeros(neg_src.shape[0], device=device)
                
                src_ids = torch.cat([pos_src, neg_src])
                dst_ids = torch.cat([pos_dst, neg_dst])
                labels = torch.cat([pos_labels, neg_labels])
                
                valid_mask = (src_ids < num_nodes) & (dst_ids < num_nodes)
                if valid_mask.sum() == 0:
                    continue
                src_ids = src_ids[valid_mask]
                dst_ids = dst_ids[valid_mask]
                labels = labels[valid_mask]
                if len(src_ids) == 0:
                    continue
                
                batch_size = len(src_ids)
                edge_indices = torch.zeros(batch_size, dtype=torch.long, device=device)
                for i in range(batch_size):
                    src, dst = src_ids[i].item(), dst_ids[i].item()
                    edge_idx = (edge_index[0] == src) & (edge_index[1] == dst)
                    if edge_idx.any():
                        edge_indices[i] = edge_idx.nonzero(as_tuple=True)[0][0]
                    else:
                        edge_indices[i] = -1
                edge_attr_batch = torch.zeros((batch_size, num_edge_features), dtype=torch.float32, device=device)
                valid_edge_mask = edge_indices >= 0
                edge_attr_batch[valid_edge_mask] = edge_attr[edge_indices[valid_edge_mask]]
                
                if edge_attr_batch.shape[0] != len(src_ids):
                    print(f"Warning: edge_attr_batch shape {edge_attr_batch.shape} does not match src_ids length {len(src_ids)}")
                    continue
                
                with autocast():
                    out = model(x, edge_index, edge_attr_batch, src_ids, dst_ids)
                    loss = criterion(out, labels)
                
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                
                batch_loss += loss.item()
                
                del x, edge_index, edge_attr, pos_edges, neg_edges, src_ids, dst_ids, labels, out, edge_attr_batch
                torch.cuda.empty_cache()
                gc.collect()
        
        total_loss += batch_loss / len(batch_data)
        del batch_data, batch_neg_samples
        gc.collect()
    
    return total_loss / max(1, len(temporal_data) // batch_size)

In [26]:
# --- Visualization Functions ---
def visualize_preprocessing_flow():
    plt.figure(figsize=(10, 6))
    plt.title("Preprocessing Flow Diagram")
    plt.text(0.1, 0.7, "1. Load edges_train_A.csv with Dask\n2. Create node mapping\n3. Load node_features_sampled.csv\n4. Load edge_type_features.csv\n5. Load input_A_initial.csv", fontsize=12)
    plt.axis('off')
    plt.savefig("preprocessing_flow.png")
    plt.close()

In [27]:
def visualize_temporal_snapshot(temporal_data):
    plt.figure(figsize=(12, 8))
    plt.title("Temporal Snapshot Diagram")
    timestamps = [data.t for data in temporal_data]
    plt.hist(timestamps, bins=50, edgecolor='black')
    plt.xlabel("Timestamp")
    plt.ylabel("Snapshot Count")
    plt.savefig("temporal_snapshot.png")
    plt.close()

In [28]:
def visualize_model_architecture():
    plt.figure(figsize=(10, 6))
    plt.title("Model Architecture Diagram")
    plt.text(0.1, 0.7, "TemporalLinkPredictor:\n- Initial FC (Node Features)\n- TGCN Layer\n- Edge FC (Edge Features)\n- Linear Output", fontsize=12)
    plt.axis('off')
    plt.savefig("model_architecture.png")
    plt.close()

In [29]:
def visualize_embeddings(h, labels, title="Node Embeddings"):
    h = h.detach().cpu().numpy()
    labels = labels.cpu().numpy()
    z = TSNE(n_components=2).fit_transform(h)
    plt.figure(figsize=(10, 10))
    plt.scatter(z[:, 0], z[:, 1], c=labels, cmap="Set2", s=50)
    plt.title(title)
    plt.savefig(f"{title.replace(' ', '_')}.png")
    plt.close()

In [58]:
# Generate visualizations
print("\nGenerating preprocessing flow diagram...")
visualize_preprocessing_flow()
print("Preprocessing flow diagram saved as 'preprocessing_flow.png'")

print("\nGenerating temporal snapshot diagram...")
visualize_temporal_snapshot(temporal_graph_sequence)
print("Temporal snapshot diagram saved as 'temporal_snapshot.png'")

print("\nGenerating model architecture diagram...")
visualize_model_architecture()
print("Model architecture diagram saved as 'model_architecture.png'")


Generating preprocessing flow diagram...
Preprocessing flow diagram saved as 'preprocessing_flow.png'

Generating temporal snapshot diagram...
Temporal snapshot diagram saved as 'temporal_snapshot.png'

Generating model architecture diagram...
Model architecture diagram saved as 'model_architecture.png'


In [59]:
print("\nTraining model...")
losses = []
for epoch in range(5):
    loss = train(model, temporal_graph_sequence, neg_samples_cache, optimizer, criterion, batch_size=64)
    losses.append(loss)
    print(f"Epoch: {epoch}, Loss: {loss:.4f}")
    
    model.eval()
    with torch.no_grad():
        latest_graph = temporal_graph_sequence[-1]
        x = latest_graph.x.to(device)
        edge_index = latest_graph.edge_index.to(device)
        edge_attr = latest_graph.edge_attr.to(device)
        h = F.relu(model.initial_fc(x))
        h = F.relu(model.tgcn(h, edge_index))
        degrees = torch.bincount(latest_graph.edge_index[0], minlength=latest_graph.x.shape[0])
        visualize_embeddings(h, degrees, f"Node_Embeddings_Epoch_{epoch}")


Training model...
Epoch: 0, Loss: 0.3535
Epoch: 1, Loss: 0.1781
Epoch: 2, Loss: 0.1174
Epoch: 3, Loss: 0.0913
Epoch: 4, Loss: 0.0756


In [60]:
plt.figure(figsize=(8, 6))
plt.plot(losses)
plt.title("Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.savefig("training_loss.png")
plt.close()

In [61]:
# --- Prediction ---
print("\nPrediction on test data...")
model.eval()
output_a = []
true_labels = []  # Gerçek etiketleri toplamak için
predictions = []  # Tahminleri toplamak için
edge_type_to_idx = {et: idx for idx, et in enumerate(edge_type_features_df["edge_type"])}
edge_type_features_tensor = torch.tensor(
    edge_type_features_df.drop(columns=["edge_type"]).values, dtype=torch.float32
).to(device)


Prediction on test data...


In [62]:
temporal_graph_sequence.sort(key=lambda x: x.t)
snapshot_timestamps = [g.t for g in temporal_graph_sequence]

In [63]:
print(f"Snapshot timestamps range: min={min(snapshot_timestamps)}, max={max(snapshot_timestamps)}")
start_times = input_a_df["start_time"].values
print(f"Input start_time range: min={start_times.min()}, max={start_times.max()}")
outside_range_before = sum(start_times < min(snapshot_timestamps))
outside_range_after = sum(start_times > max(snapshot_timestamps))
print(f"Number of start_times before earliest snapshot: {outside_range_before}")
print(f"Number of start_times after latest snapshot: {outside_range_after}")

Snapshot timestamps range: min=1413662400, max=1494313200
Input start_time range: min=1494579608, max=1498946320
Number of start_times before earliest snapshot: 0
Number of start_times after latest snapshot: 8196


In [64]:
batch_size = 50
num_batches = (len(input_a_df) + batch_size - 1) // batch_size

In [65]:
with torch.no_grad():
    for batch_idx in tqdm(range(num_batches), desc="Predicting"):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, len(input_a_df))
        batch_df = input_a_df.iloc[start_idx:end_idx]
        
        src_ids = []
        dst_ids = []
        edge_types = []
        start_times = []
        labels = []  # Gerçek etiketleri toplamak için
        batch_output = []
        snapshots = []

        for idx, row in batch_df.iterrows():
            src_id = int(row["src_id"])
            dst_id = int(row["dst_id"])
            edge_type = int(row["edge_type"])
            start_time = row["start_time"]
            label = int(row["label"])  # Label sütununu al
            
            if src_id >= num_nodes or dst_id >= num_nodes or src_id < 0 or dst_id < 0:
                print(f"Invalid node indices at row {start_idx + idx}: src_id={src_id}, dst_id={dst_id}, num_nodes={num_nodes}")
                batch_output.append(0.5)
                continue
            
            relevant_history = None
            for i in range(len(snapshot_timestamps) - 1, -1, -1):
                if snapshot_timestamps[i] <= start_time:
                    relevant_history = temporal_graph_sequence[i]
                    break
            
            if relevant_history is None:
                print(f"No snapshot found for start_time={start_time} at row={start_idx + idx}, using latest snapshot")
                relevant_history = temporal_graph_sequence[-1]
            
            src_ids.append(src_id)
            dst_ids.append(dst_id)
            edge_types.append(edge_type)
            start_times.append(start_time)
            labels.append(label)  # Gerçek etiketi ekle
            snapshots.append(relevant_history)
            batch_output.append(None)
        
        if not src_ids:
            output_a.extend([0.5 if x is None else x for x in batch_output])
            gc.collect()
            continue

        src_ids_tensor = torch.tensor(src_ids, device=device)
        dst_ids_tensor = torch.tensor(dst_ids, device=device)
        
        batch_size_valid = len(edge_types)
        for i in range(batch_size_valid):
            snapshot = snapshots[i]
            x = snapshot.x.to(device)
            edge_index = snapshot.edge_index.to(device)
            edge_attr = torch.zeros((1, num_edge_features), dtype=torch.float32, device=device)
            edge_type_idx = edge_type_to_idx.get(edge_types[i], 0)
            edge_attr[0] = edge_type_features_tensor[edge_type_idx]
            
            src_id_tensor = src_ids_tensor[i:i+1]
            dst_id_tensor = dst_ids_tensor[i:i+1]
            with autocast():
                out = model(x, edge_index, edge_attr, src_id_tensor, dst_id_tensor)
                out = torch.sigmoid(out)  # Olasılık değerleri
            batch_output[i] = out.item()
            
            # Gerçek etiket ve tahmini topla
            true_labels.append(labels[i])  # Gerçek label’i kullan
            predictions.append(out.item())
        
        batch_output = [0.5 if x is None else x for x in batch_output]
        output_a.extend(batch_output)
        gc.collect()

Predicting: 100%|██████████| 164/164 [02:24<00:00,  1.13it/s]


In [66]:
# --- AUC Hesaplama ve ROC-AUC Grafiği ---
print("\nCalculating AUC and plotting ROC curve...")
all_labels = true_labels
all_predictions = predictions


Calculating AUC and plotting ROC curve...


In [67]:
# ROC-AUC skoru hesapla
roc_auc = roc_auc_score(all_labels, all_predictions)
print(f"AUC Score (Overall): {roc_auc:.4f}")

AUC Score (Overall): 0.5302


In [68]:
# ROC eğrisi ve optimal threshold
fpr, tpr, thresholds = roc_curve(all_labels, all_predictions)

In [69]:
# Optimal threshold'u bul (Youden's J istatistiği: TPR - FPR maksimize edilir)
J = tpr - fpr
optimal_idx = np.argmax(J)
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal Threshold: {optimal_threshold:.4f}")

Optimal Threshold: 0.4014


In [70]:
# ROC eğrisini çiz
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.4f})")
plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")
plt.scatter(fpr[optimal_idx], tpr[optimal_idx], color='red', label=f"Optimal Threshold ({optimal_threshold:.2f})")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC-AUC Curve")
plt.legend()
plt.savefig("roc_auc_curve.png")
plt.close()

In [71]:
# Optimal threshold ile doğruluk hesapla
predictions_binary = [1 if p >= optimal_threshold else 0 for p in all_predictions]
accuracy = sum(1 for true, pred in zip(all_labels, predictions_binary) if true == pred) / len(all_labels)
print(f"Accuracy at Optimal Threshold ({optimal_threshold:.4f}): {accuracy:.4f}")

Accuracy at Optimal Threshold (0.4014): 0.5178


In [72]:
# --- Save Model ---
print("\nSaving trained model...")
torch.save(model.state_dict(), "trained_model.pth")
print("Model saved as 'trained_model.pth'")


Saving trained model...
Model saved as 'trained_model.pth'


In [73]:
# --- Save Output ---
print("\nSaving output...")
output_df = pd.DataFrame({"probability": output_a})
output_df["probability"] = output_df["probability"].astype(float)
output_df.to_csv("output_a.csv", index=False, header=False, na_rep="0.5")
print("Output saved to output_a.csv")
gc.collect()


Saving output...
Output saved to output_a.csv


4274