In [58]:

import pandas as pd
import numpy as np



# Run configuration.
# NOTE: CSV_PATH is a machine-specific absolute path; point it at your own copy
# of the data before running.
CSV_PATH = r"C:\Users\pc\Desktop\30_1000_base.csv" 
SEED = 7
OBS_FRACTION  = 1          # fraction of all (i < j) pairs treated as observed
HOLDOUT_FRACTION   = 0.7   # fraction of the observed pairs held out for validation
M_PROBES = 256             # number of random truth assignments per fingerprint


df = pd.read_csv(CSV_PATH)

# Column 0 is read as the per-row label vector; the remaining columns are read
# as the pairwise value matrix that is later indexed by (row, column) pairs.
N  = df.shape[0]
y = df.iloc[:,0].to_numpy().astype(np.float32)
S = df.iloc[:,1:].to_numpy(dtype= float)

# Everything downstream indexes S[i, j] for i < j < N and builds one fingerprint
# row per column header, so the matrix must be square. Fail early with a clear
# message instead of an obscure IndexError / shape mismatch later on.
if S.shape != (N, N):
    raise ValueError(
        f"Expected a square {N}x{N} pairwise matrix after the label column, got {S.shape}"
    )

def upper_pairs(N):
    """Return all index pairs (i, j) with i < j < N as an (M, 2) integer array.

    Pairs come from the strict upper triangle (k=1), in the order produced by
    ``np.triu_indices``.
    """
    rows, cols = np.triu_indices(N, k=1)
    return np.column_stack((rows, cols))

# Enumerate every unordered pair and sample the "observed" subset without replacement.
all_pairs = upper_pairs(N)
total = len(all_pairs)
obs = int(OBS_FRACTION * total)
idx = np.random.RandomState(SEED).choice(total, size=obs, replace=False)

obs_pairs_all = all_pairs[idx]
obs_values_all = S[obs_pairs_all[:, 0], obs_pairs_all[:, 1]]

# Split the observed pairs into holdout (first `hold` of the permutation) and
# training (the rest). This generator is seeded with the same SEED as the draw
# above, exactly as before, so the split is unchanged.
rng = np.random.RandomState(SEED)
perm = rng.permutation(len(obs_pairs_all))
hold = int(HOLDOUT_FRACTION * len(obs_pairs_all))
train_obs_idx = perm[hold:]
hold_idx = perm[:hold]

holdout_pairs = obs_pairs_all[hold_idx]
holdout_vals = obs_values_all[hold_idx]

obs_pairs = obs_pairs_all[train_obs_idx]
obs_vals = obs_values_all[train_obs_idx]

# Pairs that were never sampled. A boolean mask over `all_pairs` replaces the
# former Python-level tuple-set membership loop; order is preserved, and an
# empty result keeps the (0, 2) shape.
unobs_mask = np.ones(total, dtype=bool)
unobs_mask[idx] = False
unobs_pairs = all_pairs[unobs_mask].astype(np.int64)

print(f"Observed (train) pairs={len(obs_pairs)} | Holdout (pairs)={len(holdout_pairs)} | Unobserved={len(unobs_pairs)}")





Observed (train) pairs=149850 | Holdout (pairs)=349650 | Unobserved=0


### Semantic fingerprint (FP)

In [59]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import re, hashlib

# Every column header after the first is treated as one formula string.
formulas = [header for header in df.columns[1:]]
# ---------------- Propositional Parser & Robust FP (unseen atoms handled) ----------------
# Surface spellings of each connective, mapped to a space-padded keyword token.
OP_MAP = {"→":" IMP ", "⇒":" IMP ", "=>":" IMP ", "->":" IMP ",
          "↔":" IFF ", "<=>":" IFF ", "<->":" IFF ",
          "⊑":" SUB ",  # treat as IMP
          "⊓":" AND ", "∧":" AND ", "&&":" AND ",
          "⊔":" OR  ", "∨":" OR  ", "||":" OR  ",
          "¬":" NOT ", "~":" NOT ", "!":" NOT "}
BIN_OPS, UNARY_OPS = {"AND","OR","IMP","IFF","SUB"}, {"NOT"}
# Tokens are identifier-like runs or single parentheses; anything else is dropped.
TOKEN_RE = re.compile(r"[A-Za-z0-9_]+|[()]")

def norm_text(s):
    """Rewrite every operator spelling in ``s`` to its keyword token.

    Longer spellings are replaced first. Otherwise "=>" / "->" would be
    rewritten inside "<=>" / "<->", leaving a stray "<" that the tokenizer
    drops, and the biconditional would silently become an implication.
    """
    s = str(s)
    for k in sorted(OP_MAP, key=len, reverse=True):
        s = s.replace(k, OP_MAP[k])
    return s

def lex(s):
    """Tokenize a formula string into atoms, keyword operators and parentheses."""
    return TOKEN_RE.findall(norm_text(s))

def is_atom(t):
    """Return True when token ``t`` is neither an operator keyword nor a parenthesis."""
    return t not in BIN_OPS|UNARY_OPS|{"(",")"}

class Parser:
    """Precedence-climbing parser that turns a token list into a nested-tuple AST.

    Node shapes: ("ATOM", name), ("UN", op, child), ("BIN", op, left, right).
    Binding strength, loosest to tightest: IFF < IMP/SUB < OR < AND; IMP, IFF
    and SUB associate to the right, AND/OR to the left.
    """
    PREC = {"IFF":1,"IMP":2,"SUB":2,"OR":3,"AND":4}
    RIGHT = {"IMP","IFF","SUB"}

    def __init__(self, toks):
        self.toks = toks
        self.i = 0

    def peek(self):
        """Return the current token without consuming it, or None at end of input."""
        return self.toks[self.i] if self.i < len(self.toks) else None

    def pop(self):
        """Consume and return the current token (None at end of input)."""
        t = self.peek()
        if t is not None:
            self.i += 1
        return t

    def parse(self):
        """Parse one expression from the current position (trailing tokens are ignored)."""
        return self.expr(0)

    def expr(self, minp):
        """Parse a binary expression whose operators all have precedence >= ``minp``."""
        node = self.unary()
        while True:
            op = self.peek()
            if op not in BIN_OPS:
                break
            prec = self.PREC.get(op, 0)
            if prec < minp:
                break
            self.pop()
            # Right-associative operators recurse at the same level; left-associative
            # ones require a strictly tighter operator on the right-hand side.
            nextp = prec if op in self.RIGHT else prec + 1
            rhs = self.expr(nextp)
            node = ("BIN", op, node, rhs)
        return node

    def unary(self):
        """Parse a NOT chain, a parenthesised expression, or a single atom.

        Raises:
            ValueError: if a "(" is not closed by a matching ")".
        """
        t = self.peek()
        if t in UNARY_OPS:
            self.pop()
            c = self.unary()
            return ("UN", t, c)
        if t == "(":
            self.pop()
            n = self.expr(0)
            # Explicit check instead of `assert`: under `python -O` the assert
            # (including its pop()) would vanish and ")" would never be consumed.
            if self.pop() != ")":
                raise ValueError("Missing ')'")
            return n
        a = self.pop()
        # End of input yields the placeholder atom "x".
        return ("ATOM", a if a is not None else "x")

def parse_formula(s):
    """Parse formula text ``s`` into an AST, falling back to ("ATOM", "x") on failure.

    The fallback is a deliberate best-effort so that one malformed formula does
    not abort the run. Only ``Exception`` subclasses are swallowed, so
    KeyboardInterrupt / SystemExit still propagate.
    """
    try:
        return Parser(lex(s)).parse()
    except Exception:
        return ("ATOM", "x")

def atoms_in(node, acc=None):
    """Collect the atom names occurring under ``node``.

    Names are added to ``acc`` in place (a fresh set is created when ``acc`` is
    None) and that same set is returned. Unknown node kinds contribute nothing.
    """
    found = set() if acc is None else acc
    pending = [node]
    while pending:
        cur = pending.pop()
        tag = cur[0]
        if tag == "ATOM":
            found.add(cur[1])
        elif tag == "UN":
            pending.append(cur[2])
        elif tag == "BIN":
            pending.extend((cur[2], cur[3]))
    return found

def depth(node):
    """Return the height of the AST rooted at ``node`` (an atom or unknown node counts as 1)."""
    tag = node[0]
    if tag == "UN":
        children = (node[2],)
    elif tag == "BIN":
        children = (node[2], node[3])
    else:
        # Atoms and unrecognised node kinds are leaves.
        return 1
    return 1 + max(depth(child) for child in children)

# --- Deterministic hashing for unseen atoms (probe-consistent) ---
def _u64_from_str(s: str) -> int:
    """Hash ``s`` (UTF-8) with 8-byte BLAKE2b and return the digest as an unsigned integer."""
    hex_digest = hashlib.blake2b(s.encode('utf-8'), digest_size=8).hexdigest()
    # Reading the hex digest base-16 is the big-endian interpretation of the bytes.
    return int(hex_digest, 16)

def bernoulli_from_name(atom: str, probe_idx: int, p: float, seed: int) -> bool:
    """Deterministic Bernoulli(p) draw keyed by (atom, probe_idx, seed).

    The key is hashed, reduced to 53 bits and scaled into [0, 1); the result is
    whether that value falls below ``p``.
    """
    span = 1 << 53
    key = f"{atom}|{probe_idx}|{seed}"
    u = (_u64_from_str(key) % span) / float(span)
    return u < p

class ProbeEnv:
    """Truth assignment for one probe, with a deterministic fallback for unknown atoms.

    Atoms present in ``base_env`` use the stored value. Any other atom gets a
    value from ``bernoulli_from_name`` (keyed by atom, probe index and seed)
    which is memoised in ``cache``.
    """

    def __init__(self, base_env: dict, probe_idx: int, bias_p: float, seed: int):
        self.base = base_env
        self.m = probe_idx
        self.p = float(bias_p)
        self.seed = int(seed)
        self.cache = {}

    def get(self, atom: str) -> bool:
        """Return the truth value of ``atom`` under this probe."""
        known = self.base
        if atom in known:
            return bool(known[atom])
        memo = self.cache
        if atom not in memo:
            memo[atom] = bernoulli_from_name(atom, self.m, self.p, self.seed)
        return memo[atom]

def eval_ast(node, env_obj):
    """Evaluate an AST to a boolean under ``env_obj`` (anything exposing ``get(atom)``).

    Unary nodes are evaluated as negation. SUB is evaluated like IMP. Unknown
    node kinds and unknown binary operators evaluate to False.
    """
    kind = node[0]
    if kind == "ATOM":
        return bool(env_obj.get(node[1]))
    if kind == "UN":
        _, op, child = node
        return not eval_ast(child, env_obj)
    if kind != "BIN":
        return False

    _, op, left, right = node
    # Both operands are always evaluated, whatever the operator.
    lhs = eval_ast(left, env_obj)
    rhs = eval_ast(right, env_obj)
    if op == "AND":
        return lhs and rhs
    if op == "OR":
        return lhs or rhs
    if op == "IMP" or op == "SUB":
        return rhs or not lhs
    if op == "IFF":
        return lhs == rhs
    return False

# ---------------- Build semantic fingerprint (FP) ----------------
# Parse every formula and gather the atom vocabulary (sorted for a stable order).
asts = list(map(parse_formula, formulas))
all_atoms = sorted({name for tree in asts for name in atoms_in(tree)})
A = len(all_atoms)
print(f"#Atoms found: {A}")

# Random probes: the first half draws each atom True with probability 0.3,
# the second half with probability 0.7.
rng = np.random.default_rng(SEED)
n_low_bias = M_PROBES // 2
biases = np.concatenate([np.full(n_low_bias, 0.3), np.full(M_PROBES - n_low_bias, 0.7)])
assignments = []
for p in biases:
    vals = rng.random(A) < p
    env = dict(zip(all_atoms, map(bool, vals)))  # known atoms only
    assignments.append(env)

# Truth matrix: entry (i, m) is 1.0 when formula i holds under probe m, else 0.0.
# ProbeEnv supplies a deterministic value for any atom missing from the probe.
T_mat = np.zeros((N, M_PROBES), dtype=np.float32)
for i, ast in enumerate(asts):
    for m_i, base_env in enumerate(assignments):
        env_obj = ProbeEnv(base_env, probe_idx=m_i, bias_p=biases[m_i], seed=SEED)
        T_mat[i, m_i] = float(bool(eval_ast(ast, env_obj)))

# Structural features
def op_counts(toks):
    """Return token counts as (AND, OR, NOT, IMP + SUB, IFF)."""
    tally = {kw: toks.count(kw) for kw in ("AND", "OR", "NOT", "IMP", "SUB", "IFF")}
    implications = tally["IMP"] + tally["SUB"]
    return tally["AND"], tally["OR"], tally["NOT"], implications, tally["IFF"]
# One structural row per formula:
# [#distinct atoms, AST depth, #AND, #OR, #NOT, #IMP+SUB, #IFF, #tokens]
struct_rows=[]
for s,ast in zip(formulas,asts):
    toks = lex(s)
    ac = len(atoms_in(ast, set()))
    d  = depth(ast)
    c_and, c_or, c_not, c_imp, c_iff = op_counts(toks)
    struct_rows.append([ac, d, c_and, c_or, c_not, c_imp, c_iff, len(toks)])
STRUCT = np.array(struct_rows, dtype=np.float32)

# Final FP: truth-table columns followed by the structural columns
FP = np.concatenate([T_mat, STRUCT], axis=1).astype(np.float32)
print("Fingerprint shape:", FP.shape)

# ---------------- FP preprocessing for models ----------------
# Standardise, then reduce with whitened PCA for the GNN encoder input.
sc_fp_gnn = StandardScaler().fit(FP)
FP_std_g  = sc_fp_gnn.transform(FP).astype(np.float32)
# PCA cannot extract more components than min(n_samples, n_features), so the
# 256 cap is bounded by both dimensions (the old code ignored the sample count
# and would fail on inputs with fewer than 256 rows).
n_pca_components = min(256, FP_std_g.shape[0], FP_std_g.shape[1])
pca_gnn   = PCA(n_components=n_pca_components, whiten=True, random_state=SEED).fit(FP_std_g)
FP_low    = pca_gnn.transform(FP_std_g).astype(np.float32)


#Atoms found: 30
Fingerprint shape: (1000, 264)


In [60]:

# ---------------- PyG GNN for Edge Prediction ----------------
import torch, torch.nn as nn
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv 
import random
import numpy as np
import torch
from scipy.stats import pearsonr, spearmanr
import torch.nn.functional as F

# NOTE(review): SEED is also assigned in the configuration cell with the same
# value; this re-assignment shadows it and the two must be kept in sync.
SEED = 7 

# Seed every RNG used below so that re-running the cell gives the same result.
# Python random
random.seed(SEED)

# NumPy
np.random.seed(SEED)

# PyTorch (CPU)
torch.manual_seed(SEED)
torch.use_deterministic_algorithms(True)

# Node features: the PCA-reduced fingerprint, one row per formula
x = torch.tensor(FP_low, dtype=torch.float32)

# Training edges (observed): pairs transposed to shape (2, E), with one target value per edge
edge_index = torch.tensor(obs_pairs.T, dtype=torch.long) 
edge_label = torch.tensor(obs_vals, dtype=torch.float32)

# Holdout edges (for validation), same layout as the training edges
val_edge_index = torch.tensor(holdout_pairs.T, dtype=torch.long)
val_edge_label = torch.tensor(holdout_vals, dtype=torch.float32)

# NOTE(review): each pair appears once, as (i, j) with i < j (it comes from the
# strict upper triangle). If SAGEConv treats edge_index as directed, messages
# flow in one direction only -- confirm whether the graph should be symmetrised.
# train() also uses data.edge_index as the label index, so both must change together.
data = Data(x=x, edge_index=edge_index)

# -------- GNN Encoder  --------
class GNNEncoder(torch.nn.Module):
    """Two stacked SAGEConv layers with a skip connection around the second one.

    The second layer's output is added to the first layer's output, so
    ``out_channels`` has to be addition-compatible with ``hidden_channels``.
    No activation is applied between the layers.
    """

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # The first layer is constructed with in_channels=-1.
        self.conv1 = SAGEConv(-1, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return node embeddings for features ``x`` over the graph ``edge_index``."""
        first = self.conv1(x, edge_index)
        second = self.conv2(first, edge_index)
        return second + first

# -------- Edge Decoder --------
class EdgeDecoder(torch.nn.Module):
    """Score an edge from the element-wise absolute difference of its endpoint embeddings.

    The difference goes through Linear -> ReLU -> Linear -> Sigmoid, giving one
    value per edge.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        layers = [
            nn.Linear(hidden_channels, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, 1),
            nn.Sigmoid(),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, z, edge_idx):
        """Return a 1-D tensor with one score per column of ``edge_idx``."""
        src, dst = edge_idx
        # |z_src - z_dst| is symmetric in the two endpoints.
        gap = (z[src] - z[dst]).abs()
        scores = self.mlp(gap)
        return scores.view(-1)
# -------- Full Model --------
class Model(torch.nn.Module):
    """Encoder/decoder pair: embed nodes over one edge set, score another edge set."""

    def __init__(self, hidden_channels):
        super().__init__()
        # The encoder's hidden and output widths are both hidden_channels.
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x, edge_index, edge_label_index):
        """Encode ``x`` over ``edge_index`` and return one score per edge in ``edge_label_index``."""
        embeddings = self.encoder(x, edge_index)
        scores = self.decoder(embeddings, edge_label_index)
        return scores

# Instantiate the edge-regression model and the Adam optimiser over all of its parameters.
model = Model(hidden_channels=32)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-2)


# -------- Training --------

def train():
    """Run one full-batch optimisation step and return the training MSE as a float.

    The training edges serve both as the message-passing graph and as the
    edges being scored.
    """
    model.train()
    optimizer.zero_grad()
    scores = model(data.x, data.edge_index, data.edge_index)
    mse = F.mse_loss(scores, edge_label)
    mse.backward()
    optimizer.step()
    return mse.item()

from sklearn.metrics import r2_score

@torch.no_grad()
def test():
    """Score the holdout edges and return (rmse, r2, pearson, spearman).

    Message passing still runs over the training edges; only the scored edge
    set changes.
    """
    model.eval()
    scores = model(data.x, data.edge_index, val_edge_index)
    truth = val_edge_label

    # Regression metrics
    rmse = F.mse_loss(scores, truth).sqrt().item()
    scores_np, truth_np = scores.numpy(), truth.numpy()
    r2 = r2_score(truth_np, scores_np)

    # Correlation metrics (p-values are discarded)
    pearson_corr, _ = pearsonr(truth_np, scores_np)
    spearman_corr, _ = spearmanr(truth_np, scores_np)

    return rmse, r2, pearson_corr, spearman_corr

# ---------------- TRAINING LOOP ----------------
# Train for 300 epochs. Holdout metrics are computed only on the epochs that are
# logged (epoch 1 and every 10th); previously they were computed every epoch and
# then discarded on unlogged ones. Printed output is unchanged, and the final
# epoch is still evaluated, so the metric variables and model mode end up the same.
for epoch in range(1, 301):
    loss = train()

    if epoch % 10 == 0 or epoch == 1:
        rmse, r2, pearson_corr, spearman_corr = test()
        print(
            f"Epoch {epoch:03d} | "
            f"Loss: {loss:.6f} | "
            f"RMSE: {rmse:.6f} | "
            f"R2: {r2:.4f} | "
            f"Pearson: {pearson_corr:.4f} | "
            f"Spearman: {spearman_corr:.4f}"
        )
# ---------------- Compute predicted similarities for all pairs ----------------

N = x.size(0)

# All (i < j) pairs; `upper_pairs` is the helper defined in the data-split cell
# (the identical second definition that used to live here was removed).
all_pairs = upper_pairs(N)

# Pairs without a training value. A boolean lookup table replaces the former
# tuple-set membership loop; the order of `all_pairs` is preserved.
observed_mask = np.zeros((N, N), dtype=bool)
observed_mask[obs_pairs[:, 0], obs_pairs[:, 1]] = True
missing_pairs_np = all_pairs[~observed_mask[all_pairs[:, 0], all_pairs[:, 1]]]

missing_pairs_list = [tuple(pair) for pair in missing_pairs_np.tolist()]
# Shape (2, M); stays (2, 0) when nothing is missing, so the decoder can still
# unpack it into row/column indices.
missing_pairs = torch.tensor(missing_pairs_np, dtype=torch.long).T

# ------------------ Predict values for the missing pairs only ---------------
with torch.no_grad():
    model.eval()
    pred_missing = model(x, edge_index, missing_pairs)

# ------------------ Assemble the full symmetric N x N matrix ----------------
# The diagonal is left at 0. Training pairs get their true value, every other
# pair the model prediction; both triangles are written with vectorised index
# assignment instead of one tensor write per pair.
full_sim_matrix = torch.zeros((N, N), dtype=torch.float32)

obs_index = torch.tensor(obs_pairs, dtype=torch.long)
obs_true = torch.tensor(obs_vals, dtype=torch.float32)
full_sim_matrix[obs_index[:, 0], obs_index[:, 1]] = obs_true
full_sim_matrix[obs_index[:, 1], obs_index[:, 0]] = obs_true

full_sim_matrix[missing_pairs[0], missing_pairs[1]] = pred_missing
full_sim_matrix[missing_pairs[1], missing_pairs[0]] = pred_missing

# ------------------ final result: full matrix -------------------------------
print("Full similarity matrix shape:", full_sim_matrix.shape)

# ---------------- Build final node features ----------------
# Only the reduced fingerprint is used as classifier input; the completed
# similarity matrix is not appended.
final_features = x

print("Final node features:", final_features.shape)

# ---------------- Train/test split ----------------
# Random 80/20 split over nodes.
node_labels = torch.tensor(y, dtype=torch.float32)
N = node_labels.size(0)
perm = torch.randperm(N)
train_size = int(0.8 * N)
train_idx, test_idx = perm[:train_size], perm[train_size:]

X_train, y_train = final_features[train_idx], node_labels[train_idx]
X_test, y_test = final_features[test_idx], node_labels[test_idx]

# ---------------- Node Classifier ----------------
class NodeClassifier(nn.Module):
    """Three-layer MLP that emits one raw logit per input row (no final activation)."""

    def __init__(self, input_dim, hidden=256):
        super().__init__()
        stack = [
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return a 1-D tensor of logits, one per row of ``x``."""
        logits = self.net(x)
        return logits.view(-1)

# Classifier, optimiser and loss; the loss takes raw logits.
clf = NodeClassifier(input_dim=final_features.shape[1])
optimizer2 = torch.optim.Adam(params=clf.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

# ---------------- Train node classifier ----------------
# Full-batch training on the training split for 300 epochs.
print("=== TRAINING NODE CLASSIFIER ===")
for epoch in range(1, 301):
    optimizer2.zero_grad()
    logits = clf(X_train)
    loss = criterion(logits, y_train)
    loss.backward()
    optimizer2.step()
    

# ---------------- Evaluate ----------------
# Predict on the test split; a sigmoid probability above 0.5 counts as class 1.
with torch.no_grad():
    logits = clf(X_test)
    probs = torch.sigmoid(logits)
    y_pred = probs.gt(0.5).long().numpy()
    y_true = y_test.numpy()

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Binary classification metrics on the test split
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)

print("\n=== FINAL NODE LABEL METRICS ===")
print(f"Accuracy : {acc:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")


Epoch 001 | Loss: 0.014501 | RMSE: 0.171666 | R2: -1.2215 | Pearson: -0.0040 | Spearman: 0.0041
Epoch 010 | Loss: 0.013127 | RMSE: 0.114054 | R2: 0.0194 | Pearson: 0.1712 | Spearman: 0.1534
Epoch 020 | Loss: 0.009886 | RMSE: 0.097683 | R2: 0.2807 | Pearson: 0.5955 | Spearman: 0.5358
Epoch 030 | Loss: 0.006801 | RMSE: 0.081956 | R2: 0.4937 | Pearson: 0.7031 | Spearman: 0.6706
Epoch 040 | Loss: 0.004918 | RMSE: 0.070726 | R2: 0.6229 | Pearson: 0.7915 | Spearman: 0.7185
Epoch 050 | Loss: 0.004102 | RMSE: 0.065253 | R2: 0.6790 | Pearson: 0.8247 | Spearman: 0.7328
Epoch 060 | Loss: 0.003621 | RMSE: 0.062422 | R2: 0.7063 | Pearson: 0.8419 | Spearman: 0.7443
Epoch 070 | Loss: 0.003352 | RMSE: 0.059639 | R2: 0.7319 | Pearson: 0.8562 | Spearman: 0.7527
Epoch 080 | Loss: 0.003013 | RMSE: 0.057415 | R2: 0.7515 | Pearson: 0.8672 | Spearman: 0.7594
Epoch 090 | Loss: 0.002800 | RMSE: 0.056109 | R2: 0.7627 | Pearson: 0.8739 | Spearman: 0.7620
Epoch 100 | Loss: 0.002716 | RMSE: 0.054679 | R2: 0.7746 |