In [1]:
# ================== ONE CELL: C-simple + FP→Z + Truth ==================
# If needed (Colab): !pip -q install numpy pandas scipy scikit-learn torch
import os, re, random, hashlib, numpy as np, pandas as pd

from scipy.stats import pearsonr, spearmanr
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import torch, torch.nn as nn

# ---------------- CONFIG ----------------
# Similarity table on disk: first column is y, then N formula columns (S).
CSV_PATH = r"C:\Users\pc\Desktop\30_1000_base.csv"
SEED = 7
# Embedding dim for spectral Z and student.
DIM = 128
# Number of semantic mini-worlds (fingerprint).
M_PROBES = 256
# Used only if the CSV has no missing entries (to simulate partial observation).
OBS_FRACTION = 0.10
# Fraction of observed pairs kept for leak-free eval of completion.
HOLDOUT_FRACTION = 0.10
# Number of pairs to sample for correlation sanity checks.
PAIR_EVAL = 20000

# GNN (C-simple)
EPOCHS_C = 3000
BATCH_EDGES_C = 40000
LR_C = 1e-3
APPNP_K = 10
APPNP_ALPHA = 0.1
# Keep 1.0 for pure regression; 1.5–2.0 to sharpen propagation later.
EDGE_TEMP = 1.0
# Block size for full kernel prediction.
BLOCK_PRED = 128

# Student (FP→Z)
EPOCHS_STUDENT = 40
LR_STUDENT = 2e-3
# Pairwise cosine samples per batch.
PAIR_SAMPLES = 2048
# Weight for pairwise loss vs vector MSE.
PAIR_LOSS_W = 0.5
BATCH_STUDENT = 512

# Truth head
EPOCHS_TRUTH = 20
LR_TRUTH = 1e-3
BATCH_TRUTH = 512

# Saving
SAVE_ARTIFACTS = True
OUT_DIR = "./"

# ---------------- Repro & Device ----------------
# Seed the Python, NumPy and PyTorch generators with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

# ---------------- Load CSV ----------------
# Validate with explicit exceptions: `assert` statements are stripped by
# `python -O`, which would silently disable these checks.
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"File not found: {CSV_PATH}")
df = pd.read_csv(CSV_PATH)
N  = df.shape[0]
formulas = list(df.columns[1:])
if len(formulas) != N:
    raise ValueError("Expected N rows and N formula columns (1..N).")
y = df.iloc[:, 0].to_numpy().astype(np.float32)
S_raw = df.iloc[:, 1:].to_numpy(dtype=float)  # may include NaN for unobserved pairs

# Ensure symmetry (even if sparse) but keep NaNs only where BOTH (i,j) and
# (j,i) are missing. A plain 0.5*(S_raw + S_raw.T) would turn a pair that is
# known on one side only into NaN, discarding an observed value.
S_t = S_raw.T
S = np.where(
    np.isnan(S_raw),
    S_t,                                                  # (i,j) missing: fall back to (j,i)
    np.where(np.isnan(S_t), S_raw, 0.5 * (S_raw + S_t)),  # one-sided value, or the mean
)
diag = np.eye(N, dtype=bool)
S[diag] = 1.0
# Clip known entries into [0,1]
mask_known = ~np.isnan(S)
S[mask_known] = np.clip(S[mask_known], 0.0, 1.0)
# Binarised copy of the first CSV column, thresholded at 0.5.
y_bin = (y > 0.5).astype(np.float32)
print(f"N={N} | y∈[0,1] | S(min,max) among known={np.nanmin(S):.3f}/{np.nanmax(S):.3f}")

# ---------------- Build observed/unobserved/holdout pairs ----------------
def upper_pairs(N):
    """Return an array with one row ``[i, j]`` for every index pair ``i < j < N``."""
    rows, cols = np.triu_indices(N, k=1)
    return np.column_stack((rows, cols))

all_pairs_u = upper_pairs(N)

# If the CSV is dense (no NaNs), simulate a partial observation set
if not np.isnan(S).any():
    total = len(all_pairs_u)
    m_obs = max(1, int(OBS_FRACTION * total))
    idx = np.random.RandomState(SEED).choice(total, size=m_obs, replace=False)
    obs_pairs_all = all_pairs_u[idx]
    # float32 here as well, so both branches produce the same dtype.
    obs_vals_all  = S[obs_pairs_all[:, 0], obs_pairs_all[:, 1]].astype(np.float32)
else:
    # Use only pairs with known values (i<j)
    Kmask = (~np.isnan(S)) & (~np.eye(N, dtype=bool))
    I, J = np.where(np.triu(Kmask, k=1))
    obs_pairs_all = np.stack([I, J], axis=1)
    obs_vals_all  = S[I, J].astype(np.float32)

# Split observed into train-observed vs holdout-observed for leak-free completion eval
rng = np.random.RandomState(SEED)
perm = rng.permutation(len(obs_pairs_all))
m_hold = max(1, int(HOLDOUT_FRACTION * len(obs_pairs_all)))
hold_idx = perm[:m_hold]; train_obs_idx = perm[m_hold:]
holdout_pairs = obs_pairs_all[hold_idx]
holdout_true  = obs_vals_all[hold_idx]
obs_pairs     = obs_pairs_all[train_obs_idx]
obs_vals      = obs_vals_all[train_obs_idx]

# Unobserved = all remaining upper pairs not in obs_pairs_all.
# obs_set is kept as a tuple set for membership tests; the unobserved pairs
# themselves are selected with a boolean (N, N) mask instead of a Python loop
# over every pair. Row order follows all_pairs_u, and the result is always
# 2-D (shape (0, 2) when nothing is unobserved).
obs_set = set(map(tuple, obs_pairs_all.tolist()))
is_observed = np.zeros((N, N), dtype=bool)
is_observed[obs_pairs_all[:, 0], obs_pairs_all[:, 1]] = True
unobs_pairs = all_pairs_u[~is_observed[all_pairs_u[:, 0], all_pairs_u[:, 1]]].astype(np.int64)

print(f"Observed (train) pairs={len(obs_pairs)} | Holdout (pairs)={len(holdout_pairs)} | Unobserved={len(unobs_pairs)}")

# ---------------- Propositional Parser & Robust FP (unseen atoms handled) ----------------
# Surface spellings of each connective, mapped to a space-padded keyword.
OP_MAP = {"→":" IMP ", "⇒":" IMP ", "=>":" IMP ", "->":" IMP ",
          "↔":" IFF ", "<=>":" IFF ", "<->":" IFF ",
          "⊑":" SUB ",  # treat as IMP
          "⊓":" AND ", "∧":" AND ", "&&":" AND ",
          "⊔":" OR  ", "∨":" OR  ", "||":" OR  ",
          "¬":" NOT ", "~":" NOT ", "!":" NOT "}
BIN_OPS, UNARY_OPS = {"AND","OR","IMP","IFF","SUB"}, {"NOT"}
# A token is a run of word characters or a single parenthesis; everything
# else (whitespace, stray punctuation) is dropped by the lexer.
TOKEN_RE = re.compile(r"[A-Za-z0-9_]+|[()]")


def norm_text(s):
    """Return ``str(s)`` with every operator spelling in OP_MAP replaced by its keyword.

    Spellings are substituted longest-first. Replacing in dict order would
    rewrite the "=>" inside "<=>" (and the "->" inside "<->") first, leaving a
    stray "<" and turning every ASCII biconditional into an implication.
    """
    s = str(s)
    for spelling in sorted(OP_MAP, key=len, reverse=True):
        s = s.replace(spelling, OP_MAP[spelling])
    return s


def lex(s):
    """Split a formula into keyword, atom and parenthesis tokens."""
    return TOKEN_RE.findall(norm_text(s))


def is_atom(t):
    """Return True when token *t* is neither an operator keyword nor a parenthesis."""
    return t not in BIN_OPS|UNARY_OPS|{"(",")"}

class Parser:
    """Precedence-climbing parser over a list of formula tokens.

    Produces nested tuples: ``("ATOM", name)``, ``("UN", op, child)`` and
    ``("BIN", op, lhs, rhs)``. A missing operand is filled with the
    placeholder atom ``"x"``. Tokens left over after the first complete
    expression are not consumed.
    """

    # Binding strength of each binary operator; a larger value binds tighter.
    PREC = {"IFF":1,"IMP":2,"SUB":2,"OR":3,"AND":4}
    # Operators that group to the right (a IMP b IMP c == a IMP (b IMP c)).
    RIGHT = {"IMP","IFF","SUB"}

    def __init__(self, toks):
        self.toks = toks
        self.i = 0  # index of the next unread token

    def peek(self):
        """Return the next token without consuming it, or None at end of input."""
        return self.toks[self.i] if self.i < len(self.toks) else None

    def pop(self):
        """Consume and return the next token (None, with no advance, at end of input)."""
        t = self.peek()
        if t is not None:
            self.i += 1
        return t

    def parse(self):
        """Parse one expression starting at the current position."""
        return self.expr(0)

    def expr(self, minp):
        """Parse a binary expression whose operators all have precedence >= *minp*."""
        node = self.unary()
        while True:
            op = self.peek()
            if op not in BIN_OPS:
                break
            prec = self.PREC.get(op, 0)
            if prec < minp:
                break
            self.pop()
            # Right-associative operators let the right operand reuse the same
            # precedence level; left-associative ones require a tighter one.
            nextp = prec if op in self.RIGHT else prec + 1
            rhs = self.expr(nextp)
            node = ("BIN", op, node, rhs)
        return node

    def unary(self):
        """Parse a NOT-prefixed operand, a parenthesised expression, or an atom.

        Raises:
            AssertionError: if a parenthesised group is not closed by ``)``.
        """
        t = self.peek()
        if t in UNARY_OPS:
            self.pop()
            c = self.unary()
            return ("UN", t, c)
        if t == "(":
            self.pop()
            n = self.expr(0)
            # Consume the closing parenthesis outside of an `assert`: under
            # `python -O` an assert (and the pop inside it) is removed, which
            # would leave ")" unread and truncate the parse. AssertionError is
            # raised explicitly so the exception type is unchanged.
            closing = self.pop()
            if closing != ")":
                raise AssertionError("Missing ')'")
            return n
        a = self.pop()
        return ("ATOM", a if a is not None else "x")

def parse_formula(s):
    """Parse formula text *s* into an AST, falling back to ``("ATOM", "x")``.

    The fallback is deliberate best-effort: any parsing error yields the
    placeholder atom instead of propagating. Only ``Exception`` subclasses are
    caught, so KeyboardInterrupt and SystemExit are no longer swallowed.
    """
    try:
        return Parser(lex(s)).parse()
    except Exception:
        return ("ATOM", "x")

def atoms_in(node, acc=None):
    """Collect the atom names that occur in AST *node*.

    Names are added to *acc* in place when it is given (a fresh set is created
    otherwise), and that same set is returned. Nodes of an unrecognised kind
    contribute nothing.
    """
    found = set() if acc is None else acc
    kind = node[0]
    if kind == "ATOM":
        found.add(node[1])
    elif kind == "UN":
        atoms_in(node[2], found)
    elif kind == "BIN":
        for child in (node[2], node[3]):
            atoms_in(child, found)
    return found

def depth(node):
    """Return the height of AST *node*; an atom (or unrecognised node) counts as 1."""
    kind = node[0]
    if kind == "UN":
        return depth(node[2]) + 1
    if kind == "BIN":
        left_height = depth(node[2])
        right_height = depth(node[3])
        return max(left_height, right_height) + 1
    # "ATOM" and any unknown node kind are leaves.
    return 1

# --- Deterministic hashing for unseen atoms (probe-consistent) ---
def _u64_from_str(s: str) -> int:
    h = hashlib.blake2b(s.encode('utf-8'), digest_size=8).digest()
    return int.from_bytes(h, 'big')

def bernoulli_from_name(atom: str, probe_idx: int, p: float, seed: int) -> bool:
    """Return a repeatable pseudo-random truth value for *atom* in probe *probe_idx*.

    The string ``"atom|probe_idx|seed"`` is hashed, the low 53 bits are scaled
    into [0, 1), and the result is compared with *p*; the same arguments
    therefore always give the same answer.
    """
    key = f"{atom}|{probe_idx}|{seed}"
    low_bits = _u64_from_str(key) & ((1 << 53) - 1)
    uniform = low_bits / float(1 << 53)
    return uniform < p

class ProbeEnv:
    """Truth assignment for one probe that can also answer for atoms missing from it.

    Atoms present in *base_env* use the stored value. Any other atom gets a
    value from ``bernoulli_from_name`` (keyed by atom name, probe index and
    seed), which is memoised in ``self.cache``.
    """

    def __init__(self, base_env: dict, probe_idx: int, bias_p: float, seed: int):
        self.base, self.m = base_env, probe_idx
        self.p, self.seed = float(bias_p), int(seed)
        self.cache = {}

    def get(self, atom: str) -> bool:
        """Return the truth value of *atom* under this probe."""
        known = self.base
        if atom in known:
            return bool(known[atom])
        try:
            return self.cache[atom]
        except KeyError:
            pass
        # First request for an atom outside the base assignment: derive and remember it.
        value = bernoulli_from_name(atom, self.m, self.p, self.seed)
        self.cache[atom] = value
        return value

def eval_ast(node, env_obj):
    """Evaluate AST *node* to a bool, reading atom values through ``env_obj.get(name)``.

    Every unary node is treated as negation. "IMP" and "SUB" are both material
    implication. Unknown node kinds and unknown binary operators evaluate to
    False.
    """
    kind = node[0]
    if kind == "ATOM":
        return bool(env_obj.get(node[1]))
    if kind == "UN":
        _, _, child = node
        return not eval_ast(child, env_obj)
    if kind != "BIN":
        return False

    _, op, left, right = node
    # Both operands are always evaluated, whatever the operator.
    lhs = eval_ast(left, env_obj)
    rhs = eval_ast(right, env_obj)
    implies = (not lhs) or rhs
    outcomes = {
        "AND": lhs and rhs,
        "OR": lhs or rhs,
        "IMP": implies,
        "SUB": implies,
        "IFF": lhs == rhs,
    }
    return outcomes.get(op, False)

# ---------------- Build semantic fingerprint (FP) ----------------
# Parse every formula column header once and collect the atom vocabulary.
asts = [parse_formula(s) for s in formulas]
all_atoms = sorted(set().union(*[atoms_in(t) for t in asts]))
A = len(all_atoms)
print(f"#Atoms found: {A}")

# probes (half bias 0.3, half 0.7)
rng = np.random.default_rng(SEED)
biases = np.concatenate([np.full(M_PROBES//2, 0.3), np.full(M_PROBES - M_PROBES//2, 0.7)])
assignments = []
for p in biases:
    vals = rng.random(A) < p
    env = {a: bool(v) for a,v in zip(all_atoms, vals)}  # known atoms only
    assignments.append(env)

# One ProbeEnv per probe, built once. It depends only on the probe, so
# constructing it inside the per-formula loop repeated the same work N times.
# Values for atoms outside the base assignment are a deterministic function of
# (atom, probe, seed), so sharing the per-probe cache across formulas gives
# the same results.
probe_envs = [
    ProbeEnv(base_env, probe_idx=m_i, bias_p=biases[m_i], seed=SEED)
    for m_i, base_env in enumerate(assignments)
]

# Truth matrix T: N x M_PROBES with unseen atoms handled via ProbeEnv
T_mat = np.zeros((N, M_PROBES), dtype=np.float32)
for i,ast in enumerate(asts):
    for m_i,env_obj in enumerate(probe_envs):
        T_mat[i,m_i] = 1.0 if eval_ast(ast, env_obj) else 0.0

# Structural features
def op_counts(toks):
    """Return ``(n_and, n_or, n_not, n_imp, n_iff)`` for *toks*; "SUB" is counted with "IMP"."""
    n_and = toks.count("AND")
    n_or = toks.count("OR")
    n_not = toks.count("NOT")
    n_imp = toks.count("IMP") + toks.count("SUB")
    n_iff = toks.count("IFF")
    return n_and, n_or, n_not, n_imp, n_iff
# One row of structural statistics per formula: distinct atom count, AST depth,
# the five operator counts, and the token count.
struct_rows = []
for formula_text, tree in zip(formulas, asts):
    tokens = lex(formula_text)
    struct_rows.append([
        len(atoms_in(tree, set())),
        depth(tree),
        *op_counts(tokens),
        len(tokens),
    ])
STRUCT = np.array(struct_rows, dtype=np.float32)

# Final FP: probe truth values followed by the structural columns.
FP = np.hstack((T_mat, STRUCT)).astype(np.float32)
print("Fingerprint shape:", FP.shape)

# ---------------- FP preprocessing for models ----------------
# For GNN encoder features (lowered dimension for stability)
# Standardise the fingerprint, then reduce it with whitened PCA.
sc_fp_gnn = StandardScaler().fit(FP)
FP_std_g  = sc_fp_gnn.transform(FP).astype(np.float32)
# PCA cannot keep more components than min(n_samples, n_features). Bounding by
# the feature count alone fails whenever there are fewer than 256 formulas.
n_pca_gnn = min(256, FP_std_g.shape[0], FP_std_g.shape[1])
pca_gnn   = PCA(n_components=n_pca_gnn, whiten=True, random_state=SEED).fit(FP_std_g)
FP_low    = pca_gnn.transform(FP_std_g).astype(np.float32)


Device: cpu
N=1000 | y∈[0,1] | S(min,max) among known=0.000/1.000
Observed (train) pairs=44955 | Holdout (pairs)=4995 | Unobserved=449550
#Atoms found: 30
Fingerprint shape: (1000, 264)


In [None]:
# ---------------- PyG GNN for Edge Prediction ----------------
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import random
import numpy as np
import torch

import torch.nn.functional as F

SEED = 7

# Seed Python's `random`, NumPy and PyTorch (CPU) with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Ask PyTorch to use deterministic algorithm implementations.
torch.use_deterministic_algorithms(True)

# Node features (N, d)
# All tensors are created on `device`, the same device the model is moved to;
# leaving them on the CPU makes the forward pass fail when CUDA is selected.

# Node features (N, d)
x = torch.tensor(FP_low, dtype=torch.float32, device=device)

# Training edges (observed)
# NOTE(review): each observed pair appears once, as (i, j) with i < j. If the
# conv layers treat edge_index as directed, reverse edges may need to be
# added for symmetric message passing — confirm.
edge_index = torch.tensor(obs_pairs.T, dtype=torch.long, device=device)  # (2, E)
edge_label = torch.tensor(obs_vals, dtype=torch.float32, device=device)

# Holdout edges (for validation)
val_edge_index = torch.tensor(holdout_pairs.T, dtype=torch.long, device=device)
val_edge_label = torch.tensor(holdout_true, dtype=torch.float32, device=device)

data = Data(x=x, edge_index=edge_index)

# -------- GNN Encoder  --------
class GNNEncoder(torch.nn.Module):
    """Two SAGEConv layers with a residual connection around the second one.

    Because the output is ``relu(conv2(x1)) + x1``, *out_channels* has to be
    addable to *hidden_channels* (the model in this file passes the same value
    for both).
    """

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # -1 presumably lets PyG infer the input width on first use — confirm.
        self.conv1 = SAGEConv(-1, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)
        # Created once here rather than re-instantiated on every forward call.
        # It is still NOT applied in forward(), exactly as before, so outputs
        # are unchanged; the attribute is kept for code that reads it.
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, edge_index):
        """Return node embeddings for features *x* over the graph *edge_index*."""
        x1 = self.conv1(x, edge_index).relu()
        x2 = self.conv2(x1, edge_index).relu() + x1  # residual around layer 2
        return x2

# -------- Edge Decoder --------
class EdgeDecoder(torch.nn.Module):
    """MLP that scores node pairs from their embeddings.

    Each pair is described by the element-wise absolute difference and the
    element-wise product of its two embeddings, so swapping the endpoints
    yields the same features. The final sigmoid keeps every score in (0, 1).
    """

    def __init__(self, hidden_channels):
        super().__init__()
        half_width = hidden_channels // 2
        layers = [
            nn.Linear(2 * hidden_channels, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, half_width),
            nn.ReLU(),
            nn.Linear(half_width, 1),
            nn.Sigmoid(),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, z, edge_idx):
        """Return a flat tensor with one score per pair in *edge_idx*.

        *edge_idx* is unpacked into two index tensors that select rows of *z*.
        """
        src, dst = edge_idx
        z_src, z_dst = z[src], z[dst]
        pair_features = torch.cat([(z_src - z_dst).abs(), z_dst * z_src], dim=-1)
        scores = self.mlp(pair_features)
        return scores.view(-1)

# -------- Full Model --------
class Model(torch.nn.Module):
    """Encoder/decoder pair: embed the nodes with a GNN, then score the requested pairs."""

    def __init__(self, hidden_channels):
        super().__init__()
        # The same width is used for the encoder's hidden and output layers.
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x, edge_index, edge_label_index):
        """Encode over *edge_index* and return decoder scores for *edge_label_index*."""
        node_embeddings = self.encoder(x, edge_index)
        scores = self.decoder(node_embeddings, edge_label_index)
        return scores

# Build the 32-channel model on the selected device and optimise it with Adam.
model = Model(hidden_channels=32)
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)


# -------- Training --------

def train():
    """Run one full-batch optimisation step and return the training MSE as a float.

    The observed edges serve both as the message-passing graph and as the
    pairs being scored against ``edge_label``.
    """
    model.train()
    optimizer.zero_grad()
    predictions = model(data.x, data.edge_index, data.edge_index)
    mse = F.mse_loss(predictions, edge_label)
    mse.backward()
    optimizer.step()
    return mse.item()

from sklearn.metrics import r2_score

@torch.no_grad()
def test():
    """Score the holdout edges and return ``(rmse, r2, pearson, spearman)``.

    Message passing uses the training edges (``data.edge_index``); only the
    scored pairs come from the holdout set.
    """
    model.eval()

    # Predict on holdout edges
    pred = model(data.x, data.edge_index, val_edge_index)
    pred = pred.clamp(0, 1)  # keep in [0,1]

    target = val_edge_label

    # Convert to numpy. Tensor.numpy() only works for CPU tensors, so copy to
    # host memory first; this is a no-op when the tensors already live there.
    pred_np = pred.cpu().numpy()
    target_np = target.cpu().numpy()

    # --- Regression metrics ---
    rmse = F.mse_loss(pred, target).sqrt().item()
    r2 = r2_score(target_np, pred_np)

    # --- Correlation metrics ---
    pearson_corr, _ = pearsonr(target_np, pred_np)
    spearman_corr, _ = spearmanr(target_np, pred_np)

    return rmse, r2, pearson_corr, spearman_corr

# ---------------- TRAINING LOOP ----------------
NUM_EPOCHS = 300   # number of full-batch training steps
LOG_EVERY = 10     # report holdout metrics every this many epochs

for epoch in range(1, NUM_EPOCHS + 1):
    loss = train()

    # Holdout evaluation (including the Pearson/Spearman computation) is only
    # needed on epochs that are reported, so it is skipped otherwise instead
    # of being run and thrown away. The final epoch is always evaluated so the
    # metric variables hold end-of-training values after the loop.
    if epoch != 1 and epoch % LOG_EVERY != 0 and epoch != NUM_EPOCHS:
        continue

    rmse, r2, pearson_corr, spearman_corr = test()
    print(
        f"Epoch {epoch:03d} | "
        f"Loss: {loss:.6f} | "
        f"RMSE: {rmse:.6f} | "
        f"R2: {r2:.4f} | "
        f"Pearson: {pearson_corr:.4f} | "
        f"Spearman: {spearman_corr:.4f}"
    )



Epoch 001 | Loss: 0.013800 | RMSE: 0.116596 | R2: -0.0259 | Pearson: -0.0040 | Spearman: 0.0007
Epoch 010 | Loss: 0.013275 | RMSE: 0.115192 | R2: -0.0014 | Pearson: -0.0079 | Spearman: -0.0062
Epoch 020 | Loss: 0.013165 | RMSE: 0.115232 | R2: -0.0021 | Pearson: -0.0107 | Spearman: -0.0068
Epoch 030 | Loss: 0.012893 | RMSE: 0.115075 | R2: 0.0007 | Pearson: 0.0453 | Spearman: 0.0324
Epoch 040 | Loss: 0.012049 | RMSE: 0.112907 | R2: 0.0380 | Pearson: 0.1952 | Spearman: 0.1589
Epoch 050 | Loss: 0.009217 | RMSE: 0.099768 | R2: 0.2489 | Pearson: 0.5009 | Spearman: 0.4683
Epoch 060 | Loss: 0.005311 | RMSE: 0.074935 | R2: 0.5763 | Pearson: 0.7612 | Spearman: 0.6967
Epoch 070 | Loss: 0.004049 | RMSE: 0.065473 | R2: 0.6765 | Pearson: 0.8228 | Spearman: 0.7278
Epoch 080 | Loss: 0.003494 | RMSE: 0.061482 | R2: 0.7147 | Pearson: 0.8455 | Spearman: 0.7473
Epoch 090 | Loss: 0.003185 | RMSE: 0.059505 | R2: 0.7328 | Pearson: 0.8561 | Spearman: 0.7532
Epoch 100 | Loss: 0.002976 | RMSE: 0.058393 | R2: 0.