In [None]:
import numpy as np
import pandas as pd

# Module-level NumPy random generator with a fixed seed (42) so that results
# drawn from it are repeatable across runs; make_pairs_and_labels draws from it
# in its random negative-sampling fallback.
rng = np.random.default_rng(42)

def load_feature_table(path):
    """Read a CSV feature table and split it into row names and a matrix.

    The first column is treated as the row identifier; every remaining
    column is treated as a numeric feature.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or buffer).

    Returns:
        A ``(names, X)`` tuple where ``names`` is a list of whitespace-stripped
        strings taken from the first column and ``X`` is a ``float32`` NumPy
        array built from the remaining columns.
    """
    table = pd.read_csv(path, engine='c')

    # Column 0 holds the identifiers; normalise them to stripped strings.
    id_column = table.iloc[:, 0]
    names = id_column.astype(str).str.strip().tolist()

    # Everything after column 0 is the feature matrix.
    feature_columns = table.iloc[:, 1:]
    X = feature_columns.astype(np.float32).to_numpy()

    return names, X

def make_pairs_and_labels(circ_names, mir_names, cmi_df, circ_feats, mir_feats, hard_neg_k=10):
    """Build positive and negative (circRNA, miRNA) index pairs with labels.

    Positive pairs come from the rows of ``cmi_df`` whose stripped
    ``'circRNA'`` and ``'miRNA'`` values both appear in ``circ_names`` /
    ``mir_names``. Negatives are chosen in two stages:

    1. Hard negatives: for each positive pair's circRNA, the miRNAs are ranked
       by cosine similarity between the feature rows, and the first of the top
       ``hard_neg_k`` that is neither a known positive nor already chosen is
       taken (at most one per positive pair).
    2. Random fallback: uniformly random pairs drawn from the module-level
       ``rng`` until there are as many negatives as positives, or until
       ``5 * len(pos_pairs)`` attempts have been made. If the attempt budget
       runs out, fewer negatives than positives are returned.

    Args:
        circ_names: Sequence of circRNA names; position defines the index.
        mir_names: Sequence of miRNA names; position defines the index.
        cmi_df: DataFrame with ``'circRNA'`` and ``'miRNA'`` columns listing
            known interactions.
        circ_feats: 2-D array whose row ``i`` is the feature vector of
            ``circ_names[i]``.
        mir_feats: 2-D array whose row ``j`` is the feature vector of
            ``mir_names[j]``; must have the same number of columns as
            ``circ_feats`` for the dot product to be defined.
        hard_neg_k: How many top-ranked miRNAs to consider per positive pair
            when mining hard negatives.

    Returns:
        Tuple ``(pos_pairs, neg_pairs, pair_circ_names, pair_mir_names, y)``:
        two lists of ``(circ_index, mir_index)`` tuples of Python ints, two
        NumPy arrays holding the names for positives followed by negatives,
        and an integer label array (1 for positives, 0 for negatives).
    """
    circ_idx = {name: i for i, name in enumerate(circ_names)}
    mir_idx = {name: i for i, name in enumerate(mir_names)}

    mir_stripped = cmi_df['miRNA'].astype(str).str.strip()
    circ_stripped = cmi_df['circRNA'].astype(str).str.strip()
    # Positional boolean mask: keep only rows where both names are known.
    valid = (mir_stripped.isin(list(mir_idx)) & circ_stripped.isin(list(circ_idx))).to_numpy()
    pos_pairs = [
        (circ_idx[circ], mir_idx[mir])
        for circ, mir in zip(circ_stripped[valid], mir_stripped[valid])
    ]

    pos_set = set(pos_pairs)
    num_negatives = len(pos_pairs)

    # Row-normalise so a plain dot product yields cosine similarity; the small
    # epsilon guards against division by zero for all-zero rows.
    circ_norm = circ_feats / (np.linalg.norm(circ_feats, axis=1, keepdims=True) + 1e-8)
    mir_norm = mir_feats / (np.linalg.norm(mir_feats, axis=1, keepdims=True) + 1e-8)

    neg_pairs = []
    neg_set = set()  # mirrors neg_pairs for O(1) duplicate checks
    top_k_by_circ = {}  # circ index -> top-k miRNA indices, computed once

    for ci, _ in pos_pairs:
        candidates = top_k_by_circ.get(ci)
        if candidates is None:
            sims = np.dot(mir_norm, circ_norm[ci])
            # Descending similarity; .tolist() yields plain Python ints so the
            # negative tuples have the same element type as the positive ones.
            candidates = np.argsort(sims)[::-1][:hard_neg_k].tolist()
            top_k_by_circ[ci] = candidates
        for mi in candidates:
            pair = (ci, mi)
            if pair not in pos_set and pair not in neg_set:
                neg_set.add(pair)
                neg_pairs.append(pair)
                break  # at most one hard negative per positive pair
        if len(neg_pairs) >= num_negatives:
            break

    # Top up with random pairs when hard-negative mining came up short.
    attempts = 0
    max_attempts = num_negatives * 5
    while len(neg_pairs) < num_negatives and attempts < max_attempts:
        ci = int(rng.integers(0, len(circ_names)))
        mi = int(rng.integers(0, len(mir_names)))
        pair = (ci, mi)
        if pair not in pos_set and pair not in neg_set:
            neg_set.add(pair)
            neg_pairs.append(pair)
        attempts += 1

    all_pairs = pos_pairs + neg_pairs
    y_list = [1] * len(pos_pairs) + [0] * len(neg_pairs)
    pair_circ_names = [circ_names[ci] for ci, _ in all_pairs]
    pair_mir_names = [mir_names[mi] for _, mi in all_pairs]
    return (pos_pairs, neg_pairs, np.array(pair_circ_names), np.array(pair_mir_names), np.array(y_list, dtype=int))
