In [2]:
pip install pandas numpy networkx scikit-learn scipy

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.stats import ttest_rel, wilcoxon, f_oneway

In [4]:
# Location of the trust edge list and the directory that receives result CSVs.
TRUST_PATH = "filmtrust_data/trust.txt" 
OUTPUT_DIR = "outputs_filmtrust"
# Create the output directory now; exist_ok=True keeps re-runs from failing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Mixing weights tried by the model loop: each feature's composite score is
# w * AUC + (1 - w) * min-max-normalised mutual information.
W_VALUES = [0.6, 0.7, 0.8, 0.9]
# Tiny constant added to weight-sum denominators and log arguments so that
# neither a division by zero nor log(0) can occur.
EPS = 1e-15

In [5]:
def load_filmtrust(path):
    """Read a space-separated edge list and binarise its third column.

    Each line of the file is parsed as three fields, named ``u``, ``v`` and
    ``label`` (per the FilmTrust layout: truster id, trustee id, trust value).

    Parameters
    ----------
    path : str or path-like
        Location of the edge-list file (no header row).

    Returns
    -------
    pandas.DataFrame
        Columns ``u``, ``v``, ``label`` where ``label`` is 1 when the raw
        value is >= 1 and 0 otherwise.
    """
    edges = pd.read_csv(path, sep=' ', header=None, names=["u", "v", "label"])
    # Any raw value of at least 1 counts as a trust link.
    is_trusted = edges["label"].ge(1)
    return edges.assign(label=is_trusted.astype(int))

# Load the labelled edge list and keep the labels as a NumPy vector for the
# metric functions used later.
df = load_filmtrust(TRUST_PATH)
y = df["label"].values
n_edges = len(df)
print(f"[OK] Loaded FilmTrust | edges: {n_edges}")

[OK] Loaded FilmTrust | edges: 1853


In [6]:
# Directed trust graph with one edge u -> v per row of the edge list.
# Feeding the zipped id columns to add_edges_from inserts every edge in one
# call and avoids building a pandas Series per row as DataFrame.iterrows does.
G = nx.DiGraph()
G.add_edges_from(zip(df["u"], df["v"]))
# Undirected copy of the same edges, used by the neighbourhood-based features
# (common neighbours, Jaccard, Adamic-Adar).
UG = G.to_undirected()

In [7]:
# Build one feature record per labelled pair (u, v) in df.
# G is the directed graph and UG its undirected copy; both are built from this
# same edge list in the preceding cell.
# NOTE(review): every pair scored here is itself an edge of G, so the features
# are computed on a graph that already contains the link being labelled.
rows = []
for _, r in df.iterrows():
    # Connectivity features
    # Number of shared neighbours in UG; 0 when either endpoint is not a node.
    cn = len(list(nx.common_neighbors(UG, r.u, r.v))) if UG.has_node(r.u) and UG.has_node(r.v) else 0
    
    # Both calls are given a single pair; the code takes the first item they
    # yield and reads its third element as the score.
    try:
        jaccard = next(nx.jaccard_coefficient(UG, [(r.u, r.v)]))[2]
        adamic = next(nx.adamic_adar_index(UG, [(r.u, r.v)]))[2]
    except (nx.NetworkXError, StopIteration):
        # If either computation fails, both scores fall back to 0.
        jaccard, adamic = 0, 0

    rows.append({
        "u": r.u, "v": r.v, "label": r.label,
        # Directed in/out degree of each endpoint in G.
        "u_in": G.in_degree(r.u), "u_out": G.out_degree(r.u),
        "v_in": G.in_degree(r.v), "v_out": G.out_degree(r.v),
        "cn": cn, "jaccard": jaccard, "adamic": adamic,
        # Preferential-attachment style score: product of the endpoints' degrees in G.
        "pa": G.degree(r.u) * G.degree(r.v)
    })

# One row per pair; any missing feature value is replaced with 0.
feature_df = pd.DataFrame(rows).fillna(0)

In [8]:
# Feature groups: per-node degree features and per-pair link features.
node_cols = ["u_in", "u_out", "v_in", "v_out"]
link_cols = ["jaccard", "adamic", "pa", "cn"]

# Scaling
# NOTE: feature_df is overwritten in place, so re-running this cell applies the
# log and scaling steps a second time to already-scaled values.

# Degree features: log1p to compress the heavy tail, then z-score.
node_logged = np.log1p(feature_df[node_cols])
feature_df[node_cols] = StandardScaler().fit_transform(node_logged)

# Link features: only "pa" is log-compressed; all four are then mapped to [0, 1].
feature_df["pa"] = np.log1p(feature_df["pa"])
link_scaled = MinMaxScaler().fit_transform(feature_df[link_cols])
feature_df[link_cols] = link_scaled

# Model matrix: node features first, link features second.
feature_order = node_cols + link_cols
X = feature_df[feature_order]

# Reliability logic
# Each feature gets two reliability scores: its single-feature ROC AUC against
# y, and its mutual information with y (min-max normalised afterwards).
has_both_classes = np.unique(y).size > 1
if has_both_classes:
    auc_scores = {}
    for col in X.columns:
        auc_scores[col] = roc_auc_score(y, X[col])
    mi_vals = mutual_info_classif(X, y, random_state=0)
else:
    # If FilmTrust only has positive trust, we use default weights
    auc_scores = dict.fromkeys(X.columns, 1.0)
    mi_vals = np.ones(len(X.columns))

# Rescale the MI values to [0, 1] across features and key them by column name.
mi_scaled = MinMaxScaler().fit_transform(mi_vals.reshape(-1, 1)).ravel()
mi_norm = dict(zip(X.columns, mi_scaled))

In [9]:
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive expression evaluates ``exp(-x)``, which overflows (with a
    RuntimeWarning) for large negative ``x``. Here ``exp`` is only ever taken
    of a non-positive number, so it cannot overflow.

    Parameters
    ----------
    x : float or array-like
        Input score(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid with the same shape as ``x`` (a scalar for
        scalar input).
    """
    x = np.asarray(x, dtype=float)
    e = np.exp(-np.abs(x))
    # For x >= 0 this is the textbook form; for x < 0 the algebraically equal
    # exp(x) / (1 + exp(x)) is used.
    out = np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Unwrap 0-d results so scalar callers still receive a scalar.
    return out[()] if out.ndim == 0 else out

# Score every pair with a reliability-weighted sum of its features, once per
# mixing weight w, and record AUC / AP / mean log-loss.

# ROC AUC and AP are only meaningful when y contains both classes. When it does
# not, they are recorded as NaN ("undefined") rather than 0, because 0 would
# read as a genuinely measured worst-case score in the results file.
has_both_classes = len(np.unique(y)) > 1

metrics = []
for w in W_VALUES:
    # Composite reliability of each feature: blend of its AUC and normalised MI.
    comp = {f: w * auc_scores[f] + (1 - w) * mi_norm[f] for f in X.columns}

    # Normalise within each group so node weights and link weights each sum to
    # ~1. The denominators are computed once per w instead of once per feature.
    node_total = sum(comp[f] for f in node_cols) + EPS
    link_total = sum(comp[f] for f in link_cols) + EPS
    alpha = {f: comp[f] / node_total for f in node_cols}
    beta = {f: comp[f] / link_total for f in link_cols}

    # Weight vectors are built in the same column order as the matrices they multiply.
    z = (X[node_cols].values @ np.array([alpha[f] for f in node_cols]) +
         X[link_cols].values @ np.array([beta[f] for f in link_cols]))

    probs = sigmoid(z)
    # Per-pair binary cross-entropy; EPS keeps the logs finite.
    loss = -(y * np.log(probs + EPS) + (1 - y) * np.log(1 - probs + EPS))

    if has_both_classes:
        auc = roc_auc_score(y, probs)
        ap = average_precision_score(y, probs)
    else:
        auc = ap = float("nan")

    metrics.append([w, auc, ap, loss.mean()])

# Save
pd.DataFrame(metrics, columns=["w","AUC","AP","LogLoss"]).to_csv(f"{OUTPUT_DIR}/ft_results.csv", index=False)
print("=== PIPELINE COMPLETED ===")

=== PIPELINE COMPLETED ===


In [None]:
# ============================================================
# FilmTrust Trust Prediction Pipeline (With Negative Sampling)
# ============================================================

import os
import random
import numpy as np
import pandas as pd
import networkx as nx

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.stats import ttest_rel, wilcoxon, f_oneway

# ============================================================
# CONFIG
# ============================================================
# Location of the trust edge list and the directory that receives result CSVs.
TRUST_PATH = "filmtrust_data/trust.txt"  # Path from your unzip step
OUTPUT_DIR = "outputs_filmtrust"
# Create the output directory now; exist_ok=True keeps re-runs from failing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Mixing weights tried by the model loop: each feature's composite score is
# w * AUC + (1 - w) * min-max-normalised mutual information.
W_VALUES = [0.6, 0.7, 0.8, 0.9]
# Tiny constant added to weight-sum denominators and log arguments so that
# neither a division by zero nor log(0) can occur.
EPS = 1e-15

# ============================================================
# 1. LOAD DATA + NEGATIVE SAMPLING (Fixes AUC=0)
# ============================================================
def load_filmtrust_with_negatives(path, seed=42):
    """Load the trust edges as positives and add sampled non-edges as negatives.

    Every row of the file becomes a positive example (``label`` = 1; the raw
    third column is discarded). Negatives are ordered pairs of distinct nodes
    that do not appear in the file, drawn without repetition.

    Parameters
    ----------
    path : str or path-like
        Space-separated edge list with three columns and no header.
    seed : int or None, default 42
        Seed for both the negative sampling and the final shuffle, so repeated
        runs give the same dataset. Pass ``None`` for non-deterministic output.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with columns ``u``, ``v``, ``label``. It holds as many
        negatives as positives, or fewer when the graph does not have that
        many free ordered pairs.
    """
    # Load actual trust links (Positive Class)
    # FilmTrust is space-separated: Truster Trustee Value
    df_pos = pd.read_csv(path, sep=' ', header=None, names=["u", "v", "label"])
    df_pos['label'] = 1  # All existing links represent trust

    # Sorted so the sampling population has a fixed order for a given file.
    all_nodes = sorted(set(df_pos['u']) | set(df_pos['v']))
    existing_edges = set(zip(df_pos['u'], df_pos['v']))

    # Cap the target at the number of ordered non-self pairs that are still
    # free; otherwise the rejection loop below could never terminate on a
    # dense graph.
    n_nodes = len(all_nodes)
    n_taken = sum(1 for a, b in existing_edges if a != b)
    n_free = n_nodes * (n_nodes - 1) - n_taken
    n_target = min(len(df_pos), n_free)

    # Private RNG: reproducible without touching the global random state.
    rng = random.Random(seed)

    # Generate Negative Samples (Edges that do NOT exist)
    neg_rows = []
    print("[...] Generating negative samples for balanced evaluation")
    while len(neg_rows) < n_target:
        u, v = rng.sample(all_nodes, 2)
        if (u, v) not in existing_edges:
            neg_rows.append({"u": u, "v": v, "label": 0})
            # Remember the pair so the same negative is not drawn twice.
            existing_edges.add((u, v))

    # Skip the negative frame when it is empty so concat never sees a frame
    # without columns.
    frames = [df_pos]
    if neg_rows:
        frames.append(pd.DataFrame(neg_rows))

    # Combine and shuffle
    return pd.concat(frames).sample(frac=1, random_state=seed).reset_index(drop=True)

# Build the labelled dataset (positives + sampled negatives) and report the
# class balance.
df = load_filmtrust_with_negatives(TRUST_PATH)
y = df["label"].values
n_pos = sum(y)
n_neg = len(y) - n_pos
print(f"[OK] Data Loaded | Positives: {n_pos} | Negatives: {n_neg}")

# ============================================================
# 2. BUILD GRAPH
# ============================================================
G = nx.DiGraph()
# Only build the graph using Positive edges for feature calculation
pos_edges = df[df.label == 1]
# Feeding the zipped id columns to add_edges_from inserts every edge in one
# call and avoids building a pandas Series per row as DataFrame.iterrows does.
G.add_edges_from(zip(pos_edges["u"], pos_edges["v"]))

# Undirected copy of the same edges, used by the neighbourhood-based features.
UG = G.to_undirected()

# ============================================================
# 3. FEATURE EXTRACTION
# ============================================================
# Build one feature record per labelled pair (positive or negative).
# G holds only the positive edges; UG is its undirected copy.
# NOTE(review): positive pairs are themselves edges of G, so their features are
# computed on a graph that already contains the link being labelled.
rows = []
print("[...] Extracting features for all pairs")
for _, r in df.iterrows():
    u, v = r.u, r.v
    u_known = G.has_node(u)
    v_known = G.has_node(v)

    # Link features with safety checks for node existence
    cn = len(list(nx.common_neighbors(UG, u, v))) if UG.has_node(u) and UG.has_node(v) else 0

    try:
        # Each call is given a single pair; the first yielded item's third
        # element is read as the score.
        jaccard = next(nx.jaccard_coefficient(UG, [(u, v)]))[2]
        adamic = next(nx.adamic_adar_index(UG, [(u, v)]))[2]
    except (nx.NetworkXException, KeyError, ZeroDivisionError, StopIteration):
        # Best-effort fallback for expected failures only (unknown node, empty
        # iterator, degenerate neighbourhood). A bare `except:` would also
        # swallow KeyboardInterrupt and genuine programming errors.
        jaccard, adamic = 0, 0

    # Total degree of each endpoint in G, 0 for a node G does not contain.
    u_deg = G.degree(u) if u_known else 0
    v_deg = G.degree(v) if v_known else 0

    rows.append({
        "u": u, "v": v, "label": r.label,
        "u_in":  G.in_degree(u) if u_known else 0,
        "u_out": G.out_degree(u) if u_known else 0,
        "v_in":  G.in_degree(v) if v_known else 0,
        "v_out": G.out_degree(v) if v_known else 0,
        "cn":    cn,
        "jaccard": jaccard,
        "adamic":  adamic,
        # Preferential-attachment style score: product of the endpoints' degrees.
        "pa":    u_deg * v_deg
    })

# One row per pair; any missing feature value is replaced with 0.
feature_df = pd.DataFrame(rows).fillna(0)

# ============================================================
# 4. NORMALIZATION
# ============================================================
# Feature groups: per-node degree features and per-pair link features.
node_cols = ["u_in", "u_out", "v_in", "v_out"]
link_cols = ["jaccard", "adamic", "pa", "cn"]

# NOTE: feature_df is overwritten in place, so re-running this section applies
# the log and scaling steps a second time to already-scaled values.

# Degree features: log1p to compress the heavy tail, then z-score.
node_logged = np.log1p(feature_df[node_cols])
feature_df[node_cols] = StandardScaler().fit_transform(node_logged)

# Link features: only "pa" is log-compressed; all four are then mapped to [0, 1].
feature_df["pa"] = np.log1p(feature_df["pa"])
link_scaled = MinMaxScaler().fit_transform(feature_df[link_cols])
feature_df[link_cols] = link_scaled

# Model matrix: node features first, link features second.
feature_order = node_cols + link_cols
X = feature_df[feature_order]

# ============================================================
# 5. FEATURE RELIABILITY (AUC + MI)
# ============================================================
# Single-feature ROC AUC of every column against the labels.
auc_scores = {}
for col in X.columns:
    auc_scores[col] = roc_auc_score(y, X[col])

# Mutual information of every column with the labels, rescaled to [0, 1]
# across features and keyed by column name.
mi_vals = mutual_info_classif(X, y, random_state=0)
mi_scaled = MinMaxScaler().fit_transform(mi_vals.reshape(-1, 1)).ravel()
mi_norm = dict(zip(X.columns, mi_scaled))

# ============================================================
# 6. TRUST MODEL + METRICS
# ============================================================
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive expression evaluates ``exp(-x)``, which overflows (with a
    RuntimeWarning) for large negative ``x``. Here ``exp`` is only ever taken
    of a non-positive number, so it cannot overflow.

    Parameters
    ----------
    x : float or array-like
        Input score(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid with the same shape as ``x`` (a scalar for
        scalar input).
    """
    x = np.asarray(x, dtype=float)
    e = np.exp(-np.abs(x))
    # For x >= 0 this is the textbook form; for x < 0 the algebraically equal
    # exp(x) / (1 + exp(x)) is used.
    out = np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Unwrap 0-d results so scalar callers still receive a scalar.
    return out[()] if out.ndim == 0 else out

results = []
per_edge_losses = {}

# The feature matrices do not depend on w, so extract them once.
node_matrix = X[node_cols].values
link_matrix = X[link_cols].values

for w in W_VALUES:
    # Combine AUC and MI for composite reliability
    comp = {}
    for f in X.columns:
        comp[f] = w * auc_scores[f] + (1 - w) * mi_norm[f]

    # Normalize weights for Alpha (nodes) and Beta (links): each group is
    # divided by its own total so the weights in a group sum to ~1.
    node_total = sum(comp[f] for f in node_cols) + EPS
    link_total = sum(comp[f] for f in link_cols) + EPS
    alpha = {f: comp[f] / node_total for f in node_cols}
    beta = {f: comp[f] / link_total for f in link_cols}

    # Weighted sum of features
    alpha_vec = np.array(list(alpha.values()))
    beta_vec = np.array(list(beta.values()))
    z = (node_matrix @ alpha_vec +
         link_matrix @ beta_vec)

    probs = sigmoid(z)
    # Per-pair binary cross-entropy; EPS keeps the logs finite.
    pos_term = y * np.log(probs + EPS)
    neg_term = (1 - y) * np.log(1 - probs + EPS)
    loss = -(pos_term + neg_term)
    per_edge_losses[w] = loss

    auc = roc_auc_score(y, probs)
    ap = average_precision_score(y, probs)
    results.append([w, auc, ap, loss.mean()])
    print(f"[OK] w={w} processed")

# ============================================================
# 7. SAVE RESULTS
# ============================================================
results_df = pd.DataFrame(results, columns=["w", "AUC", "AP", "LogLoss"])
results_df.to_csv(f"{OUTPUT_DIR}/ft_final_metrics.csv", index=False)


print(f"\n=== FILMTRUST PIPELINE COMPLETED ===")
print(f"Results saved to {OUTPUT_DIR}/ft_final_metrics.csv")

[...] Generating negative samples for balanced evaluation
[OK] Data Loaded | Positives: 1853 | Negatives: 1853
[...] Extracting features for all pairs
[OK] w=0.6 processed
[OK] w=0.7 processed
[OK] w=0.8 processed
[OK] w=0.9 processed

=== FILMTRUST PIPELINE COMPLETED ===
Results saved to outputs_filmtrust/ft_final_metrics.csv


In [12]:
# ============================================================
# 6. TRUST MODEL + METRICS (FIXED WITH ALPHA/BETA TRACKING)
# ============================================================
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive expression evaluates ``exp(-x)``, which overflows (with a
    RuntimeWarning) for large negative ``x``. Here ``exp`` is only ever taken
    of a non-positive number, so it cannot overflow.

    Parameters
    ----------
    x : float or array-like
        Input score(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid with the same shape as ``x`` (a scalar for
        scalar input).
    """
    x = np.asarray(x, dtype=float)
    e = np.exp(-np.abs(x))
    # For x >= 0 this is the textbook form; for x < 0 the algebraically equal
    # exp(x) / (1 + exp(x)) is used.
    out = np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Unwrap 0-d results so scalar callers still receive a scalar.
    return out[()] if out.ndim == 0 else out

# This cell relies on X, y, auc_scores, mi_norm, node_cols and link_cols
# already being defined by the pipeline cell that precedes it.
metrics = []
alpha_beta_rows = [] # Initialize the list to store weights

# The feature matrices do not depend on w, so extract them once.
node_matrix = X[node_cols].values
link_matrix = X[link_cols].values

for w in W_VALUES:
    # Combine AUC and MI for composite reliability
    comp = {}
    for f in X.columns:
        comp[f] = w * auc_scores[f] + (1 - w) * mi_norm[f]

    # Separate and Normalize weights: each group is divided by its own total
    # so the weights in a group sum to ~1.
    node_total = sum(comp[f] for f in node_cols) + EPS
    link_total = sum(comp[f] for f in link_cols) + EPS
    alpha = {f: comp[f] / node_total for f in node_cols}
    beta = {f: comp[f] / link_total for f in link_cols}

    # Weighted sum calculation
    alpha_vec = np.array(list(alpha.values()))
    beta_vec = np.array(list(beta.values()))
    z = (node_matrix @ alpha_vec +
         link_matrix @ beta_vec)

    probs = sigmoid(z)
    # Per-pair binary cross-entropy; EPS keeps the logs finite.
    pos_term = y * np.log(probs + EPS)
    neg_term = (1 - y) * np.log(1 - probs + EPS)
    loss = -(pos_term + neg_term)

    # Save Performance Metrics
    auc = roc_auc_score(y, probs)
    ap = average_precision_score(y, probs)
    metrics.append([w, auc, ap, loss.mean()])

    # Save Alpha and Beta Weights for this specific w
    weight_row = {"w": w}
    weight_row.update((f"alpha_{name}", val) for name, val in alpha.items())
    weight_row.update((f"beta_{name}", val) for name, val in beta.items())
    alpha_beta_rows.append(weight_row)
    print(f"[OK] w={w} weights captured")

# ============================================================
# 7. SAVE ALL RESULTS
# ============================================================
# Save performance (AUC, AP, Loss)
metrics_df = pd.DataFrame(metrics, columns=["w", "AUC", "AP", "LogLoss"])
metrics_df.to_csv(f"{OUTPUT_DIR}/ft_final_metrics.csv", index=False)

# Save Weights (The Alphas and Betas you want to see)
weights_df = pd.DataFrame(alpha_beta_rows)
weights_df.to_csv(f"{OUTPUT_DIR}/ft_feature_weights.csv", index=False)

print(f"\n=== SUCCESS: Weights saved to {OUTPUT_DIR}/ft_feature_weights.csv ===")

[OK] w=0.6 weights captured
[OK] w=0.7 weights captured
[OK] w=0.8 weights captured
[OK] w=0.9 weights captured

=== SUCCESS: Weights saved to outputs_filmtrust/ft_feature_weights.csv ===
