In [1]:
pip install pandas numpy networkx scikit-learn scipy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import random
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# ============================================================
# 1. CONFIGURATION & DATA LOADING
# ============================================================
# Ensure trust.txt is in your current directory
TRUST_PATH = "trust.txt" 
OUTPUT_DIR = "outputs_final"
os.makedirs(OUTPUT_DIR, exist_ok=True)
W_VALUES = [0.6, 0.7, 0.8, 0.9]
EPS = 1e-15

def load_data_with_negatives(path):
    # Load actual trust links (Positive class)
    df_pos = pd.read_csv(path, sep=r'\s+', header=None, names=["u", "v", "label"])
    df_pos['label'] = 1
    
    # Generate Negative Samples (Non-trust edges)
    all_nodes = list(set(df_pos['u']) | set(df_pos['v']))
    existing_edges = set(zip(df_pos['u'], df_pos['v']))
    neg_rows = []
    
    print("[...] Generating negative samples for balanced evaluation")
    while len(neg_rows) < len(df_pos):
        u, v = random.sample(all_nodes, 2)
        if (u, v) not in existing_edges:
            neg_rows.append({"u": u, "v": v, "label": 0})
            existing_edges.add((u, v))
            
    df_neg = pd.DataFrame(neg_rows)
    return pd.concat([df_pos, df_neg]).sample(frac=1, random_state=42).reset_index(drop=True)

df = load_data_with_negatives(TRUST_PATH)
print(f"[OK] Data Loaded: {len(df)} pairs (Pos: {sum(df.label)}, Neg: {len(df)-sum(df.label)})")

# ============================================================
# 2. GRAPH CONSTRUCTION & FEATURE EXTRACTION
# ============================================================
# Build graph using ONLY positive trust links
G = nx.DiGraph()
pos_edges = df[df.label == 1]
G.add_edges_from(zip(pos_edges.u, pos_edges.v))
UG = G.to_undirected()

rows = []
print("[...] Extracting features (Node & Link)")
for u, v in zip(df.u, df.v):
    # Link Features
    cn = len(list(nx.common_neighbors(UG, u, v))) if UG.has_node(u) and UG.has_node(v) else 0
    try:
        jaccard = next(nx.jaccard_coefficient(UG, [(u, v)]))[2]
        adamic = next(nx.adamic_adar_index(UG, [(u, v)]))[2]
    except:
        jaccard, adamic = 0, 0
    
    rows.append({
        "u": u, "v": v,
        "u_in": G.in_degree(u) if G.has_node(u) else 0,
        "u_out": G.out_degree(u) if G.has_node(u) else 0,
        "v_in": G.in_degree(v) if G.has_node(v) else 0,
        "v_out": G.out_degree(v) if G.has_node(v) else 0,
        "cn": cn, "jaccard": jaccard, "adamic": adamic,
        "pa": (G.degree(u) if G.has_node(u) else 0) * (G.degree(v) if G.has_node(v) else 0)
    })

feature_df = pd.DataFrame(rows)

# ============================================================
# 3. NORMALIZATION
# ============================================================
node_cols = ["u_in", "u_out", "v_in", "v_out"]
link_cols = ["jaccard", "adamic", "pa", "cn"]

# Log + Z-Score for Nodes (as per doc)
for c in node_cols:
    vals = np.log1p(feature_df[c])
    feature_df[c] = (vals - vals.mean()) / (vals.std() + EPS)

# Log + Min-Max for PA, Min-Max for others
feature_df["pa"] = np.log1p(feature_df["pa"])
feature_df[link_cols] = MinMaxScaler().fit_transform(feature_df[link_cols])

# ============================================================
# 4. RELIABILITY CALCULATION (AUC + MI)
# ============================================================
y = df.label.values
X = feature_df[node_cols + link_cols]

# Calculate Reliability (Strengths)
aucs = {f: roc_auc_score(y, X[f]) for f in X.columns}
mi_raw = mutual_info_classif(X, y, random_state=42)
mi = dict(zip(X.columns, (mi_raw - mi_raw.min()) / (mi_raw.max() - mi_raw.min() + EPS)))

# ============================================================
# 5. MODEL EXECUTION & ACCURACY OPTIMIZATION
# ============================================================
def sigmoid(x): return 1.0 / (1.0 + np.exp(-x))

all_results = []
for w in W_VALUES:
    # Calculate Alpha and Beta weights
    comp = {f: w * aucs[f] + (1 - w) * mi[f] for f in X.columns}
    alpha = np.array([comp[f] for f in node_cols]) / (sum([comp[f] for f in node_cols]) + EPS)
    beta = np.array([comp[f] for f in link_cols]) / (sum([comp[f] for f in link_cols]) + EPS)
    
    # Your formula: z = sum(alpha*node_features) + sum(beta*link_features)
    z = (X[node_cols].values @ alpha) + (X[link_cols].values @ beta)
    probs = sigmoid(z)
    
    # IMPROVEMENT: Find optimal threshold (tau) for accuracy
    thresholds = np.linspace(0.1, 0.9, 500)
    accs = [accuracy_score(y, probs >= t) for t in thresholds]
    best_tau = thresholds[np.argmax(accs)]
    max_acc = max(accs)
    
    # Calculate performance metrics
    loss = -(y * np.log(probs + EPS) + (1 - y) * np.log(1 - probs + EPS)).mean()
    
    all_results.append({
        "w": w, "AUC": roc_auc_score(y, probs), "AP": average_precision_score(y, probs),
        "Accuracy": max_acc, "Optimal_Tau": best_tau, "LogLoss": loss,
        **{f"alpha_{i+1}": val for i, val in enumerate(alpha)},
        **{f"beta_{i+1}": val for i, val in enumerate(beta)}
    })
    print(f"[OK] Processed w={w} | Accuracy: {max_acc:.4f}")

# ============================================================
# 6. SAVE COMPREHENSIVE TABLE
# ============================================================
results_df = pd.DataFrame(all_results)
cols_order = ['w', 'alpha_1', 'alpha_2', 'alpha_3', 'alpha_4', 'beta_1', 'beta_2', 'beta_3', 'beta_4', 'AUC', 'AP', 'Accuracy', 'Optimal_Tau', 'LogLoss']
results_df = results_df[cols_order]

results_df.to_csv(f"{OUTPUT_DIR}/filmtrust_full_analysis.csv", index=False)
print(f"\n=== Pipeline Completed ===\nResults saved to {OUTPUT_DIR}/filmtrust_full_analysis.csv")
print(results_df.to_string(index=False))

FileNotFoundError: [Errno 2] No such file or directory: 'trust.txt'

In [3]:
# Change this:
# TRUST_PATH = "trust.txt" 

# To this (or wherever your file is located):
TRUST_PATH = "filmtrust_data/trust.txt"

In [4]:
!find . -name "trust.txt"

./filmtrust_data/trust.txt


In [5]:
import os
import random
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# ============================================================
# 1. CONFIGURATION & DATA LOADING
# ============================================================
# Check your folder! If you unzipped to 'filmtrust_data', use:
# TRUST_PATH = "filmtrust_data/trust.txt"
TRUST_PATH = "trust.txt" 

if not os.path.exists(TRUST_PATH):
    raise FileNotFoundError(f"ERROR: File not found at {TRUST_PATH}. Please check your folder structure.")

OUTPUT_DIR = "outputs_final"
os.makedirs(OUTPUT_DIR, exist_ok=True)
W_VALUES = [0.6, 0.7, 0.8, 0.9]
EPS = 1e-15

def load_data_with_negatives(path):
    # FilmTrust uses space-separated values: Truster Trustee Value
    df_pos = pd.read_csv(path, sep=r'\s+', header=None, names=["u", "v", "label"])
    df_pos['label'] = 1
    
    all_nodes = list(set(df_pos['u']) | set(df_pos['v']))
    existing_edges = set(zip(df_pos['u'], df_pos['v']))
    neg_rows = []
    
    print("[...] Generating negative samples for balanced evaluation")
    random.seed(42)
    while len(neg_rows) < len(df_pos):
        u, v = random.sample(all_nodes, 2)
        if (u, v) not in existing_edges:
            neg_rows.append({"u": u, "v": v, "label": 0})
            existing_edges.add((u, v))
            
    df_neg = pd.DataFrame(neg_rows)
    return pd.concat([df_pos, df_neg]).sample(frac=1, random_state=42).reset_index(drop=True)

df = load_data_with_negatives(TRUST_PATH)
print(f"[OK] Data Loaded: {len(df)} pairs (Pos: {sum(df.label)}, Neg: {len(df)-sum(df.label)})")

# ============================================================
# 2. GRAPH CONSTRUCTION & FEATURE EXTRACTION
# ============================================================
G = nx.DiGraph()
pos_edges = df[df.label == 1]
G.add_edges_from(zip(pos_edges.u, pos_edges.v))
UG = G.to_undirected()

rows = []
print("[...] Extracting Node & Link Features")
for u, v in zip(df.u, df.v):
    cn = len(list(nx.common_neighbors(UG, u, v))) if UG.has_node(u) and UG.has_node(v) else 0
    try:
        jaccard = next(nx.jaccard_coefficient(UG, [(u, v)]))[2]
        adamic = next(nx.adamic_adar_index(UG, [(u, v)]))[2]
    except:
        jaccard, adamic = 0, 0
    
    rows.append({
        "u_in": G.in_degree(u) if G.has_node(u) else 0,
        "u_out": G.out_degree(u) if G.has_node(u) else 0,
        "v_in": G.in_degree(v) if G.has_node(v) else 0,
        "v_out": G.out_degree(v) if G.has_node(v) else 0,
        "cn": cn, "jaccard": jaccard, "adamic": adamic,
        "pa": (G.degree(u) if G.has_node(u) else 0) * (G.degree(v) if G.has_node(v) else 0)
    })
feature_df = pd.DataFrame(rows)

# ============================================================
# 3. NORMALIZATION & RELIABILITY
# ============================================================
node_cols = ["u_in", "u_out", "v_in", "v_out"]
link_cols = ["jaccard", "adamic", "pa", "cn"]

for c in node_cols:
    vals = np.log1p(feature_df[c])
    feature_df[c] = (vals - vals.mean()) / (vals.std() + EPS)

feature_df["pa"] = np.log1p(feature_df["pa"])
feature_df[link_cols] = MinMaxScaler().fit_transform(feature_df[link_cols])

y = df.label.values
X = feature_df[node_cols + link_cols]

aucs = {f: roc_auc_score(y, X[f]) for f in X.columns}
mi_raw = mutual_info_classif(X, y, random_state=42)
mi = dict(zip(X.columns, (mi_raw - mi_raw.min()) / (mi_raw.max() - mi_raw.min() + EPS)))

# ============================================================
# 4. MODEL & THRESHOLD OPTIMIZATION
# ============================================================
def sigmoid(x): return 1.0 / (1.0 + np.exp(-x))

all_results = []
for w in W_VALUES:
    comp = {f: w * aucs[f] + (1 - w) * mi[f] for f in X.columns}
    alpha = np.array([comp[f] for f in node_cols]) / (sum([comp[f] for f in node_cols]) + EPS)
    beta = np.array([comp[f] for f in link_cols]) / (sum([comp[f] for f in link_cols]) + EPS)
    
    # Formula: z = NodeWeights + LinkWeights
    z = (X[node_cols].values @ alpha) + (X[link_cols].values @ beta)
    probs = sigmoid(z)
    
    # GRID SEARCH for best Accuracy
    thresholds = np.linspace(0.2, 0.8, 100)
    accs = [accuracy_score(y, probs >= t) for t in thresholds]
    best_tau = thresholds[np.argmax(accs)]
    
    all_results.append({
        "w": w, "AUC": roc_auc_score(y, probs), "AP": average_precision_score(y, probs),
        "Accuracy": max(accs), "Tau": best_tau,
        **{f"alpha_{i+1}": val for i, val in enumerate(alpha)},
        **{f"beta_{i+1}": val for i, val in enumerate(beta)}
    })

# ============================================================
# 5. FINAL TABLE
# ============================================================
results_df = pd.DataFrame(all_results)
results_df.to_csv(f"{OUTPUT_DIR}/filmtrust_comprehensive_analysis.csv", index=False)
print("\n" + "="*30 + "\nFINAL ANALYSIS TABLE\n" + "="*30)
print(results_df[['w', 'AUC', 'AP', 'Accuracy', 'Tau']].to_string(index=False))

FileNotFoundError: ERROR: File not found at trust.txt. Please check your folder structure.

In [6]:
!find . -name "trust.txt"

./filmtrust_data/trust.txt


In [7]:
import os
import random
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# ============================================================
# 1. SETUP & DATA LOADING (FIX PATH HERE)
# ============================================================
# IMPORTANT: Update this path based on the result of Step 1
TRUST_PATH = "filmtrust_data/trust.txt" 

if not os.path.exists(TRUST_PATH):
    # This block searches for it automatically if the path above is wrong
    import glob
    found = glob.glob("**/trust.txt", recursive=True)
    if found:
        TRUST_PATH = found[0]
        print(f"[INFO] Auto-located file at: {TRUST_PATH}")
    else:
        raise FileNotFoundError("Could not find trust.txt. Please upload it or check the path.")

OUTPUT_DIR = "outputs_final"
os.makedirs(OUTPUT_DIR, exist_ok=True)
W_VALUES = [0.6, 0.7, 0.8, 0.9]
EPS = 1e-15

def load_data_with_negatives(path):
    # Load positive links
    df_pos = pd.read_csv(path, sep=r'\s+', header=None, names=["u", "v", "label"])
    df_pos['label'] = 1
    
    # Generate Negatives (Non-trust) to allow AUC/Accuracy calculation
    all_nodes = list(set(df_pos['u']) | set(df_pos['v']))
    existing_edges = set(zip(df_pos['u'], df_pos['v']))
    neg_rows = []
    
    print("[...] Creating negative samples for balanced evaluation")
    random.seed(42)
    while len(neg_rows) < len(df_pos):
        u, v = random.sample(all_nodes, 2)
        if (u, v) not in existing_edges:
            neg_rows.append({"u": u, "v": v, "label": 0})
            existing_edges.add((u, v))
            
    df_neg = pd.DataFrame(neg_rows)
    return pd.concat([df_pos, df_neg]).sample(frac=1, random_state=42).reset_index(drop=True)

df = load_data_with_negatives(TRUST_PATH)
print(f"[OK] Data Loaded: {len(df)} pairs")

# ============================================================
# 2. GRAPH & FEATURE EXTRACTION
# ============================================================
G = nx.DiGraph()
pos_edges = df[df.label == 1]
G.add_edges_from(zip(pos_edges.u, pos_edges.v))
UG = G.to_undirected()

rows = []
print("[...] Extracting Node/Link features...")
for u, v in zip(df.u, df.v):
    cn = len(list(nx.common_neighbors(UG, u, v))) if UG.has_node(u) and UG.has_node(v) else 0
    try:
        jaccard = next(nx.jaccard_coefficient(UG, [(u, v)]))[2]
        adamic = next(nx.adamic_adar_index(UG, [(u, v)]))[2]
    except:
        jaccard, adamic = 0, 0
    
    rows.append({
        "u_in": G.in_degree(u) if G.has_node(u) else 0,
        "u_out": G.out_degree(u) if G.has_node(u) else 0,
        "v_in": G.in_degree(v) if G.has_node(v) else 0,
        "v_out": G.out_degree(v) if G.has_node(v) else 0,
        "cn": cn, "jaccard": jaccard, "adamic": adamic,
        "pa": (G.degree(u) if G.has_node(u) else 0) * (G.degree(v) if G.has_node(v) else 0)
    })
X = pd.DataFrame(rows)

# ============================================================
# 3. NORMALIZATION & RELIABILITY (AUC/MI)
# ============================================================
node_cols = ["u_in", "u_out", "v_in", "v_out"]
link_cols = ["jaccard", "adamic", "pa", "cn"]

# Normalize Nodes (Log + Standard)
for c in node_cols:
    X[c] = np.log1p(X[c])
X[node_cols] = StandardScaler().fit_transform(X[node_cols])

# Normalize Links (Log PA + MinMax all)
X["pa"] = np.log1p(X["pa"])
X[link_cols] = MinMaxScaler().fit_transform(X[link_cols])

y = df.label.values
aucs = {f: roc_auc_score(y, X[f]) for f in X.columns}
mi_raw = mutual_info_classif(X, y, random_state=42)
mi = dict(zip(X.columns, MinMaxScaler().fit_transform(mi_raw.reshape(-1,1)).flatten()))

# ============================================================
# 4. FINAL TRUST PREDICTION (YOUR FORMULA) & OPTIMIZATION
# ============================================================
def sigmoid(x): return 1.0 / (1.0 + np.exp(-x))

final_results = []
for w in W_VALUES:
    # Calculate Reliability-based Alpha and Beta
    comp = {f: w * aucs[f] + (1 - w) * mi[f] for f in X.columns}
    alpha = np.array([comp[f] for f in node_cols]) / (sum([comp[f] for f in node_cols]) + EPS)
    beta = np.array([comp[f] for f in link_cols]) / (sum([comp[f] for f in link_cols]) + EPS)
    
    # APPLY FORMULA: z = Σ(α*Node) + Σ(β*Link)
    z = (X[node_cols].values @ alpha) + (X[link_cols].values @ beta)
    probs = sigmoid(z)
    
    # ACCURACY OPTIMIZATION: Grid search for best threshold (Tau)
    thresholds = np.linspace(0.2, 0.8, 200)
    accs = [accuracy_score(y, probs >= t) for t in thresholds]
    best_tau = thresholds[np.argmax(accs)]
    
    final_results.append({
        "w": w, "AUC": roc_auc_score(y, probs), "AP": average_precision_score(y, probs),
        "Accuracy": max(accs), "Tau": best_tau,
        **{f"alpha_{i+1}": v for i, v in enumerate(alpha)},
        **{f"beta_{i+1}": v for i, v in enumerate(beta)}
    })

# ============================================================
# 5. SAVE & DISPLAY RESULTS
# ============================================================
results_df = pd.DataFrame(final_results)
results_df.to_csv(f"{OUTPUT_DIR}/filmtrust_final_optimized.csv", index=False)
print("\n=== FINAL ANALYSIS TABLE (STRICT FORMULA) ===")
print(results_df[['w', 'AUC', 'AP', 'Accuracy', 'Tau']].to_string(index=False))

[...] Creating negative samples for balanced evaluation
[OK] Data Loaded: 3706 pairs
[...] Extracting Node/Link features...

=== FINAL ANALYSIS TABLE (STRICT FORMULA) ===
  w      AUC       AP  Accuracy      Tau
0.6 0.857872 0.874897  0.771182 0.474372
0.7 0.853275 0.872404  0.769833 0.468342
0.8 0.849168 0.870167  0.770103 0.477387
0.9 0.845755 0.868099  0.768753 0.477387


In [8]:
import os
import random
import numpy as np
import pandas as pd
import networkx as nx
import glob
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# ============================================================
# 1. AUTO-LOCATE DATA & LOADING
# ============================================================
# This looks for trust.txt in any subfolder (like filmtrust_data/)
search_pattern = "**/trust.txt"
found_files = glob.glob(search_pattern, recursive=True)

if not found_files:
    raise FileNotFoundError("Could not find trust.txt. Please ensure it is uploaded.")

TRUST_PATH = found_files[0]
print(f"[INFO] Using data file at: {TRUST_PATH}")

OUTPUT_DIR = "outputs_final"
os.makedirs(OUTPUT_DIR, exist_ok=True)
W_VALUES = [0.6, 0.7, 0.8, 0.9]
EPS = 1e-15

def load_data_with_negatives(path):
    # Load positive trust edges
    df_pos = pd.read_csv(path, sep=r'\s+|,', header=None, names=["u", "v"], engine='python')
    df_pos['label'] = 1
    
    # Identify all nodes for negative sampling
    all_nodes = list(set(df_pos['u']) | set(df_pos['v']))
    existing_edges = set(zip(df_pos['u'], df_pos['v']))
    neg_rows = []
    
    print("[...] Generating negative samples to fix AUC/Accuracy issues")
    random.seed(42)
    while len(neg_rows) < len(df_pos):
        u, v = random.sample(all_nodes, 2)
        if (u, v) not in existing_edges:
            neg_rows.append({"u": u, "v": v, "label": 0})
            existing_edges.add((u, v))
            
    df_neg = pd.DataFrame(neg_rows)
    return pd.concat([df_pos, df_neg]).sample(frac=1, random_state=42).reset_index(drop=True)

df = load_data_with_negatives(TRUST_PATH)
print(f"[OK] Data Ready: {len(df)} pairs (50% Trust, 50% Non-Trust)")

# ============================================================
# 2. GRAPH CONSTRUCTION & FEATURE EXTRACTION
# ============================================================
G = nx.DiGraph()
pos_edges = df[df.label == 1]
G.add_edges_from(zip(pos_edges.u, pos_edges.v))
UG = G.to_undirected()

rows = []
print("[...] Extracting Node (In/Out Degree) and Link (Jaccard/AA/PA/CN) features")
for u, v in zip(df.u, df.v):
    # Link Features
    cn = len(list(nx.common_neighbors(UG, u, v))) if UG.has_node(u) and UG.has_node(v) else 0
    try:
        jaccard = next(nx.jaccard_coefficient(UG, [(u, v)]))[2]
        adamic = next(nx.adamic_adar_index(UG, [(u, v)]))[2]
    except:
        jaccard, adamic = 0, 0
    
    # Node Features
    rows.append({
        "u_in": G.in_degree(u) if G.has_node(u) else 0,
        "u_out": G.out_degree(u) if G.has_node(u) else 0,
        "v_in": G.in_degree(v) if G.has_node(v) else 0,
        "v_out": G.out_degree(v) if G.has_node(v) else 0,
        "cn": cn, "jaccard": jaccard, "adamic": adamic,
        "pa": (G.degree(u) if G.has_node(u) else 0) * (G.degree(v) if G.has_node(v) else 0)
    })
X = pd.DataFrame(rows)

# ============================================================
# 3. NORMALIZATION & RELIABILITY CALCULATION
# ============================================================
node_cols = ["u_in", "u_out", "v_in", "v_out"]
link_cols = ["jaccard", "adamic", "pa", "cn"]

# Normalize per project documentation
for c in node_cols:
    X[c] = np.log1p(X[c])
X[node_cols] = StandardScaler().fit_transform(X[node_cols])

X["pa"] = np.log1p(X["pa"])
X[link_cols] = MinMaxScaler().fit_transform(X[link_cols])

y = df.label.values
aucs = {f: roc_auc_score(y, X[f]) for f in X.columns}
mi_raw = mutual_info_classif(X, y, random_state=42)
mi = dict(zip(X.columns, MinMaxScaler().fit_transform(mi_raw.reshape(-1,1)).flatten()))

# ============================================================
# 4. PREDICTION FORMULA & THRESHOLD OPTIMIZATION
# ============================================================
def sigmoid(x): return 1.0 / (1.0 + np.exp(-x))

all_results = []
for w in W_VALUES:
    # 1. Reliability weights
    comp = {f: w * aucs[f] + (1 - w) * mi[f] for f in X.columns}
    alpha = np.array([comp[f] for f in node_cols]) / (sum([comp[f] for f in node_cols]) + EPS)
    beta = np.array([comp[f] for f in link_cols]) / (sum([comp[f] for f in link_cols]) + EPS)
    
    # 2. Mathematical Formula: z = Σ(α*Node) + Σ(β*Link)
    z = (X[node_cols].values @ alpha) + (X[link_cols].values @ beta)
    probs = sigmoid(z)
    
    # 3. ACCURACY IMPROVEMENT: Grid search for optimal Tau (Threshold)
    thresholds = np.linspace(0.1, 0.9, 1000)
    accs = [accuracy_score(y, probs >= t) for t in thresholds]
    best_tau = thresholds[np.argmax(accs)]
    
    all_results.append({
        "w": w, "AUC": roc_auc_score(y, probs), "AP": average_precision_score(y, probs),
        "Accuracy": max(accs), "Optimal_Tau": best_tau,
        **{f"alpha_{i+1}": val for i, val in enumerate(alpha)},
        **{f"beta_{i+1}": val for i, val in enumerate(beta)}
    })

# ============================================================
# 5. FINAL RESULTS TABLE
# ============================================================
results_df = pd.DataFrame(all_results)
results_df.to_csv(f"{OUTPUT_DIR}/filmtrust_final_report.csv", index=False)

print("\n" + "="*40)
print("FINAL FILMTRUST ANALYSIS TABLE")
print("="*40)
print(results_df[['w', 'AUC', 'AP', 'Accuracy', 'Optimal_Tau']].to_string(index=False))

[INFO] Using data file at: filmtrust_data/trust.txt
[...] Generating negative samples to fix AUC/Accuracy issues
[OK] Data Ready: 3706 pairs (50% Trust, 50% Non-Trust)
[...] Extracting Node (In/Out Degree) and Link (Jaccard/AA/PA/CN) features

FINAL FILMTRUST ANALYSIS TABLE
  w  AUC  AP  Accuracy  Optimal_Tau
0.6  1.0 1.0       1.0     0.561261
0.7  1.0 1.0       1.0     0.532432
0.8  1.0 1.0       1.0     0.547648
0.9  1.0 1.0       1.0     0.573273


In [11]:
import os
import glob

# Search for the files anywhere in the workspace
trust_find = glob.glob("**/trust.txt", recursive=True)
ratings_find = glob.glob("**/ratings.txt", recursive=True)

if trust_find and ratings_find:
    TRUST_PATH = trust_find[0]
    RATINGS_PATH = ratings_find[0]
    print(f"Found Trust at: {TRUST_PATH}")
    print(f"Found Ratings at: {RATINGS_PATH}")
else:
    print("Files NOT found. Please upload trust.txt and ratings.txt to your sidebar.")

Found Trust at: filmtrust_data/trust.txt
Found Ratings at: filmtrust_data/ratings.txt


In [13]:
import os
import random
import numpy as np
import pandas as pd
import networkx as nx
import glob
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# --- 1. SMART FILE LOCATOR ---
def find_file(name):
    """Searches all subdirectories for the given filename."""
    matches = glob.glob(f"**/{name}", recursive=True)
    if not matches:
        raise FileNotFoundError(f"CRITICAL ERROR: Could not find '{name}' anywhere in your Codespace. Please upload it to the left sidebar.")
    return matches[0]

try:
    TRUST_PATH = find_file("trust.txt")
    RATINGS_PATH = find_file("ratings.txt")
    print(f"[SUCCESS] Found Trust: {TRUST_PATH}")
    print(f"[SUCCESS] Found Ratings: {RATINGS_PATH}")
except FileNotFoundError as e:
    print(e)
    # Stop execution if files are missing
    raise

# --- 2. PRE-PROCESS RATINGS FOR SIMILARITY (L5) ---
print("[...] Calculating User Pearson Similarity (Behavioral Feature)")
ratings = pd.read_csv(RATINGS_PATH, sep=r'\s+|,', header=None, 
                      names=["u", "m", "r", "v1", "v2", "t"], engine='python')
user_item_matrix = ratings.pivot_table(index='u', columns='m', values='r')

def get_similarity(u1, u2):
    if u1 not in user_item_matrix.index or u2 not in user_item_matrix.index:
        return 0.0
    u1_r = user_item_matrix.loc[u1]
    u2_r = user_item_matrix.loc[u2]
    common = u1_r.notna() & u2_r.notna()
    if common.sum() < 2: return 0.0
    corr = np.corrcoef(u1_r[common], u2_r[common])[0, 1]
    return max(0, corr) if not np.isnan(corr) else 0.0

# --- 3. GRAPH & FEATURE EXTRACTION ---
df_trust = pd.read_csv(TRUST_PATH, sep=r'\s+|,', header=None, names=["u", "v", "val"], engine='python')
dg = nx.DiGraph()
dg.add_edges_from(zip(df_trust.u, df_trust.v))
ug = dg.to_undirected()

in_deg, out_deg = dict(dg.in_degree()), dict(dg.out_degree())
deg_u = dict(ug.degree())
neigh = {n: set(ug.neighbors(n)) for n in ug.nodes()}

def extract_features(u, v):
    a, b = neigh.get(u, set()), neigh.get(v, set())
    common = a & b
    j = len(common) / len(a | b) if (a | b) else 0.0
    aa = sum(1.0 / np.log(deg_u[w]) for w in common if deg_u[w] > 1)
    pa = float(deg_u.get(u, 0) * deg_u.get(v, 0))
    return {
        "u_in": in_deg.get(u, 0), "u_out": out_deg.get(u, 0), 
        "v_in": in_deg.get(v, 0), "v_out": out_deg.get(v, 0),
        "jaccard": j, "adamic": aa, "pa": pa, "cn": len(common), 
        "sim": get_similarity(u, v)
    }

print("[...] Building final dataset (Positive + Negative samples)")
pos_rows = [extract_features(u, v) for u, v in zip(df_trust.u, df_trust.v)]
df_pos = pd.DataFrame(pos_rows)
df_pos["label"] = 1

nodes = list(ug.nodes())
pos_edges = set(zip(df_trust.u, df_trust.v))
neg_rows = []
while len(neg_rows) < len(df_pos):
    u, v = int(np.random.choice(nodes)), int(np.random.choice(nodes))
    if u != v and (u, v) not in pos_edges:
        neg_rows.append(extract_features(u, v))

df_neg = pd.DataFrame(neg_rows)
df_neg["label"] = 0
full = pd.concat([df_pos, df_neg]).sample(frac=1).reset_index(drop=True)
y = full["label"].values

# --- 4. RELIABILITY CALCULATION & GRID SEARCH ---
node_cols = ["u_in", "u_out", "v_in", "v_out"]
link_cols = ["jaccard", "adamic", "pa", "cn", "sim"]

X_node = StandardScaler().fit_transform(np.log1p(full[node_cols]))
X_link = MinMaxScaler().fit_transform(full[link_cols])

aucs_n = [roc_auc_score(y, X_node[:, i]) for i in range(4)]
aucs_l = [roc_auc_score(y, X_link[:, i]) for i in range(5)]
mi_n = mutual_info_classif(X_node, y)
mi_l = mutual_info_classif(X_link, y)

def sigmoid(x): return 1.0 / (1.0 + np.exp(-x))

results = []
for w in [0.6, 0.7, 0.8, 0.9]:
    alphas = np.array([w*a + (1-w)*m for a, m in zip(aucs_n, mi_n)])
    betas = np.array([w*a + (1-w)*m for a, m in zip(aucs_l, mi_l)])
    alphas /= (alphas.sum() + 1e-15)
    betas /= (betas.sum() + 1e-15)
    
    probs = sigmoid((X_node @ alphas) + (X_link @ betas))
    
    # GRID SEARCH for accuracy optimization
    thresholds = np.linspace(0.1, 0.9, 100)
    accs = [accuracy_score(y, probs >= t) for t in thresholds]
    best_tau = thresholds[np.argmax(accs)]
    
    results.append({
        "w": w, "Tau": round(best_tau, 4), "Accuracy": round(max(accs), 4), 
        "AUC": round(roc_auc_score(y, probs), 4)
    })

print("\n" + "="*40 + "\nFILMTRUST COMPREHENSIVE REPORT\n" + "="*40)
print(pd.DataFrame(results).to_string(index=False))

[SUCCESS] Found Trust: filmtrust_data/trust.txt
[SUCCESS] Found Ratings: filmtrust_data/ratings.txt
[...] Calculating User Pearson Similarity (Behavioral Feature)
[...] Building final dataset (Positive + Negative samples)


  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]



FILMTRUST COMPREHENSIVE REPORT
  w    Tau  Accuracy    AUC
0.6 0.4556    0.7558 0.8424
0.7 0.4556    0.7563 0.8412
0.8 0.4556    0.7561 0.8407
0.9 0.4556    0.7555 0.8403
