# Factual Recall vs Multi-Hop Reasoning: Motif Profile Comparison

**Core question**: Do single-hop factual recall and multi-hop reasoning leave different structural fingerprints in their attribution graphs?

We compare:
- **Factual recall**: `michael-clt-clean` — *"Fact: Michael Jordan plays the sport of"* (pure single-hop entity→attribute lookup)
- **Multi-hop reasoning**: `capital-state-dallas` — *"Fact: the capital of the state containing Dallas is"* (requires Dallas→Texas→Austin)

If chain enrichment (021C) drops for single-hop while the feedforward loop (030T) stays enriched in both, and fan-out (021D) flips from anti-enriched (multi-hop) to enriched (single-hop), that's the "structural fingerprint" story.

In [None]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
sys.path.insert(0, os.path.abspath('..'))

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from pathlib import Path

from src.graph_loader import load_attribution_graph, graph_summary
from src.motif_census import (
    compute_motif_census, motif_frequencies, enriched_motifs,
    TRIAD_LABELS, CONNECTED_TRIAD_INDICES,
    MOTIF_FAN_IN, MOTIF_FAN_OUT, MOTIF_CHAIN, MOTIF_FFL, MOTIF_CYCLE,
)
from src.null_model import generate_configuration_null, NullModelResult
from src.comparison import build_task_profile, pairwise_comparison

# Directory that receives every figure saved by this notebook; the path is
# relative to the parent of the current working directory and is created if
# it does not exist yet.
FIGURES_DIR = Path('..', 'figures')
FIGURES_DIR.mkdir(exist_ok=True)

# Global seaborn appearance shared by all plots below.
sns.set_style('whitegrid')
sns.set_context('notebook', font_scale=1.1)

print('Setup complete.')

## 1. Load and Explore Both Graphs

In [None]:
# Load both graphs and print a short summary of each.
DATA_DIR = Path('..') / 'data' / 'raw'


def _report_graph(banner, graph):
    """Print *banner*, then the summary of *graph*, and return that summary.

    The scalar entries of the summary are printed one per line. The
    'layer_counts' entry is never printed and 'node_type_counts' is printed
    last under the 'node_types' label.
    """
    print(banner)
    summary = graph_summary(graph)
    for key, value in summary.items():
        if key in ('node_type_counts', 'layer_counts'):
            continue
        print(f'  {key}: {value}')
    print(f'  node_types: {summary["node_type_counts"]}')
    return summary


g_factual = load_attribution_graph(DATA_DIR / 'factual_recall' / 'michael-clt-clean.json')
g_multihop = load_attribution_graph(DATA_DIR / 'multihop' / 'capital-state-dallas.json')

s1 = _report_graph('=== Factual Recall: michael-clt-clean ===', g_factual)
print()
s2 = _report_graph('=== Multi-Hop: capital-state-dallas ===', g_multihop)

## 2. Raw Triad Census Comparison

In [None]:
# Motif census with size=3 (triads) for each graph.
census_factual = compute_motif_census(g_factual, size=3)
census_multihop = compute_motif_census(g_multihop, size=3)

# Connected triads only
labels_connected = [TRIAD_LABELS[i] for i in CONNECTED_TRIAD_INDICES]
counts_f = [census_factual.raw_counts[i] for i in CONNECTED_TRIAD_INDICES]
counts_m = [census_multihop.raw_counts[i] for i in CONNECTED_TRIAD_INDICES]

print('Raw Triad Counts (connected only):')
print(f'{"Triad":>8}  {"Factual":>10}  {"Multihop":>10}  {"Ratio":>8}')
print('-' * 42)
for label, cf, cm in zip(labels_connected, counts_f, counts_m):
    # Ratio = factual / multihop. A zero denominator is handled explicitly:
    # a positive count over zero is reported as inf, while 0 / 0 is
    # undefined and reported as nan rather than as a misleading inf.
    if cm > 0:
        ratio = cf / cm
    elif cf > 0:
        ratio = float('inf')
    else:
        ratio = float('nan')
    print(f'{label:>8}  {cf:>10}  {cm:>10}  {ratio:>8.2f}')

In [None]:
# Side-by-side raw counts bar chart: one panel per task, independent y-axes.
fig, axes = plt.subplots(1, 2, figsize=(16, 5), sharey=False)

x = np.arange(len(labels_connected))

# (bar heights, bar colour, panel title) for the left and right panel.
panel_specs = (
    (counts_f, '#1f77b4', 'Factual Recall: "Michael Jordan plays the sport of"'),
    (counts_m, '#ff7f0e', 'Multi-Hop: "The capital of the state containing Dallas is"'),
)
for panel_ax, (heights, bar_color, panel_title) in zip(axes, panel_specs):
    panel_ax.bar(x, heights, color=bar_color, edgecolor='black', linewidth=0.5)
    panel_ax.set_xticks(x)
    panel_ax.set_xticklabels(labels_connected, rotation=45, ha='right', fontsize=9)
    panel_ax.set_ylabel('Count')
    panel_ax.set_title(panel_title, fontsize=11)

plt.suptitle('Raw Triad Census Comparison', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig(FIGURES_DIR / 'comparison_raw_counts.png', dpi=300, bbox_inches='tight')
plt.show()
print('Saved: comparison_raw_counts.png')

## 3. Null Model: Configuration Model (1000 random graphs each)

This is the main computation — degree-preserving edge rewiring for both graphs.

In [None]:
def _run_null(banner, graph):
    """Print *banner*, build the null ensemble for *graph*, report its Z range.

    Returns whatever generate_configuration_null returns for 1000 random
    graphs and motif size 3.
    """
    print(banner)
    result = generate_configuration_null(graph, n_random=1000, motif_size=3)
    z_all = result.z_scores
    print(f'Done. Z-score range: [{z_all.min():.1f}, {z_all.max():.1f}]')
    return result


null_factual = _run_null('Running null model for FACTUAL RECALL (1000 random graphs)...', g_factual)
print()
null_multihop = _run_null('Running null model for MULTI-HOP (1000 random graphs)...', g_multihop)

## 4. Z-Score Comparison Table

In [None]:
# Z-scores and significance profiles (SP) taken from the two null-model
# results; these names are reused by the cells that follow.
z_f = null_factual.z_scores
z_m = null_multihop.z_scores
sp_f = null_factual.significance_profile
sp_m = null_multihop.significance_profile

# Key motifs to highlight (triad index -> display name)
key_motifs = {
    MOTIF_FAN_IN: '021U (Fan-in)',
    MOTIF_CHAIN: '021C (Chain)',
    MOTIF_FAN_OUT: '021D (Fan-out)',
    MOTIF_FFL: '030T (FFL)',
    MOTIF_CYCLE: '030C (Cycle)',
}

print(f'{"Triad":>8}  {"Z (Factual)":>12}  {"Z (Multihop)":>12}  {"SP (Factual)":>12}  {"SP (Multihop)":>12}  {"Delta Z":>8}')
print('=' * 78)
for i in CONNECTED_TRIAD_INDICES:
    # Rows for key motifs get a trailing ' ***' flag.
    flag = ' ***' if i in key_motifs else ''
    row = (
        f'{TRIAD_LABELS[i]:>8}  {z_f[i]:>12.2f}  {z_m[i]:>12.2f}  '
        f'{sp_f[i]:>12.3f}  {sp_m[i]:>12.3f}  {z_f[i] - z_m[i]:>8.2f}{flag}'
    )
    print(row)

print()
print('*** = Key motifs to watch')
print()
print('--- Key Findings ---')
for idx, name in key_motifs.items():
    z_fact, z_multi = z_f[idx], z_m[idx]
    print(f'  {name}: Factual Z={z_fact:.2f}, Multihop Z={z_multi:.2f}, Delta={z_fact - z_multi:.2f}')

## 5. Side-by-Side Z-Score Bar Charts (Main Result Figure)

In [None]:
# Two panels (shared y-axis) of per-triad Z-scores, coloured by significance.
fig, axes = plt.subplots(1, 2, figsize=(18, 6), sharey=True)

threshold = 2.0  # |Z| beyond this is drawn as enriched / anti-enriched
indices = CONNECTED_TRIAD_INDICES
label_plot = [TRIAD_LABELS[i] for i in indices]
x = np.arange(len(indices))


def _significance_color(zi):
    """Return the bar colour for one Z-score: red, blue or grey."""
    if zi > threshold:
        return '#d62728'  # enriched
    if zi < -threshold:
        return '#1f77b4'  # anti-enriched
    return '#7f7f7f'  # not significant (also where NaN ends up)


# Bars are coloured by significance only, so no per-panel colour is carried
# in these tuples.
for ax, z, title in [
    (axes[0], z_f[indices], 'Factual Recall\n"Michael Jordan plays the sport of"'),
    (axes[1], z_m[indices], 'Multi-Hop Reasoning\n"Capital of the state containing Dallas"'),
]:
    colors = [_significance_color(zi) for zi in z]
    ax.bar(x, z, color=colors, edgecolor='black', linewidth=0.5)
    ax.axhline(y=threshold, color='red', linestyle='--', alpha=0.5)
    ax.axhline(y=-threshold, color='red', linestyle='--', alpha=0.5)
    ax.axhline(y=0, color='black', linewidth=0.5)
    ax.set_xticks(x)
    ax.set_xticklabels(label_plot, rotation=45, ha='right', fontsize=9)
    ax.set_title(title, fontsize=12)
    ax.set_ylabel('Z-score')

# Shared legend
enriched_patch = mpatches.Patch(color='#d62728', label='Enriched (Z > 2)')
depleted_patch = mpatches.Patch(color='#1f77b4', label='Anti-enriched (Z < -2)')
ns_patch = mpatches.Patch(color='#7f7f7f', label='Not significant')
fig.legend(handles=[enriched_patch, depleted_patch, ns_patch],
           loc='upper center', ncol=3, fontsize=10, bbox_to_anchor=(0.5, 0.02))

plt.suptitle('Motif Z-Score Profiles: Factual Recall vs Multi-Hop Reasoning', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig(FIGURES_DIR / 'comparison_zscore_sidebyside.png', dpi=300, bbox_inches='tight')
plt.show()
print('Saved: comparison_zscore_sidebyside.png')

## 6. Overlaid SP Profile Comparison (Key Figure)

In [None]:
# Grouped bars: SP of both tasks side by side for every connected triad.
# Relies on `indices` and `label_plot` defined by the Z-score chart cell.
fig, ax = plt.subplots(figsize=(14, 6))

x = np.arange(len(indices))
width = 0.35

bars1 = ax.bar(x - width/2, sp_f[indices], width, label='Factual Recall',
               color='#1f77b4', edgecolor='black', linewidth=0.5, alpha=0.85)
bars2 = ax.bar(x + width/2, sp_m[indices], width, label='Multi-Hop Reasoning',
               color='#ff7f0e', edgecolor='black', linewidth=0.5, alpha=0.85)

ax.axhline(y=0, color='black', linewidth=0.5)
ax.set_xticks(x)
ax.set_xticklabels(label_plot, rotation=45, ha='right', fontsize=10)
ax.set_ylabel('Significance Profile (SP)', fontsize=12)
ax.set_title('Significance Profile Comparison: Factual Recall vs Multi-Hop', fontsize=14)
ax.legend(fontsize=11)

# Highlight key motifs with annotations.
# Maps a short motif name to its bar-group position on the x-axis.
key_indices_in_plot = {
    'Fan-in': CONNECTED_TRIAD_INDICES.index(MOTIF_FAN_IN),
    'Chain': CONNECTED_TRIAD_INDICES.index(MOTIF_CHAIN),
    'Fan-out': CONNECTED_TRIAD_INDICES.index(MOTIF_FAN_OUT),
    'FFL': CONNECTED_TRIAD_INDICES.index(MOTIF_FFL),
}

# Extra vertical margin so the labels above the tallest bars have room.
ax.margins(y=0.12)
for motif_name, pos in key_indices_in_plot.items():
    motif = indices[pos]
    # Anchor the label just above the taller of the two bars (or above the
    # zero line when both are negative); non-finite SP values count as 0 so
    # the anchor is always a drawable coordinate.
    pair = np.nan_to_num([sp_f[motif], sp_m[motif]], nan=0.0, posinf=0.0, neginf=0.0)
    anchor_y = max(float(pair.max()), 0.0)
    ax.annotate(motif_name, xy=(pos, anchor_y), xytext=(0, 6),
                textcoords='offset points', ha='center', va='bottom',
                fontsize=9, fontweight='bold')

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'comparison_sp_overlay.png', dpi=300, bbox_inches='tight')
plt.show()
print('Saved: comparison_sp_overlay.png')

## 7. Key Motif Deep Dive: Null Distributions

In [None]:
# Null distributions for the 4 key motifs: one row per graph, one column per
# motif. Each panel shows the histogram of null counts with the real count
# (red dashed) and the null mean (black dotted) marked.
key_motif_indices = [MOTIF_FAN_IN, MOTIF_CHAIN, MOTIF_FAN_OUT, MOTIF_FFL]
key_motif_names = ['021U (Fan-in)', '021C (Chain)', '021D (Fan-out)', '030T (FFL)']

fig, axes = plt.subplots(2, 4, figsize=(20, 8))

task_rows = [
    (null_factual, 'Factual', '#1f77b4'),
    (null_multihop, 'Multi-Hop', '#ff7f0e'),
]

for row, (null_result, graph_label, color) in enumerate(task_rows):
    for col, (motif_idx, motif_name) in enumerate(zip(key_motif_indices, key_motif_names)):
        ax = axes[row, col]
        observed = null_result.real_counts[motif_idx]
        expected = null_result.mean_null[motif_idx]

        ax.hist(null_result.null_counts[:, motif_idx], bins=30, color=color,
                alpha=0.7, edgecolor='black', linewidth=0.3)
        ax.axvline(x=observed, color='red', linewidth=2, linestyle='--',
                   label=f'Real = {int(observed)}')
        ax.axvline(x=expected, color='black', linewidth=1, linestyle=':',
                   label=f'Null mean = {expected:.0f}')

        z = null_result.z_scores[motif_idx]
        ax.set_title(f'{graph_label}: {motif_name}\nZ = {z:.1f}', fontsize=10)
        ax.legend(fontsize=7)
        # Only the left-most column carries the y-axis label.
        if col == 0:
            ax.set_ylabel('Frequency')

plt.suptitle('Null Model Distributions for Key Motifs', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig(FIGURES_DIR / 'comparison_null_distributions.png', dpi=300, bbox_inches='tight')
plt.show()
print('Saved: comparison_null_distributions.png')

## 8. Delta Z-Score Analysis

Which motifs differentiate factual recall from multi-hop most strongly?

In [None]:
# Delta Z: Factual - Multihop (positive = more enriched in factual)
delta_z = z_f - z_m
delta_z_connected = delta_z[indices]

# Bar positions are rebuilt here instead of reusing whatever `x` an earlier
# plotting cell happened to leave behind, so the x-axis of this chart does
# not depend on which cells were run before it.
x = np.arange(len(indices))

fig, ax = plt.subplots(figsize=(14, 5))

# Green: factual exceeds multi-hop by more than 2; red: the reverse; grey: |delta| <= 2.
colors = ['#2ca02c' if d > 2 else '#d62728' if d < -2 else '#7f7f7f' for d in delta_z_connected]
ax.bar(x, delta_z_connected, color=colors, edgecolor='black', linewidth=0.5)
ax.axhline(y=0, color='black', linewidth=1)
ax.axhline(y=2, color='gray', linestyle='--', alpha=0.4)
ax.axhline(y=-2, color='gray', linestyle='--', alpha=0.4)

ax.set_xticks(x)
ax.set_xticklabels(label_plot, rotation=45, ha='right', fontsize=10)
ax.set_ylabel('Delta Z-score (Factual - Multihop)', fontsize=12)
ax.set_title('Motif Enrichment Difference: Factual Recall vs Multi-Hop', fontsize=14)

# Legend
more_factual = mpatches.Patch(color='#2ca02c', label='More enriched in Factual')
more_multihop = mpatches.Patch(color='#d62728', label='More enriched in Multi-Hop')
ns = mpatches.Patch(color='#7f7f7f', label='|Delta| < 2')
ax.legend(handles=[more_factual, more_multihop, ns], fontsize=10)

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'comparison_delta_z.png', dpi=300, bbox_inches='tight')
plt.show()
print('Saved: comparison_delta_z.png')

print()
print('Top differentiating motifs (by |Delta Z|):')
# Rank (label, delta) pairs by absolute delta, largest first. The key
# parameter is named `pair` so it does not shadow the bar-position array `x`.
delta_ranked = sorted(zip(label_plot, delta_z_connected), key=lambda pair: abs(pair[1]), reverse=True)
for label, dz in delta_ranked[:6]:
    direction = 'Factual > Multihop' if dz > 0 else 'Multihop > Factual'
    print(f'  {label:>6}: Delta Z = {dz:+.2f}  ({direction})')

## 9. Cosine Similarity Between Profiles

In [None]:
from scipy.spatial.distance import cosine

# SP cosine similarity (scipy's `cosine` is a distance, hence 1 - distance)
sp_cos = 1.0 - cosine(sp_f, sp_m)
print(f'Cosine similarity (SP vectors):  {sp_cos:.4f}')

# Z-score cosine similarity. Non-finite Z-scores are replaced first so one
# bad entry cannot turn the whole similarity into NaN: +/-inf is pinned to
# +/-100 and NaN is treated as 0.
z_finite_f = np.nan_to_num(z_f, nan=0.0, posinf=100.0, neginf=-100.0)
z_finite_m = np.nan_to_num(z_m, nan=0.0, posinf=100.0, neginf=-100.0)
z_cos = 1.0 - cosine(z_finite_f, z_finite_m)
print(f'Cosine similarity (Z-score vectors): {z_cos:.4f}')

# Connected-only similarity
sp_cos_connected = 1.0 - cosine(sp_f[indices], sp_m[indices])
print(f'Cosine similarity (SP, connected only): {sp_cos_connected:.4f}')

print()
# A NaN similarity fails every `>` comparison, which would otherwise fall
# through to the "quite different" verdict; report it as undefined instead.
if not np.isfinite(sp_cos):
    print('Interpretation: Undefined — the SP vectors contain non-finite values, so no similarity conclusion can be drawn.')
elif sp_cos > 0.9:
    print('Interpretation: Very similar profiles — both tasks share the same broad structural patterns.')
elif sp_cos > 0.7:
    print('Interpretation: Moderately similar — same general patterns but with notable differences in specific motifs.')
else:
    print('Interpretation: Quite different profiles — these tasks leave distinct structural fingerprints!')

## 10. Enrichment Summary Table

In [None]:
def enrichment_status(z, threshold=2.0):
    """Classify a Z-score against a symmetric significance threshold.

    Returns 'ENRICHED' when z is strictly above ``threshold``,
    'ANTI-ENRICHED' when z is strictly below ``-threshold``, and 'n.s.'
    otherwise (including values exactly on either boundary and NaN, which
    fails both comparisons).
    """
    if z > threshold:
        return 'ENRICHED'
    if z < -threshold:
        return 'ANTI-ENRICHED'
    return 'n.s.'

# Per-triad enrichment status of both tasks, flagging rows where they differ.
print('Enrichment Summary')
# The last column is 8 wide to match the row format below ('** NO **' is
# 8 characters), so header and rows line up.
print(f'{"Triad":>8}  {"Factual":>15}  {"Multihop":>15}  {"Same?":>8}')
print('=' * 55)

# Count differences while printing, so each status is computed only once.
n_diff = 0
for i in CONNECTED_TRIAD_INDICES:
    status_f = enrichment_status(z_f[i])
    status_m = enrichment_status(z_m[i])
    if status_f == status_m:
        same = 'Yes'
    else:
        same = '** NO **'
        n_diff += 1
    label = TRIAD_LABELS[i]
    print(f'{label:>8}  {status_f:>15}  {status_m:>15}  {same:>8}')

print(f'\nMotifs with different enrichment status: {n_diff} / {len(CONNECTED_TRIAD_INDICES)}')

## 11. Combined Heatmap View

In [None]:
# Build a 2-row heatmap: Z-scores for both graphs
fig, ax = plt.subplots(figsize=(14, 3.5))

data = np.array([
    z_f[indices],
    z_m[indices],
])

# Cap values for better visualization (colours only; labels stay uncapped)
data_capped = np.clip(data, -30, 30)

sns.heatmap(
    data_capped,
    xticklabels=label_plot,
    yticklabels=['Factual Recall', 'Multi-Hop'],
    cmap='RdBu_r',
    center=0,
    # Show the actual (uncapped) values, rounded to one decimal. Casting to
    # int would truncate toward zero (e.g. -1.9 -> -1) and is not defined for
    # inf/NaN entries, so the labels are kept as floats.
    annot=np.round(data, 1),
    fmt='.1f',
    linewidths=0.5,
    ax=ax,
    cbar_kws={'label': 'Z-score'},
)

ax.set_title('Z-Score Heatmap: Factual Recall vs Multi-Hop', fontsize=14)
plt.tight_layout()
plt.savefig(FIGURES_DIR / 'comparison_zscore_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()
print('Saved: comparison_zscore_heatmap.png')

## 12. Interpretation and Key Findings

### Hypotheses Tested

1. **Chain enrichment (021C) drops for single-hop**: Multi-hop reasoning requires sequential processing (A→B→C), so we expect 021C to be significantly more enriched in multi-hop than in factual recall.

2. **Feedforward loop (030T) stays enriched in both**: FFL is a convergent-evidence motif — it appears when a feature influences a downstream target both directly and indirectly through an intermediate feature (A→B, B→C, and A→C). This should be a general feature of attribution graphs, not task-specific.

3. **Fan-out (021D) flips from anti-enriched (multi-hop) to enriched (single-hop)**: In single-hop factual recall, a single "lookup" feature broadcasts to multiple targets (fan-out). In multi-hop, the information flows through intermediate steps instead.

### Structural Fingerprint Story

If we observe these patterns, it demonstrates that different types of LLM computation leave measurably different structural signatures in their attribution graphs — the core claim of the blog post.

In [None]:
# Final verdicts on the three hypotheses, plus fan-in and overall similarity.
# Uses z_f / z_m, enrichment_status and sp_cos from earlier cells.
print('=== CORE RESULTS ===')
print()

# Hypothesis 1: Chain enrichment (expected higher in multi-hop)
print('1. Chain (021C) enrichment:')
print(f'   Factual:  Z = {z_f[MOTIF_CHAIN]:+.2f}  ({enrichment_status(z_f[MOTIF_CHAIN])})')
print(f'   Multihop: Z = {z_m[MOTIF_CHAIN]:+.2f}  ({enrichment_status(z_m[MOTIF_CHAIN])})')
chain_diff = z_m[MOTIF_CHAIN] - z_f[MOTIF_CHAIN]
if chain_diff > 2:
    print(f'   --> CONFIRMED: Chain is {chain_diff:.1f} Z-score units more enriched in multi-hop')
elif chain_diff > 0:
    print(f'   --> Partial: Chain is slightly more enriched in multi-hop (Delta = {chain_diff:.1f})')
else:
    print(f'   --> UNEXPECTED: Chain is more enriched in factual recall (Delta = {chain_diff:.1f})')

print()

# Hypothesis 2: FFL stays enriched
print('2. Feedforward Loop (030T):')
print(f'   Factual:  Z = {z_f[MOTIF_FFL]:+.2f}  ({enrichment_status(z_f[MOTIF_FFL])})')
print(f'   Multihop: Z = {z_m[MOTIF_FFL]:+.2f}  ({enrichment_status(z_m[MOTIF_FFL])})')
if enrichment_status(z_f[MOTIF_FFL]) == 'ENRICHED' and enrichment_status(z_m[MOTIF_FFL]) == 'ENRICHED':
    print('   --> CONFIRMED: FFL is enriched in BOTH tasks (universal structural motif)')
else:
    print('   --> FFL enrichment differs between tasks')

print()

# Hypothesis 3: Fan-out flip
print('3. Fan-out (021D):')
print(f'   Factual:  Z = {z_f[MOTIF_FAN_OUT]:+.2f}  ({enrichment_status(z_f[MOTIF_FAN_OUT])})')
print(f'   Multihop: Z = {z_m[MOTIF_FAN_OUT]:+.2f}  ({enrichment_status(z_m[MOTIF_FAN_OUT])})')
status_f_fo = enrichment_status(z_f[MOTIF_FAN_OUT])
status_m_fo = enrichment_status(z_m[MOTIF_FAN_OUT])
# The hypothesis predicts a specific direction: enriched in factual recall
# and anti-enriched in multi-hop. Any other status difference is reported
# as partial rather than as a confirmation.
if status_f_fo == 'ENRICHED' and status_m_fo == 'ANTI-ENRICHED':
    print('   --> CONFIRMED: Fan-out has different enrichment status between tasks!')
elif status_f_fo != status_m_fo:
    print('   --> Partial: Fan-out status differs between tasks, but not as the predicted '
          'anti-enriched (multi-hop) -> enriched (factual) flip')
else:
    fanout_diff = z_f[MOTIF_FAN_OUT] - z_m[MOTIF_FAN_OUT]
    print(f'   --> Same status, but Delta Z = {fanout_diff:.2f}')

print()

# Fan-in bonus
print('4. Fan-in (021U) bonus:')
print(f'   Factual:  Z = {z_f[MOTIF_FAN_IN]:+.2f}  ({enrichment_status(z_f[MOTIF_FAN_IN])})')
print(f'   Multihop: Z = {z_m[MOTIF_FAN_IN]:+.2f}  ({enrichment_status(z_m[MOTIF_FAN_IN])})')

print()
print(f'Overall SP cosine similarity: {sp_cos:.4f}')
# A NaN similarity fails the `<` test and would otherwise be reported as
# "similar"; state that no conclusion is available instead.
if not np.isfinite(sp_cos):
    print('SP cosine similarity is undefined; no fingerprint conclusion can be drawn.')
elif sp_cos < 0.85:
    print('These task types have DISTINCT structural fingerprints.')
else:
    print('Profiles are similar but differ in key motifs.')