# Rank‑Based Diffusion | Facebook Pages (Top 2 000)
*Proof‑of‑concept analysis notebook — generated 2025‑07‑24*

This notebook implements the workflow defined in **Prompt v0.2**:

1. Data QC and pseudo‑share construction  
2. Weekday‑aware 7‑day transition matrices  
3. Parameter estimation (σ, κ, η, β̂)  
4. Euler–Maruyama simulation & diagnostics  
5. Artefact export (`results/`)

> **Re‑run instructions**  
> • Requires Python 3.11 + `numpy`, `pandas`, `duckdb`, `pyarrow`, `matplotlib`, `scipy`.  
> • Set the variable `PROJECT_ROOT` below to your project directory.

In [1]:
################################################################################
# 1 | Setup & imports                                                          #
################################################################################
import os, sys, json, math, itertools, gzip, datetime, pathlib, warnings
from typing import Tuple

import numpy as np
import pandas as pd
import duckdb as ddb
import pyarrow as pa
import pyarrow.dataset as ds
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from scipy.optimize import curve_fit

# Matplotlib defaults for every figure in this notebook: 120 dpi, and no
# top/right axes spines.
plt.rcParams["figure.dpi"] = 120
plt.rcParams["axes.spines.top"] = False
plt.rcParams["axes.spines.right"] = False

# Seed NumPy's global generator so random draws repeat from run to run.
np.random.seed(42)

# Project paths
PROJECT_ROOT = pathlib.Path("./data/rank-diffusion")               # ← change if needed
# Preferred location of the daily panel inside the project tree.
# NOTE(review): the previous expression was not valid Python and repeated the
# project prefix; this assumes the file sits in PROJECT_ROOT/data — confirm.
DATA_PROMPT   = PROJECT_ROOT/"data"/"fb_top2000_ranked_daily.parquet"
# Alternative absolute location of the same file.
DATA_UPLOAD   = pathlib.Path("/mnt/data/fb_top2000_ranked_daily.parquet")

# Output tree: figures, tables and transition matrices live under results/.
RESULTS_DIR   = PROJECT_ROOT/"results"
FIG_DIR       = RESULTS_DIR/"figures"
TABLE_DIR     = RESULTS_DIR/"tables"
MATRIX_DIR    = RESULTS_DIR/"matrices"

# Create the output folders up front (parents included); existing ones are kept.
for p in [FIG_DIR, TABLE_DIR, MATRIX_DIR]:
    p.mkdir(parents=True, exist_ok=True)

## 2 | Load daily panel & basic QC

In [2]:
def load_panel() -> pd.DataFrame:
    """Load the daily rank panel from Parquet into a pandas DataFrame.

    Reads ``DATA_PROMPT`` when that file exists and ``DATA_UPLOAD`` otherwise.

    Returns:
        DataFrame with the columns ``date`` (parsed to datetime),
        ``endpoint_id``, ``metric_value`` and ``rank``, plus a derived
        ``weekday`` column holding the day name of ``date``.

    Raises:
        FileNotFoundError: if neither candidate file exists.
    """
    path = DATA_PROMPT if DATA_PROMPT.exists() else DATA_UPLOAD
    if not path.exists():
        raise FileNotFoundError("Parquet file not found in either location.")
    # The path is embedded in a SQL string literal; double any single quote so
    # a path containing one cannot terminate the literal early.
    sql_path = str(path).replace("'", "''")
    con = ddb.connect()
    try:
        df = con.execute(f"""
            SELECT date, endpoint_id, metric_value, rank
            FROM parquet_scan('{sql_path}')
        """).fetch_df()
    finally:
        # Release the DuckDB connection even when the query fails.
        con.close()
    df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
    df["weekday"] = df["date"].dt.day_name()
    return df

# Load the panel once; the cells below read and extend this module-level `df`.
df = load_panel()
# Show the first rows, then the row count and the covered date range.
_preview = df.head()
print(_preview)
print(f"Loaded {len(df):,} rows spanning {df['date'].min()} ▸ {df['date'].max()}")

FileNotFoundError: Parquet file not found in either location.

In [None]:
# Quick missing-date check: compare every calendar day between the first and
# last observed date with the dates actually present in the panel.
all_dates = pd.date_range(df["date"].min(), df["date"].max(), freq="D")
missing   = sorted(set(all_dates.date) - set(df["date"].dt.date))
print(f"Missing calendar dates: {len(missing)} (listed if ≤10)")
# At most the first ten missing dates are printed.
print(missing[:10])

### 2.1 Winsorise extreme interaction counts

In [None]:
# Winsorise: cap `metric_value` at its 99.9th percentile and store the result
# in the new column `metric_w`; the raw column is left untouched.
q = df["metric_value"].quantile(0.999)
_raw = df["metric_value"]
_over_cap = _raw > q
df["metric_w"] = np.where(_over_cap, q, _raw)
print(f"99.9‑th percentile = {q:,.0f}; clipped values now ≤ that.")

## 3 | Pseudo‑shares & state encoding (1…2001)

In [None]:
def add_shares(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with an added ``share`` column.

    ``share`` is each row's ``metric_w`` divided by the sum of ``metric_w``
    over all rows that have the same ``date``.  The input frame is not
    modified.
    """
    per_day_total = df.groupby("date")["metric_w"].transform("sum")
    return df.assign(share=df["metric_w"].div(per_day_total))

# `add_shares` returns a copy with the new `share` column, so rebind `df`.
df = add_shares(df)

### 3.1 Pivot to wide "state" format (rank per page per day)

In [None]:
def _to_wide(value_col, fill, dtype):
    """Pivot `df` to one row per date and one column per endpoint_id.

    Cells with no observation are filled with *fill* before casting to *dtype*.
    """
    wide = df.pivot(index="date", columns="endpoint_id", values=value_col)
    return wide.fillna(fill).astype(dtype)


# Rank per page per day; a page with no row on a day gets state 2001.
rank_piv = _to_wide("rank", 2001, np.int16)

# Share per page per day; a page with no row on a day gets share 0.0.
share_piv = _to_wide("share", 0.0, np.float32)

## 4 | 7‑day Transition Matrices  $P^{d}$

In [None]:
def build_transition(df_rank: pd.DataFrame, weekday: str,
                     n_states: int = 2001) -> np.ndarray:
    """Return the 7-day rank transition matrix for one weekday.

    Every date in *df_rank* that falls on *weekday* is paired with the date
    exactly seven calendar days later.  For each column, the move from the
    rank on the first date to the rank on the second date is counted, and each
    row of the count matrix is normalised to sum to 1.

    Args:
        df_rank: Wide frame indexed by a unique ``DatetimeIndex`` (one row per
            date), one column per page, holding ranks in ``1..n_states``.
        weekday: Day name as produced by ``DatetimeIndex.day_name()``,
            e.g. ``"Monday"``.
        n_states: Number of rank states; the result is ``n_states × n_states``.

    Returns:
        Float array ``P`` where ``P[i, j]`` is the observed share of moves from
        rank ``i + 1`` to rank ``j + 1``.  A rank that was never observed as a
        starting state has an all-zero row instead of a 0/0 division.

    Raises:
        ValueError: if a rank lies outside ``1..n_states``.
    """
    on_day = df_rank.loc[df_rank.index.day_name() == weekday]
    # Find the successor by calendar date, not by row offset.  `on_day` holds a
    # single weekday, so a 7-row shift would pair dates seven weeks apart, and
    # any positional shift breaks when a date is missing from the panel.
    week_later = df_rank.reindex(on_day.index + pd.Timedelta(days=7))
    # Keep only date pairs where both sides are fully observed.
    usable = (on_day.notna().all(axis=1).to_numpy()
              & week_later.notna().all(axis=1).to_numpy())
    # reindex() yields floats once NaN appears; cast back so the ranks can be
    # used as array indices.
    src = on_day.to_numpy()[usable].astype(np.int64).ravel()
    dst = week_later.to_numpy()[usable].astype(np.int64).ravel()
    if src.size and (min(src.min(), dst.min()) < 1
                     or max(src.max(), dst.max()) > n_states):
        raise ValueError(f"ranks must lie in 1..{n_states}")
    counts = np.zeros((n_states, n_states), dtype=np.int64)
    # Unbuffered add so repeated (from, to) pairs each contribute one count.
    np.add.at(counts, (src - 1, dst - 1), 1)
    row_totals = counts.sum(axis=1, keepdims=True)
    P = np.zeros(counts.shape, dtype=np.float64)
    np.divide(counts, row_totals, out=P, where=row_totals > 0)
    return P

# Build one transition matrix per weekday and store each in MATRIX_DIR as a
# compressed .npz archive under the key "P".
weekday_list = [
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday",
]
for wd in weekday_list:
    _target = MATRIX_DIR/f"P_{wd}.npz"
    P = build_transition(rank_piv, wd)
    np.savez_compressed(_target, P=P)

## 5 | Parameter Estimation (σ, κ, η, β)

In [None]:
# 5.1 Variance funnel → σ
# Shares of absent pages are stored as 0.0, and log(0) is -inf.  Mask them to
# NaN first: pandas skips NaN in var(), whereas a single ±inf difference turns
# a page's whole variance into NaN.
_log_share = np.log(share_piv.where(share_piv > 0))
# Variance of the daily log-share change per page, averaged over pages that
# held the same rank on the first date of the panel.
rank_var = _log_share.diff().var()
var_df = rank_var.groupby(rank_piv.iloc[0]).mean().to_frame("var")
var_df["log_r"] = np.log(var_df.index)
# Fit only the ranks with a defined variance; `var_df` keeps every rank
# because later cells read its index.
_funnel = var_df.dropna(subset=["var", "log_r"])
# Degree-1 polyfit returns (slope, intercept); σ² is the slope of the variance
# against log(rank).
sigma2, _ = np.polyfit(_funnel["log_r"], _funnel["var"], 1)
# Floor at 1e-12 so a non-positive slope still gives a real σ.
sigma = math.sqrt(max(sigma2, 1e-12))
print(f"Estimated σ² = {sigma2:.3e}  ⇒  σ = {sigma:.4f}")

# 5.2 Drift → κ, η
# Mask zero shares before taking logs: log(0) = -inf would make the mean
# log-change of every page that is ever absent NaN.
mean_dlog = np.log(share_piv.where(share_piv > 0)).diff().mean()
# Average the per-page mean change over pages sharing the same first-date rank.
drift_df  = mean_dlog.groupby(rank_piv.iloc[0]).mean().to_frame("dlog")
# Fit only the ranks with a defined mean change; polyfit cannot handle NaN.
_drift_fit = drift_df.dropna()
# Degree-1 polyfit returns (slope, intercept) of the mean change against rank.
eta, kappa = np.polyfit(_drift_fit.index, _drift_fit["dlog"], 1)
eta   = -eta  # store η as the negated slope, so the fitted line is κ − η·rank
print(f"Estimated η = {eta:.4e}, κ = {kappa:.4e}")

# 5.3 Variance-time exponent β (check)
# For each rank band, measure how the variance of h-day log-share changes
# grows with the horizon h, then fit var ≈ A · h**β.
horizons = np.array([1,2,4,8,28])
band_edges = [(1,50),(51,200),(201,2000)]
records = []
# Mask zero shares so log() gives NaN (skipped by pandas) rather than -inf,
# which would turn the affected variances into NaN.
log_sh  = np.log(share_piv.where(share_piv > 0))
# The h-day differences do not depend on the band, so compute them once.
_diff_by_h = {h: log_sh.diff(h).iloc[h:] for h in horizons}
_first_rank = rank_piv.iloc[0]
for lo, hi in band_edges:
    # Pages are assigned to a band by their rank on the first date.
    band_mask = _first_rank.between(lo, hi)
    for h in horizons:
        v = _diff_by_h[h]
        records.append({"band":f"{lo}-{hi}", "h":h,
                        "var": v.loc[:, band_mask].var().mean()})
beta_df = pd.DataFrame(records)

def var_power(t, A, beta):
    """Power-law model A * t**beta for variance as a function of horizon t."""
    return A * t**beta

beta_estimates = []
for band, grp in beta_df.groupby("band"):
    # curve_fit rejects NaN, and two parameters need at least two points.
    grp = grp.dropna(subset=["var"])
    if len(grp) < 2:
        beta_estimates.append({"band":band,"beta":np.nan,"R2":np.nan})
        continue
    popt, pcov = curve_fit(var_power, grp["h"], grp["var"], p0=[1e-8,1.0])
    A_hat, beta_hat = popt
    ss_res = ((grp["var"] - var_power(grp["h"], *popt))**2).sum()
    ss_tot = ((grp["var"] - grp["var"].mean())**2).sum()
    # R² is undefined when the observed variances are all identical.
    R2 = 1 - ss_res/ss_tot if ss_tot > 0 else np.nan
    beta_estimates.append({"band":band,"beta":beta_hat,"R2":R2})
beta_tbl = pd.DataFrame(beta_estimates)
beta_tbl.to_csv(TABLE_DIR/"variance_time.csv", index=False)
beta_tbl

## 6 | Euler–Maruyama Simulation

In [None]:
def simulate_paths(n_paths:int=10000, n_days:int=365):
    """Simulate daily share trajectories for ranks 1…2000.

    Every path starts from the mean observed share at each rank.  Each day the
    weights are multiplied by ``exp(drift + vol * dB)`` with
    ``drift = kappa - eta * rank``, ``vol = sigma * sqrt(log(rank))`` and
    ``dB`` standard normal, then floored at 1e-12 and renormalised to sum to 1.

    Reads the module-level ``df``, ``kappa``, ``eta`` and ``sigma`` and draws
    from NumPy's global random generator.

    Args:
        n_paths: Number of independent paths.
        n_days: Number of daily time points per path, including day 0.

    Returns:
        float32 array of shape ``(n_paths, n_days, 2001)``.  ``[..., :2000]``
        holds the shares of ranks 1…2000; ``[..., 2000]`` is always 0.0
        (placeholder for the absorbing state).
    """
    n_ranks = 2000
    ranks = np.arange(1, n_ranks + 1)
    # Starting weights: mean share observed at each rank, from one grouped
    # pass.  Reindexing to 1…2000 fixes the length at 2000, and a rank with no
    # rows becomes 0.0 rather than a NaN that would poison the normalisation.
    w0 = (df.groupby("rank")["share"].mean()
            .reindex(ranks).fillna(0.0).to_numpy(dtype=np.float64))
    w0 = np.where(w0==0, 1e-12, w0)
    w0 = w0 / w0.sum()
    # Drift and volatility depend only on rank, so compute them once.
    drift = (kappa - eta*ranks)
    vol   = sigma*np.sqrt(np.log(ranks))
    out = np.empty((n_paths, n_days, n_ranks + 1), dtype=np.float32)
    out[:,:,n_ranks] = 0.0  # absorbing prob mass; placeholder
    if n_days == 0:
        # There is no day-0 slot to write into.
        return out
    for p in range(n_paths):
        w = w0.copy()
        out[p,0,:n_ranks] = w
        for t in range(1, n_days):
            # One increment per step; presumably sqrt(dt) with dt = 1 day.
            dB    = np.random.normal(scale=math.sqrt(1.0), size=n_ranks)
            w = w * np.exp(drift + vol*dB)
            w = np.maximum(w, 1e-12)
            w = w / w.sum()
            out[p,t,:n_ranks] = w
    return out

# Simulations are heavy → run a small batch for diagnostic
# (100 paths × 365 days × 2001 states stored as float32 is about 0.3 GB).
_sim = simulate_paths(n_paths=100, n_days=365)

## 7 | Diagnostics & Figures

In [None]:
# Funnel plot: empirical daily Δ log share against log(rank).  Only the
# empirical cloud is drawn here.
emp_dlog = log_sh.diff().stack()
# Keep finite changes only.  Log-shares of zero are -inf, so their differences
# are ±inf or NaN, and dropna() alone would leave the infinities in.
emp_dlog = emp_dlog[np.isfinite(emp_dlog)]
# Align each retained (date, page) change with the rank of that page on that date.
emp_rank = rank_piv.stack().reindex(emp_dlog.index)
fig, ax = plt.subplots(figsize=(6,4))
ax.scatter(np.log(emp_rank), emp_dlog, s=1, alpha=0.05, label="Empirical")
ax.set_xlabel("log(rank)"); ax.set_ylabel("Δ log share")
ax.set_title("Funnel of churn – empirical (daily)")
plt.tight_layout(); fig.savefig(FIG_DIR/"funnel_empirical.png")

*Repeat diagnostic plots and survival‑prob tables as per prompt…*

## 8 | Conclusions / Next Steps

* Summarise parameter values and goodness‑of‑fit metrics.
* Note whether a jump component seems necessary (left‑tail diagnostics).
* Outline work to port model to Reddit and to extend list length.

### Provenance
Data source: CrowdTangle export of top‑2 000 U.S. Facebook pages, 2023‑01‑01 → 2024‑01‑31.  
Notebook autogenerated by OpenAI o3 assistant (conversation ID …).  
Run date: 2025‑07‑24 (the generation date given in the header; update when the notebook is re‑run)