# DFBI PAN Experiments (Grid & Visualization)

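This notebook runs a parameter sweep (horizon × metric × mask) on a prepared PAN-like corpus and plots LOOCV accuracy and throughput against the horizon.
Expected corpus layout: `corpus_root/author/*.txt` (one sub-folder per author, each containing that author's plain-text files).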


In [None]:

from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from dfbi.bench import load_folder_corpus, run_grid

# --- Experiment settings: edit this section to point at your own data ---

# Root folder of the corpus (expected layout: corpus_root/author/*.txt).
# CHANGE to your PAN corpus, e.g. Path('./corpus_pan12').
corpus_root = Path('./bench/mini_corpus')

# Alphabet code: 'en' for PAN in English; set 'ru' when using Russian text.
# NOTE(review): `alphabet` is not passed to any call in the cells shown here —
# confirm whether load_folder_corpus / run_grid should receive it.
alphabet = 'en'

# Values swept by the grid run.
horizons = list(range(1, 7))
metrics = ['l1', 'l2', 'chi2']
masks = ['letters']

# Settings forwarded unchanged to every grid run.
normalize = 'global'
decay = ('exp', 0.7)

corpus = load_folder_corpus(corpus_root)

# Sanity check: number of corpus entries and up to five of its keys.
(len(corpus), list(corpus.keys())[:5])


In [None]:

# Run the sweep with the settings from the configuration cell.
results = run_grid(
    corpus,
    horizons=horizons,
    metrics=metrics,
    masks=masks,
    normalize=normalize,
    decay=decay,
)
df = pd.DataFrame(results)

# Display a sorted view with a fresh 0..n-1 index; `df` itself keeps its original order.
df.sort_values(by=['mask', 'metric', 'horizon'], ignore_index=True)


In [None]:

def plot_accuracy(df, title):
    """Draw one accuracy-vs-horizon curve per metric on a new 6x4 figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'metric', 'horizon' and 'loocv_acc'.
        'loocv_acc' is multiplied by 100 for the percent axis, so it is
        presumably a fraction in [0, 1] — confirm against run_grid.
    title : str
        Title placed on the figure.

    The figure is shown immediately; nothing is returned.
    """
    plt.figure(figsize=(6, 4))

    # Metrics are drawn in sorted order so the legend is stable across runs.
    metric_names = sorted(df['metric'].unique())
    for name in metric_names:
        curve = df.loc[df['metric'] == name].sort_values('horizon')
        accuracy_pct = curve['loocv_acc'] * 100
        plt.plot(curve['horizon'], accuracy_pct, label=name)

    plt.xlabel('Horizon h')
    plt.ylabel('LOOCV accuracy, %')
    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.show()

def plot_throughput(df, title):
    """Draw one throughput-vs-horizon curve per metric on a new 6x4 figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'metric', 'horizon' and 'throughput_MBps'.
        Values are plotted as-is on an axis labelled 'Throughput (MB/s)'.
    title : str
        Title placed on the figure.

    The figure is shown immediately; nothing is returned.
    """
    plt.figure(figsize=(6, 4))

    # Metrics are drawn in sorted order so the legend is stable across runs.
    metric_names = sorted(df['metric'].unique())
    for name in metric_names:
        curve = df.loc[df['metric'] == name].sort_values('horizon')
        throughput = curve['throughput_MBps']
        plt.plot(curve['horizon'], throughput, label=name)

    plt.xlabel('Horizon h')
    plt.ylabel('Throughput (MB/s)')
    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.show()

# For every mask present in the results (in sorted order), show the accuracy
# figure followed by the throughput figure for that mask's rows only.
for mask in sorted(df['mask'].unique()):
    dfx = df.loc[df['mask'].eq(mask)]
    plot_accuracy(dfx, f'Accuracy vs Horizon (mask={mask})')
    plot_throughput(dfx, f'Throughput vs Horizon (mask={mask})')
