# Quantum Data Selection - Experiment 0

**原理検証: 量子アニーリングによる高Surpriseデータ選択**

## 目標
- 100サンプルから10サンプルを選択
- 量子選択 vs ランダム選択の比較
- Surprise最大化を確認

## 実行時間: 5-10分

## 必要: D-Wave APIトークン (無料アカウント: https://cloud.dwavesys.com/)

## セル1: インストール

Google Colabで実行する場合、このセルを最初に実行してください。

In [None]:
!pip install transformers datasets dwave-ocean-sdk torch matplotlib seaborn -q

## セル2: インポート

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset
from dwave.system import LeapHybridSampler
import warnings
warnings.filterwarnings('ignore')

# Environment sanity check: reaching this point means every import above
# succeeded; also report the PyTorch build and whether CUDA is usable.
print("All imports successful")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## セル3: D-Wave API設定

In [None]:
# Option 1: the token is picked up from the environment.
import os

# Option 2: set it inline (testing only; prefer the environment variable
# for anything shared or committed).
# os.environ['DWAVE_API_TOKEN'] = 'your-token-here'

# Connection smoke test: building the sampler and reading the solver name
# both happen inside the try, so a failure in either prints the setup help
# before the exception is re-raised.
try:
    sampler = LeapHybridSampler()
    for line in (
        "D-Wave API connection successful",
        f"  Solver: {sampler.solver.name}",
    ):
        print(line)
except Exception as e:
    for line in (
        "D-Wave API connection failed",
        f"  Error: {e}",
        "\nPlease set your API token:",
        "1. Sign up at https://cloud.dwavesys.com/ (free)",
        "2. Get your token from the dashboard",
        "3. Set: os.environ['DWAVE_API_TOKEN'] = 'your-token'",
    ):
        print(line)
    raise

## セル4: データ準備

In [None]:
# Size of the candidate pool the selection is run on.
N_SAMPLES = 100

print("Loading WikiText-2 dataset...")
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

# Keep only passages longer than 50 characters.
texts_raw = []
for record in dataset:
    passage = record['text']
    if len(passage) > 50:
        texts_raw.append(passage)

# The pool is the first N_SAMPLES passages that survived the filter.
texts = texts_raw[:N_SAMPLES]

print(f"Loaded {len(texts)} text samples")
print(f"\nExample text:")
print(f"  '{texts[0][:100]}...'")
print(f"  Length: {len(texts[0])} chars")

## セル5: Proxy Model（DistilGPT-2）のロード

In [None]:
print("\nLoading DistilGPT-2 model...")

# Tokenizer: the end-of-sequence token is reused as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

# Prefer the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the weights, move them to the chosen device, and switch to
# evaluation mode.
model = GPT2LMHeadModel.from_pretrained("distilgpt2").to(device)
model.eval()

print(f"Model loaded on {device}")
print(f"  Parameters: {sum(p.numel() for p in model.parameters()):,}")

## セル6: Surprise計算

Surpriseが高い = モデルにとって予測困難 = 情報価値が高い

In [None]:
def compute_surprise(text, model, tokenizer, max_length=50):
    """Return the surprise of ``text``: its mean per-token negative log-likelihood.

    High surprise = hard for the model to predict = high information value.

    The text is tokenized without padding so that only real tokens are
    scored. Should the tokenizer pad anyway, positions whose attention mask
    is 0 get the label -100, which the language-model loss ignores;
    otherwise the pad tokens would be scored as if they were text and would
    distort the surprise of short inputs.

    Parameters
    ----------
    text : str
        Text to score.
    model : causal language model
        Called as ``model(**inputs, labels=...)``; must expose ``.device``
        and return an object with a scalar ``.loss`` tensor.
    tokenizer : callable
        Called with ``return_tensors="pt"``, ``truncation=True`` and
        ``max_length``; must return a mapping containing ``"input_ids"``.
    max_length : int
        Maximum number of tokens kept after truncation.

    Returns
    -------
    float
        The model loss for the text, or 0.0 (after printing a warning) when
        the text has fewer than two tokens or scoring raises.
    """
    try:
        # Tokenize a single sequence; no padding is needed for batch size 1.
        encoded = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
        )
        # Follow the model's own device instead of a module-level global.
        target_device = model.device
        encoded = {name: tensor.to(target_device)
                   for name, tensor in encoded.items()}

        input_ids = encoded["input_ids"]
        labels = input_ids.clone()
        attention_mask = encoded.get("attention_mask")
        if attention_mask is not None:
            # -100 is the ignore index of the LM loss: never score padding.
            labels[attention_mask == 0] = -100
            n_real_tokens = int(attention_mask.sum().item())
        else:
            n_real_tokens = input_ids.numel()

        # Next-token loss needs at least one (context, target) pair; with
        # fewer than two tokens the mean over zero targets would be NaN.
        if n_real_tokens < 2:
            print("Warning: Failed to compute surprise for text: "
                  "fewer than 2 tokens")
            return 0.0

        # Loss == mean negative log-likelihood over the scored tokens.
        with torch.no_grad():
            outputs = model(**encoded, labels=labels)

        return outputs.loss.item()

    except Exception as e:
        # Deliberate best-effort: one bad sample must not abort the run.
        print(f"Warning: Failed to compute surprise for text: {e}")
        return 0.0

print("Computing surprises for all samples...")
print("(This may take 2-3 minutes)")

# Score every text in order, reporting progress on every 20th sample.
per_sample_scores = []
for i, text in enumerate(texts):
    if not i % 20:
        print(f"  Progress: {i}/{N_SAMPLES}")
    per_sample_scores.append(compute_surprise(text, model, tokenizer))

# Array form so later cells can fancy-index and take statistics.
surprises = np.array(per_sample_scores)

# Summary statistics of the surprise distribution.
print(f"\nSurprise computation complete")
print(f"  Mean: {surprises.mean():.3f}")
print(f"  Std:  {surprises.std():.3f}")
print(f"  Min:  {surprises.min():.3f}")
print(f"  Max:  {surprises.max():.3f}")

## セル7: QUBO構築

目的関数:

$$H = -\alpha \cdot \sum_i S_i \cdot x_i + \gamma \cdot \left(\sum_i x_i - K\right)^2$$

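- 第1項: Surprise最大化（高い値を選ぶ）
- 第2項: 制約（ちょうどK個選ぶ）。ペナルティ法によるソフト制約のため、$\gamma > \alpha \cdot \max_i S_i$ を満たさない場合は、K個を超えて選択した方がエネルギーが低くなり、制約が破られる可能性がある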

In [None]:
def build_qubo(surprises, K=10, alpha=1.0, gamma=10.0):
    """
    Build the QUBO coefficients for surprise-maximizing subset selection.

    Objective:
    H = -alpha * sum(S_i * x_i) + gamma * (sum(x_i) - K)^2

    First term: reward picking high-surprise samples.
    Second term: penalize selecting a number of samples other than K.

    With binary x_i (x_i^2 == x_i) the penalty expands to a linear
    coefficient gamma * (1 - 2K) on every variable and a coefficient
    2 * gamma on every pair i < j; the constant gamma * K^2 is dropped.

    Parameters
    ----------
    surprises : array
        Surprise value of each sample.
    K : int
        Number of samples to select.
    alpha : float
        Weight of the surprise term.
    gamma : float
        Strength of the cardinality penalty.

    Returns
    -------
    dict
        Maps (i, i) to linear and (i, j) with i < j to quadratic coefficients.
    """
    n_vars = len(surprises)

    # Diagonal: surprise reward plus the linear part of the penalty.
    qubo = {
        (idx, idx): -alpha * surprises[idx] + gamma * (1 - 2*K)
        for idx in range(n_vars)
    }

    # Upper triangle: the quadratic part of the penalty, same for all pairs.
    qubo.update(
        ((lo, hi), 2 * gamma)
        for lo in range(n_vars)
        for hi in range(lo + 1, n_vars)
    )

    return qubo

# パラメータ
# Experiment parameters:
#   K     - number of samples to select (10% of the pool)
#   alpha - weight of the surprise term
#   gamma - strength of the cardinality penalty
K, alpha, gamma = 10, 1.0, 10.0

for line in (
    f"Building QUBO matrix...",
    f"  N = {N_SAMPLES} (total samples)",
    f"  K = {K} (samples to select)",
    f"  alpha = {alpha} (surprise weight)",
    f"  gamma = {gamma} (constraint strength)",
):
    print(line)

Q = build_qubo(surprises, K=K, alpha=alpha, gamma=gamma)

for line in (
    f"QUBO matrix built",
    f"  Size: {len(Q)} entries",
    f"  Variables: {N_SAMPLES}",
):
    print(line)

## セル8: 量子アニーリング実行

In [None]:
print("\nRunning quantum annealing on D-Wave...")
print("(This may take 30-60 seconds)")

sampler = LeapHybridSampler()

response = sampler.sample_qubo(
    Q,
    label='QuantumFEP-Experiment0'
)

# Take the lowest-energy sample and keep the variables set to 1.
best = response.first
solution = best.sample
selected_quantum = [i for i, v in solution.items() if v == 1]

# The "exactly K" constraint is only a penalty term in the QUBO, so the
# sampler may return a different count. An empty selection would make every
# downstream average NaN, so stop here; any other mismatch is reported
# because the random baseline draws exactly K samples.
if not selected_quantum:
    raise RuntimeError(
        "Quantum annealing selected no samples; "
        "check the QUBO parameters (alpha, gamma)."
    )
if len(selected_quantum) != K:
    print(f"Warning: selected {len(selected_quantum)} samples, expected K = {K}; "
          f"consider increasing gamma")

print(f"\nQuantum annealing complete")
print(f"  Selected samples: {len(selected_quantum)}")
print(f"  Energy: {best.energy:.2f}")
print(f"  Timing: {response.info.get('qpu_access_time', 'N/A')}")

## セル9: ランダム選択との比較

In [None]:
print("Generating random baseline...")

# Baseline: mean surprise of K samples drawn uniformly without replacement,
# repeated over many independent trials.
N_RANDOM_TRIALS = 100
random_surprises = [
    surprises[np.random.choice(N_SAMPLES, K, replace=False)].mean()
    for _ in range(N_RANDOM_TRIALS)
]

random_surprise_mean = np.mean(random_surprises)
random_surprise_std = np.std(random_surprises)

# Mean surprise of the samples picked by the annealer.
quantum_surprise = surprises[selected_quantum].mean()

print(f"\n{'='*60}")
print(f"RESULTS")
print(f"{'='*60}")
print(f"\nQuantum Selection:")
print(f"  Average Surprise: {quantum_surprise:.4f}")
print(f"  Selected indices: {sorted(selected_quantum)[:10]}...")

print(f"\nRandom Selection (n={N_RANDOM_TRIALS} trials):")
print(f"  Average Surprise: {random_surprise_mean:.4f} +/- {random_surprise_std:.4f}")

print(f"\nComparison:")
# Relative gain of the quantum mean over the random mean, in percent.
improvement = (quantum_surprise / random_surprise_mean - 1) * 100
print(f"  Improvement: {improvement:+.2f}%")

if not quantum_surprise > random_surprise_mean:
    print(f"  Quantum selection underperforms")
else:
    print(f"  Quantum selection outperforms random!")

    # Rough significance check: distance from the random mean measured in
    # standard deviations of the random-trial means.
    z_score = (quantum_surprise - random_surprise_mean) / random_surprise_std
    print(f"  Z-score: {z_score:.2f}")

    if z_score > 2:
        print(f"  Statistically significant (p < 0.05)")

## セル10: 可視化

In [None]:
# 2x2 summary figure comparing the quantum selection with the random baseline.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot 1: surprise distribution over the whole pool, with both averages.
ax = axes[0, 0]
ax.hist(surprises, bins=30, alpha=0.7, color='gray', label='All samples')
ax.axvline(quantum_surprise, color='red', linestyle='--', linewidth=2,
           label=f'Quantum avg: {quantum_surprise:.3f}')
ax.axvline(random_surprise_mean, color='blue', linestyle='--', linewidth=2,
           label=f'Random avg: {random_surprise_mean:.3f}')
ax.set_xlabel('Surprise (negative log-likelihood)')
ax.set_ylabel('Frequency')
ax.set_title('Surprise Distribution')
ax.legend()
ax.grid(True, alpha=0.3)

# Plot 2: surprise of the selected samples, ranked from highest to lowest.
ax = axes[0, 1]
selected_surprises = surprises[selected_quantum]
ax.scatter(range(len(selected_surprises)), sorted(selected_surprises)[::-1],
          color='red', s=100, alpha=0.6, label='Quantum selected')
ax.axhline(random_surprise_mean, color='blue', linestyle='--', linewidth=2,
          label='Random average')
ax.set_xlabel('Sample rank')
ax.set_ylabel('Surprise')
ax.set_title('Selected Samples (Quantum)')
ax.legend()
ax.grid(True, alpha=0.3)

# Plot 3: distribution of the random-trial averages vs the quantum average.
ax = axes[1, 0]
ax.hist(random_surprises, bins=30, alpha=0.7, color='blue',
        label=f'Random (n={N_RANDOM_TRIALS})')
ax.axvline(quantum_surprise, color='red', linestyle='--', linewidth=2,
          label='Quantum')
ax.set_xlabel('Average Surprise')
ax.set_ylabel('Frequency')
ax.set_title('Random Selection Distribution')
ax.legend()
ax.grid(True, alpha=0.3)

# Plot 4: every sample's surprise, with the selected ones highlighted.
# Selected and unselected points are drawn in two calls so each group gets
# its own opacity (0.8 vs 0.3); a boolean mask replaces the per-sample list
# membership test.
ax = axes[1, 1]
sample_index = np.arange(N_SAMPLES)
is_selected = np.zeros(N_SAMPLES, dtype=bool)
is_selected[np.asarray(selected_quantum, dtype=int)] = True
ax.scatter(sample_index[~is_selected], surprises[~is_selected],
           c='gray', alpha=0.3, s=50)
ax.scatter(sample_index[is_selected], surprises[is_selected],
           c='red', alpha=0.8, s=50)
ax.axhline(quantum_surprise, color='red', linestyle='--', linewidth=1,
          label='Quantum avg')
ax.axhline(random_surprise_mean, color='blue', linestyle='--', linewidth=1,
          label='Random avg')
ax.set_xlabel('Sample index')
ax.set_ylabel('Surprise')
ax.set_title('All Samples (red = selected by quantum)')
ax.legend()
ax.grid(True, alpha=0.3)

# Save before show(): the figure is written to disk and then displayed.
plt.tight_layout()
plt.savefig('experiment0_results.png', dpi=150, bbox_inches='tight')
print("\nVisualization saved: experiment0_results.png")
plt.show()

## セル11: 選択されたテキストの例

In [None]:
print("=" * 60)
print("SELECTED TEXT EXAMPLES")
print("=" * 60)

# Order the selected indices from highest to lowest surprise.
sorted_indices = sorted(selected_quantum, key=lambda i: surprises[i])
sorted_indices.reverse()
# A plain reverse would flip ties; sorted(..., reverse=True) keeps equal
# elements in their original order, so redo it stably.
sorted_indices = sorted(selected_quantum,
                        key=surprises.__getitem__,
                        reverse=True)

print("\nTop 3 highest-surprise texts selected by quantum:\n")
top_three = sorted_indices[:3]
for rank, idx in enumerate(top_three, start=1):
    # Show the first 150 characters of each text.
    print(f"{rank}. Sample {idx} (Surprise = {surprises[idx]:.3f})")
    print(f"   Text: {texts[idx][:150]}...")
    print()

## 完了・まとめ

In [None]:
print("=" * 60)
print("EXPERIMENT 0 COMPLETE")
print("=" * 60)
print("\nKey Findings:")
# Report how many samples the annealer actually returned rather than the
# target K: the cardinality constraint is a soft penalty and can be violated.
n_selected = len(selected_quantum)
print(f"  Quantum annealing selected {n_selected} samples (target K = {K})")
if n_selected != K:
    print(f"  Note: selection size differs from K; consider increasing gamma")
print(f"  Average surprise: {quantum_surprise:.4f} (quantum) vs {random_surprise_mean:.4f} (random)")
print(f"  Improvement: {improvement:+.2f}%")

# Interpret the relative improvement: above 5% is treated as a clear win,
# anything positive as modest, and the rest as no improvement.
if improvement > 5:
    print(f"\nSignificant improvement achieved!")
    print(f"  Quantum selection found higher-information samples.")
elif improvement > 0:
    print(f"\nPositive improvement, though modest.")
    print(f"  Consider increasing sample size (N) or adjusting alpha.")
else:
    print(f"\nNo improvement detected.")
    print(f"  Possible issues:")
    print(f"  - QUBO parameters need tuning (alpha, gamma)")
    print(f"  - Sample size too small (try N=1000)")
    print(f"  - Surprise metric may need refinement")

print("\nNext steps:")
print("1. Increase N to 1000 samples")
print("2. Add diversity term to QUBO")
print("3. Train small LM on selected data vs random data")
print("4. Measure downstream task performance")