In [13]:
# ===== Step 0: Install dependencies =====
!pip install -q ckip-transformers transformers torch tqdm pandas

In [14]:

# ===== Step 1: Imports =====
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, AutoTokenizer
from ckip_transformers.nlp import CkipWordSegmenter
from torch.nn import functional as F
from tqdm import tqdm
import math


In [15]:
# ===== Step 2: Load models =====
tokenizer = AutoTokenizer.from_pretrained("ckiplab/gpt2-base-chinese")
model = GPT2LMHeadModel.from_pretrained("ckiplab/gpt2-base-chinese")
ws_driver = CkipWordSegmenter(device=0 if torch.cuda.is_available() else -1)
if torch.cuda.is_available():
    model = model.to("cuda")
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(21128, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=21128, bias=False)
)

In [16]:
# ===== Step 3: Load stimuli =====
stimuli = pd.read_csv("stimuli.csv")  # Must have column "sentence"

In [31]:
# ===== Step 4: Word-level surprisal function =====
def compute_word_surprisal(sentence):
    seg_result = ws_driver([sentence])
    words = seg_result[0]  # full word list
    surprisals = []

    context_words = []
    for word in words:
        context_text = " ".join(context_words)
        context_enc = tokenizer(context_text, return_tensors="pt", add_special_tokens=True)
        context_ids = context_enc["input_ids"]
        if torch.cuda.is_available():
            context_ids = context_ids.to("cuda")

        target_ids = tokenizer(word, add_special_tokens=False)["input_ids"]
        if len(target_ids) == 0:
            continue  # skip empty word

        target_ids = torch.tensor(target_ids, dtype=torch.long).unsqueeze(0)
        if torch.cuda.is_available():
            target_ids = target_ids.to("cuda")

        input_ids = torch.cat([context_ids, target_ids], dim=1)

        with torch.no_grad():
            logits = model(input_ids).logits
            log_probs = F.log_softmax(logits, dim=-1)

        surprisal_sum = 0
        for j in range(target_ids.shape[1]):
            token_idx = context_ids.shape[1] + j - 1
            tid = target_ids[0, j]
            surprisal_sum += -log_probs[0, token_idx, tid].item() / math.log(2)

        surprisals.append(surprisal_sum)
        context_words.append(word)

    return words, surprisals


In [32]:
# ===== Step 5: Compute surprisal for all sentences =====
all_rows = []

for i, row in tqdm(stimuli.iterrows(), total=len(stimuli), desc="Processing sentences"):
    sentence = str(row["sentence"]).strip()
    if not sentence:
        continue
    try:
        words, surprisals = compute_word_surprisal(sentence)
        for w, s in zip(words, surprisals):
            all_rows.append({
                "sentence_id": i,
                "sentence": sentence,
                "word": w,
                "surprisal": s
            })
    except Exception as e:
        print(f"⚠️ Skipping row {i} due to error: {e}")


Processing sentences:   0%|          | 0/107 [00:00<?, ?it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 6710.89it/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s][A
Inference: 100%|██████████| 1/1 [00:00<00:00,  8.38it/s]
Processing sentences:   1%|          | 1/107 [00:00<01:24,  1.25it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 5745.62it/s]

Inference: 100%|██████████| 1/1 [00:00<00:00, 11.15it/s]
Processing sentences:   2%|▏         | 2/107 [00:01<01:19,  1.32it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 5769.33it/s]

Inference: 100%|██████████| 1/1 [00:00<00:00, 11.36it/s]
Processing sentences:   3%|▎         | 3/107 [00:02<01:17,  1.35it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 6797.90it/s]

Inference: 100%|██████████| 1/1 [00:00<00:00, 11.38it/s]
Processing sentences:   4%|▎         | 4/107 [00:03<01:16,  1.34it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 5833.52it/s]

Inference: 100%|██████████| 1/1 [00:00<00:00, 11.48it/s]
Proces

In [33]:
# ===== Step 6: Save CSV =====
df_out = pd.DataFrame(all_rows)
df_out.to_csv("stimuli_word_surprisal.csv", index=False, encoding="utf-8-sig")
print("✅ Done! CSV saved as 'stimuli_word_surprisal.csv'")


# ===== Step 7: Sanity check print =====
print(df_out.head(10))

✅ Done! CSV saved as 'stimuli_word_surprisal.csv'
   sentence_id  sentence word  surprisal
0            0  小明買了一杯咖啡   小明  20.341637
1            0  小明買了一杯咖啡    買  13.261948
2            0  小明買了一杯咖啡    了   6.256403
3            0  小明買了一杯咖啡    一   6.303586
4            0  小明買了一杯咖啡    杯  11.824003
5            0  小明買了一杯咖啡   咖啡   9.538607
6            1  小明喝了一杯奶茶   小明  20.341637
7            1  小明喝了一杯奶茶    喝  16.194701
8            1  小明喝了一杯奶茶    了   5.740407
9            1  小明喝了一杯奶茶    一   6.609021
