In [None]:
!pip install https://github.com/kpu/kenlm/archive/master.zip

In [None]:
import os
import kenlm

In [None]:
# Refresh the package index, then install the compiler toolchain, CMake, Boost
# and Eigen, which are used when compiling the KenLM command-line tools below.
!apt-get update -y
!apt-get install -y build-essential cmake libboost-all-dev libeigen3-dev

In [None]:
!wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
!mkdir kenlm/build
%cd /content/kenlm/build
!cmake ..
!make -j2

In [None]:
# Sanity check: list the executables produced by the build.
!ls /content/kenlm/build/bin

In [None]:
!pip install datasets==3.6.0

In [None]:
from datasets import load_dataset
# Load the training split of the OpenWebText corpus.
ds = load_dataset(path="Skylion007/openwebtext", split="train")

In [None]:
# Only the first `max_docs` documents of the dataset (currently 2M) are written
# out. The resulting text file, one document per line, is what lmplz later
# reads from stdin.
max_docs = 2000000

# The encoding is given explicitly: web text may contain non-ASCII characters,
# and the runtime's default locale encoding is not guaranteed to be UTF-8.
with open("/content/drive/MyDrive/llm_eliciture/openwebtext_subset.txt", "w", encoding="utf-8") as f:
  for i, item in enumerate(ds):
    if i >= max_docs:
      break
    # Collapse every run of whitespace (including newlines) into a single
    # space so that each document occupies exactly one line.
    text = " ".join(item["text"].split())
    f.write(text + "\n")

In [None]:
# using a subset of oopoenwebtext (2M words) to prevent it from being killed again
!/content/kenlm/build/bin/lmplz \
--order 1 \
-S 6G \
--discount_fallback \
--skip_symbols \
< "/content/drive/MyDrive/llm_eliciture/openwebtext_subset.txt" > "/content/drive/MyDrive/llm_eliciture/3gram.arpa"

In [None]:
# Convert the ARPA text model into a KenLM binary file: the first argument is
# the input .arpa file, the second is the .bin file to create.
!/content/kenlm/build/bin/build_binary \
  "/content/drive/MyDrive/llm_eliciture/3gram.arpa" \
  "/content/drive/MyDrive/llm_eliciture/3gram.bin"

In [None]:
from datasets import load_dataset
# Load the training split of OpenWebText (same call as earlier in the notebook).
ds = load_dataset(path="Skylion007/openwebtext", split="train")

In [None]:
import math

def nats(log10p):
  """Convert a base-10 log probability into surprisal measured in nats.

  Args:
    log10p: log10 of a probability.

  Returns:
    The negated log probability rescaled from base 10 to base e.
  """
  ln_10 = math.log(10)
  surprisal = -log10p * ln_10
  return surprisal

def bits(log10p):
  """Convert a base-10 log probability into surprisal measured in bits.

  Args:
    log10p: log10 of a probability.

  Returns:
    The negated log probability rescaled from base 10 to base 2.
  """
  log10_of_2 = math.log10(2)
  surprisal = -log10p / log10_of_2
  return surprisal

In [None]:
# Read the unigram log10 probabilities from the \1-grams: section of the
# 2-gram ARPA file; they are used to compute single-word (unigram) surprisal.
# Alternatively, single words could be scored with the bigram model + backoff.
unigrams = {}

# The encoding is given explicitly so that non-ASCII vocabulary entries are
# read correctly regardless of the runtime's default locale.
with open("/content/drive/MyDrive/llm_eliciture/2gram.arpa", "r", encoding="utf-8") as f:
  in_unigrams = False
  for line in f:
    line = line.strip()

    if line == "\\1-grams:":
      in_unigrams = True
      continue
    if in_unigrams and line.startswith("\\"):
      # Any later backslash marker (\2-grams:, or \end\ in a unigram-only
      # file) closes the unigram section. Entry lines start with a number,
      # so they can never match this test.
      break

    if in_unigrams and line:
      # Entry layout: <log10 prob> <word> [<backoff weight>]
      parts = line.split()
      logp = float(parts[0])
      word = parts[1]
      unigrams[word] = logp

In [None]:
# Load the binary KenLM models used by region_surprisal:
# m2 (2gram.bin) scores two-word regions, m3 (3gram.bin) scores three-word regions.
m2 = kenlm.Model("/content/drive/MyDrive/llm_eliciture/2gram.bin")
m3 = kenlm.Model("/content/drive/MyDrive/llm_eliciture/3gram.bin")

In [None]:
def region_surprisal(region):
  """Return the surprisal, in nats, of a region made of one to three words.

  One-word regions are looked up in the `unigrams` table, two-word regions are
  scored with the bigram model `m2`, and three-word regions with the trigram
  model `m3`. For multi-word regions the per-word log10 scores are summed
  before being converted to nats.

  NOTE(review): multi-word regions are scored with bos=True, i.e. the first
  word is conditioned on the sentence-start symbol, whereas single words use
  context-free unigram probabilities. Confirm this asymmetry is intended.

  Args:
    region: whitespace-separated words.

  Returns:
    The region's surprisal in nats.

  Raises:
    ValueError: if the region does not contain between 1 and 3 words.
    KeyError: if a single-word region is not in `unigrams` and the table has
      no "<unk>" entry to fall back on.
  """
  tokens = region.split()
  region_length = len(tokens)

  if region_length == 1:
    # Look up the token itself, so surrounding whitespace in `region` cannot
    # cause a spurious miss.
    word = tokens[0]
    if word not in unigrams and "<unk>" in unigrams:
      # Score out-of-vocabulary words as <unk>, mirroring what KenLM does for
      # unknown words inside the multi-word regions below.
      word = "<unk>"
    return nats(unigrams[word])

  if region_length == 2:
    model = m2
  elif region_length == 3:
    model = m3
  else:
    raise ValueError("regions must contain between 1 and 3 words")

  # full_scores yields one entry per word (eos=False adds none for the
  # sentence end); the first field of each entry is the log10 probability.
  total_log10p = sum(step[0] for step in model.full_scores(region, bos=True, eos=False))
  return nats(total_log10p)

In [None]:
import pandas as pd

# Read the region table, turn the underscore-joined entries of "word" into
# space-separated text, then attach each region's surprisal in nats.
df = pd.read_csv("/content/drive/MyDrive/llm_eliciture/eliciture_regions.csv")
df = df.assign(region_word=df["word"].map(lambda raw: raw.replace("_", " ")))
df = df.assign(region_surprisal_nats=df["region_word"].map(region_surprisal))

In [None]:
# Save the table, including the added region_word and region_surprisal_nats
# columns, without writing the DataFrame index as an extra column.
df.to_csv("/content/drive/MyDrive/llm_eliciture/eliciture_regions_surprisal.csv", index=False)