In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from itertools import islice
import json
import gzip
import pandas as pd
import re
import string
import numpy as np

In [4]:
# df = pd.read_parquet("../Data/ppa_corpus_2025-02-03_1308/keywords_and_top_1000.parquet")
df = pd.read_csv("../Data/ppa_corpus_2025-02-03_1308/ppa_keyword_results_1000.csv")


In [5]:
df

Unnamed: 0,page_id,work_id,poetic_form,top_1000_word,counts,contexts,page_text,spelling
0,A01224.1,A01224,,right,1,,,
1,A01224.1,A01224,,excellent,1,,,
2,A01224.1,A01224,,french,1,,,
3,A01224.1,A01224,,non,3,,,
4,A01224.1,A01224,,latin,3,,,
...,...,...,...,...,...,...,...,...
45429237,yale.39002088447587.00000496,yale.39002088447587,,words,1,,,
45429238,yale.39002088447587.00000496,yale.39002088447587,,phrases,1,,,
45429239,yale.39002088447587.00000496,yale.39002088447587,,correct,1,,,
45429240,yale.39002088447587.00000496,yale.39002088447587,,peace,1,,,


In [6]:
df['poetic_form'] = np.where(
    df['poetic_form'].isna(),
    df['top_1000_word'],
    df['poetic_form']
)

df['spelling'] = np.where(
    df['spelling'].isna(),
    df['top_1000_word'],
    df['spelling']
)

In [7]:
df.to_parquet("../Data/ppa_corpus_2025-02-03_1308/ppa_keyword_results_1000_edited.parquet")

In [None]:
def extract_usage_representations(text, tokenizer, model, device="cpu", skip_stopwords=True):
    encoded = tokenizer(
        text,
        return_tensors="pt",
        return_offsets_mapping=True,
        truncation=True
    )
    
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    offsets = encoded["offset_mapping"][0]

    with torch.no_grad():
        output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )
        hidden_states = output.hidden_states  # (layers, batch, seq_len, hidden_size)

    all_layers = torch.stack(hidden_states, dim=0)  # (layers, batch, seq_len, hidden)
    summed = all_layers.sum(dim=0)[0]  # (seq_len, hidden_size)

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    special_tokens = set(tokenizer.all_special_tokens)

    usage_vectors = []
    current_word = ""
    current_vecs = []
    current_start = None
    current_end = None

    STOPWORDS = {
       "the", "and", "for", "but", "with", "that", "this", "from", "not",
      "you", "are", "was", "were", "have", "has", "had", "she", "he", "they",
       "his", "her", "its", "our", "their", "will", "would", "can", "could"
    }

    for i, token in enumerate(tokens):
        # skip special tokens
        if token in special_tokens:
            continue

        # detect new word
        is_new_word = token.startswith("Ġ") or i == 0 or token.startswith("Ċ")

        if is_new_word and current_word:
            # aggregate previous word
            word_vec = torch.stack(current_vecs).mean(dim=0)

            # strip punctuation + lowercase
            clean_word = current_word.lower().strip(string.punctuation)

            if clean_word and (not skip_stopwords or clean_word not in STOPWORDS):
                usage_vectors.append({
                    "word": clean_word,
                    "vector": word_vec.cpu(),
                    "char_start": current_start,
                    "char_end": current_end
                })

            current_vecs = []

        if is_new_word:
            current_word = token.lstrip("ĠĊ")
            current_start = offsets[i][0].item()
            current_end = offsets[i][1].item()
            current_vecs.append(summed[i])
        else:
            current_word += token
            current_end = offsets[i][1].item()
            current_vecs.append(summed[i])

    # Handle last word
    if current_word:
        word_vec = torch.stack(current_vecs).mean(dim=0)
        clean_word = current_word.lower().strip(string.punctuation)
        if clean_word and (not skip_stopwords or clean_word not in STOPWORDS):
            usage_vectors.append({
                "word": clean_word,
                "vector": word_vec.cpu(),
                "char_start": current_start,
                "char_end": current_end
            })

    return usage_vectors

In [4]:
def page_iter(pages_file):
   # Yield pages one at a time from gzipped JSON lines file for memory efficiency
   with gzip.open(pages_file, 'rt', encoding='utf-8') as fh:
       for line in fh:
           yield json.loads(line)





In [5]:


from transformers import AutoTokenizer, AutoModelForMaskedLM

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)



Testing the Tokenizer against OCRd text


In [6]:
## Finding a page with some clear OCR errors

TARGET_COLLECTIONS = {"Literary", "Linguistic"}
with open("Data/ppa_corpus_2025-02-03_1308/ppa_metadata.json") as f:
    metadata = json.load(f)

metadata_index = {
    entry["work_id"]: entry for entry in metadata
    if "collections" in entry and any(c in TARGET_COLLECTIONS for c in entry["collections"])

}

for example in tqdm(islice(page_iter("Data/ppa_corpus_2025-02-03_1308/ppa_pages.jsonl.gz"), 10)):
    wid = example.get("work_id")
    if wid in metadata_index:
        print(example)

10it [00:00, 4724.38it/s]

{'label': '[1]', 'tags': ['dedication'], 'id': 'A01224.1', 'order': 1, 'text': '\nTo the Right excellent and most honorable Ladie, the Ladie Marie, Countesse of Pembroke.\nVOi, pia nympha, tuum, quem tolse la morte, Philippū,\nAEdentem llenas coelestis melle palabras.\nItalicum lumen, flowre of Fraunce, splendor Iberus,\nItalicus Tasso, French Salust, Boscan Iberus,\n〈 in non-Latin alphabet 〉 Virgil, 〈 in non-Latin alphabet 〉,\nGreekish Homer, tanto lati iunguntur 〈 in non-Latin alphabet 〉.\nYour Honors most affectionate. Abraham Fraunce.\n\n\n\n\n', 'work_id': 'A01224'}
{'label': '[10]', 'tags': ['book'], 'id': 'A01224.10', 'order': 10, 'text': "Boscan 3. Booke.\nLos altares delante estauan puestos,\nArdiendo encima d'ellos toda Arabia.\n\n\nCap. 5. Of the Metonymia of the adiunct.\nA Metonymia of the adiunct, is, when by the adiunct we expresse the subiect. So the names of vertues & vices are vsed for vertuous or vicious men: the signe for the thing which it doth signifie: the adiū




In [7]:

## found in siddarth's dataframe row 1463

example_text = "The Epi≈øodes of a Tragedy, ought\nto be infinitely fhorter, than thofe of an Epick Poem,\nfor thefe two reafons: The firft is, That Tragedy is\nmuch shorter, fince 'tis confined to one s: The firft is, That Tragedy is\nmuch shorter, fince 'tis confined to one Courfe of the\nSun, but an Epick Poem has no fet time. The fe-\ncond is, becaufe Tragedy is a repre≈øentation, and\nEpick Poem a recita 'e\nSun, but an Epick Poem has no fet time. The fe-\ncond is, becaufe Tragedy is a repre≈øentation, and\nEpick Poem a recitation, and this is the reafon why\nit ought to be extended and amplified by its E-\npifod"



cleaned_text = "The Episodes of a Tragedy, ought to be infinitely shorter, than those of an Epic Poem, for these two reasons: The first is, That Tragedy is much shorter, since 'tis confined to one course of the Sun, but an Epic Poem has no set time. The second is, because Tragedy is a representation, and Epic Poem a recitation, and this is the reason why it ought to be extended and amplified by its Episode"


In [8]:
## what the associated word reconstruction from the clean text looks like

cleaned_list = extract_usage_representations(cleaned_text, tokenizer, model)


cleaned_words = [entry["word"] for entry in cleaned_list]

print(cleaned_words)

['episodes', 'of', 'a', 'tragedy', 'ought', 'to', 'be', 'infinitely', 'shorter', 'than', 'those', 'of', 'an', 'epic', 'poem', 'these', 'two', 'reasons', 'first', 'is', 'tragedy', 'is', 'much', 'shorter', 'since', 'tis', 'confined', 'to', 'one', 'course', 'of', 'sun', 'an', 'epic', 'poem', 'no', 'set', 'time', 'second', 'is', 'because', 'tragedy', 'is', 'a', 'representation', 'epic', 'poem', 'a', 'recitation', 'is', 'reason', 'why', 'it', 'ought', 'to', 'be', 'extended', 'amplified', 'by', 'episode']


In [9]:
## what the associated word reconstruction from the example text looks like

example_list = extract_usage_representations(example_text, tokenizer, model)


example_words = [entry["word"] for entry in example_list]

print(example_words)

['epiâīīã¸odes', 'of', 'a', 'tragedy', 'oughtċto', 'be', 'infinitely', 'fhorter', 'than', 'thofe', 'of', 'an', 'epick', 'poem,ċfor', 'thefe', 'two', 'reafons', 'firft', 'is', 'tragedy', 'isċmuch', 'shorter', 'fince', 'tis', 'confined', 'to', 'one', 's', 'firft', 'is', 'tragedy', 'isċmuch', 'shorter', 'fince', 'tis', 'confined', 'to', 'one', 'courfe', 'of', 'theċsun', 'an', 'epick', 'poem', 'no', 'fet', 'time', 'fe-ċcond', 'is', 'becaufe', 'tragedy', 'is', 'a', 'repreâīīã¸entation', 'andċepick', 'poem', 'a', 'recita', 'eċsun', 'an', 'epick', 'poem', 'no', 'fet', 'time', 'fe-ċcond', 'is', 'becaufe', 'tragedy', 'is', 'a', 'repreâīīã¸entation', 'andċepick', 'poem', 'a', 'recitation', 'is', 'reafon', 'whyċit', 'ought', 'to', 'be', 'extended', 'amplified', 'by', 'e-ċpifod']


In [74]:
cleaned_list[14]

{'word': 'epic',
 'vector': tensor([ 3.4044e+01, -4.8486e+01, -6.2698e+01,  3.9932e+01,  9.5143e+00,
         -8.2894e+01,  3.8096e+01,  5.1511e+00,  5.8480e+01, -3.4456e+01,
          6.3046e+01, -2.3412e+00, -7.3447e+01,  1.0089e+02, -1.1287e+02,
          1.0532e+02, -3.5047e+01, -3.2392e+00, -7.6470e+01, -6.7887e+01,
          4.9302e+01, -2.5757e+01, -7.3687e+00, -1.0590e+02,  1.0167e+01,
          1.4516e+01,  3.6064e+01,  4.9502e+01, -1.0230e+02, -4.6884e+01,
         -8.0251e+00, -4.1092e+02, -9.4452e+00, -1.0868e+02, -4.6609e+00,
         -1.9703e+01, -2.7092e+01, -2.3705e+01, -2.5462e+01,  5.7673e+01,
         -6.6784e+00, -3.5815e+01, -2.6397e+01,  4.8072e+01, -4.7943e+01,
          4.8018e+01, -2.2998e+01,  8.3516e+01, -3.9930e+01,  7.0841e+01,
          1.5261e+01, -4.7520e+01, -2.1284e+01,  5.0030e+01, -3.1565e+01,
         -5.6257e+01,  1.0419e+02, -4.8078e+01,  1.5623e+01,  4.9759e+01,
         -5.9049e+00, -5.6126e+00,  1.6446e+01,  3.6628e+01,  4.0708e+01,
          1

In [None]:
# comparing epic across both vectors

import torch.nn.functional as F
vec1 = example_list[13]["vector"]
vec2 = cleaned_list[14]["vector"]

cos_sim = F.cosine_similarity(vec1.unsqueeze(0), vec2.unsqueeze(0)).item()
print("Cosine similarity:", cos_sim)

### only 90% SIMILARITY, so bad OCR vs good OCR is leading to 10% difference in DIRECTION for the embedding vector

Cosine similarity: 0.9134914875030518


In [77]:


TARGET_COLLECTIONS = {"Literary", "Linguistic"}



In [78]:
import os
os.getcwd()

'/Users/ngupta1/Desktop/GitHub/PPA-Word-Embeddings'

In [79]:

TARGET_COLLECTIONS = {"Literary", "Linguistic"}
with open("Data/ppa_corpus_2025-02-03_1308/ppa_metadata.json") as f:
    metadata = json.load(f)

metadata_index = {
    entry["work_id"]: entry for entry in metadata
    if "collections" in entry and any(c in TARGET_COLLECTIONS for c in entry["collections"])
}

In [80]:
# Process corpus

def is_semantically_meaningful(token):
    token = token.lower()
    STOPWORDS = {
        "the", "and", "for", "but", "with", "that", "this", "from", "not",
        "you", "are", "was", "were", "have", "has", "had", "she", "he", "they",
        "his", "her", "its", "our", "their", "will", "would", "can", "could"
    }
    return (
        token.isalpha() and
        len(token) > 2 and
        token not in STOPWORDS
    )

output = []


# Save output

for example in tqdm(islice(page_iter("Data/ppa_corpus_2025-02-03_1308/ppa_pages.jsonl.gz"), 1000)):
        text = example.get("text")
        pid = example.get("id")
        wid = example.get("work_id")

        if not text or wid not in metadata_index:
            continue
        meta = metadata_index[wid]
        pub_year = meta.get("pub_year")
        collections = meta.get("collections")

        for word_info in extract_usage_representations(text, tokenizer, model, device=DEVICE):
            word = word_info['word']
            output.append({
                "word": word,
                "usage_vector": word_info["vector"].tolist(),
                "char_start": word_info["char_start"],
                "char_end": word_info["char_end"], 
                "id": pid,
                "work_id": wid,
                })






### read in output
## filter down to target words
## remerge 
## BGAK 


1000it [04:08,  4.03it/s]


In [10]:

len(output[0]['usage_vector'])

768

In [53]:
df = pd.DataFrame(output)

In [38]:
for i, example in enumerate(page_iter("Data/ppa_corpus_2025-02-03_1308/ppa_pages.jsonl.gz")):
    print(example)
    if i >= 5:
        break

{'label': '[1]', 'tags': ['dedication'], 'id': 'A01224.1', 'order': 1, 'text': '\nTo the Right excellent and most honorable Ladie, the Ladie Marie, Countesse of Pembroke.\nVOi, pia nympha, tuum, quem tolse la morte, Philippū,\nAEdentem llenas coelestis melle palabras.\nItalicum lumen, flowre of Fraunce, splendor Iberus,\nItalicus Tasso, French Salust, Boscan Iberus,\n〈 in non-Latin alphabet 〉 Virgil, 〈 in non-Latin alphabet 〉,\nGreekish Homer, tanto lati iunguntur 〈 in non-Latin alphabet 〉.\nYour Honors most affectionate. Abraham Fraunce.\n\n\n\n\n', 'work_id': 'A01224'}
{'label': '[10]', 'tags': ['book'], 'id': 'A01224.10', 'order': 10, 'text': "Boscan 3. Booke.\nLos altares delante estauan puestos,\nArdiendo encima d'ellos toda Arabia.\n\n\nCap. 5. Of the Metonymia of the adiunct.\nA Metonymia of the adiunct, is, when by the adiunct we expresse the subiect. So the names of vertues & vices are vsed for vertuous or vicious men: the signe for the thing which it doth signifie: the adiū

In [74]:
def is_semantically_meaningful(token):
    token = token.lower()
    STOPWORDS = {
        "the", "and", "for", "but", "with", "that", "this", "from", "not",
        "you", "are", "was", "were", "have", "has", "had", "she", "he", "they",
        "his", "her", "its", "our", "their", "will", "would", "can", "could"
    }
    return (
        token.isalpha() and
        len(token) > 2 and
        token not in STOPWORDS
    )


total_semantic_word_count = 0
for i, line in enumerate(page_iter("Data/ppa_corpus_2025-02-03_1308/ppa_pages.jsonl.gz")):
    if 'text' in line and line['text']:
        tokens = line['text'].split()
        for token in tokens:
            if is_semantically_meaningful(token):
                total_semantic_word_count += 1

print(total_semantic_word_count)

248308639


In [81]:
import json

size_bytes = sum(len(json.dumps(item).encode("utf-8")) + 1 for item in output)  # +1 for newline
size_mb = size_bytes / (1024 * 1024)

print(f"Estimated JSONL size: {size_mb:.2f} MB")

Estimated JSONL size: 3618.22 MB


In [17]:
all_pages = 0
for i, line in enumerate(page_iter("Data/ppa_corpus_2025-02-03_1308/ppa_pages.jsonl.gz")):
    if line['work_id'] in metadata_index:
        all_pages += 1

all_pages

1939462