# PPA Word Embeddings — Test and Visualize Embeddings with Altair
August 17, 2025  
MW

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from itertools import islice
import json
import gzip
import pandas as pd
import re
import altair as alt
import numpy as np
import string
import random
import umap


# Extract contextual embeddings for each word/token

Define function to extract contextual embeddings for each word/token

MW note: With the previous code, we were including punctuation at the end of tokens, like "ballad." or "ballad,". I made an adjust that should fix it.

In [2]:
def extract_usage_representations(text, tokenizer, model, device="cpu", skip_stopwords=True):
    encoded = tokenizer(
        text,
        return_tensors="pt",
        return_offsets_mapping=True,
        truncation=True
    )
    
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    offsets = encoded["offset_mapping"][0]

    with torch.no_grad():
        output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )
        hidden_states = output.hidden_states  # (layers, batch, seq_len, hidden_size)

    all_layers = torch.stack(hidden_states, dim=0)  # (layers, batch, seq_len, hidden)
    summed = all_layers.sum(dim=0)[0]  # (seq_len, hidden_size)

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    special_tokens = set(tokenizer.all_special_tokens)

    usage_vectors = []
    current_word = ""
    current_vecs = []
    current_start = None
    current_end = None

    STOPWORDS = {
       "the", "and", "for", "but", "with", "that", "this", "from", "not",
      "you", "are", "was", "were", "have", "has", "had", "she", "he", "they",
       "his", "her", "its", "our", "their", "will", "would", "can", "could"
    }

    for i, token in enumerate(tokens):
        # skip special tokens
        if token in special_tokens:
            continue

        # detect new word
        is_new_word = token.startswith("Ġ") or i == 0

        if is_new_word and current_word:
            # aggregate previous word
            word_vec = torch.stack(current_vecs).mean(dim=0)

            # strip punctuation + lowercase
            clean_word = current_word.lower().strip(string.punctuation)

            if clean_word and (not skip_stopwords or clean_word not in STOPWORDS):
                usage_vectors.append({
                    "word": clean_word,
                    "vector": word_vec.cpu(),
                    "char_start": current_start,
                    "char_end": current_end
                })

            current_vecs = []

        if is_new_word:
            current_word = token.lstrip("Ġ")
            current_start = offsets[i][0].item()
            current_end = offsets[i][1].item()
            current_vecs.append(summed[i])
        else:
            current_word += token
            current_end = offsets[i][1].item()
            current_vecs.append(summed[i])

    # Handle last word
    if current_word:
        word_vec = torch.stack(current_vecs).mean(dim=0)
        clean_word = current_word.lower().strip(string.punctuation)
        if clean_word and (not skip_stopwords or clean_word not in STOPWORDS):
            usage_vectors.append({
                "word": clean_word,
                "vector": word_vec.cpu(),
                "char_start": current_start,
                "char_end": current_end
            })

    return usage_vectors

# Unzip each PPA page and load the JSON data

In [3]:
def page_iter(pages_file):
   # Yield pages one at a time from gzipped JSON lines file for memory efficiency
   with gzip.open(pages_file, 'rt', encoding='utf-8') as fh:
       for line in fh:
           yield json.loads(line)



# Load ModernBERT tokenizer and model

In [4]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)



# Test on N Pages

I made an adjustment to keep all metadata fields, like title, author, and publication date, so we can include them in analysis and testing. It's hard for me to tell whether something makes sense otherwise.

In [5]:
TARGET_COLLECTIONS = {"Literary", "Linguistic"}
with open("Data/ppa_corpus_2025-02-03_1308/ppa_metadata.json") as f:
    metadata = json.load(f)

# index by work_id, but keep full entry
metadata_index = {
    entry["work_id"]: entry
    for entry in metadata
    if "collections" in entry and any(c in TARGET_COLLECTIONS for c in entry["collections"])
}

# Parameters to tweak

In [6]:
CORPUS_PATH = "Data/ppa_corpus_2025-02-03_1308/ppa_pages.jsonl.gz"

NUM_PAGES_TO_PROCESS = 250          # set higher once everything works
CONTEXT_PADDING = 75                # chars on each side in the context snippet
TARGET_WORDS = None                 # e.g., {"ballad", "sonnet"} or None for all


In [7]:
# === Text helpers ===
def normalize_whitespace(text: str) -> str:
    return re.sub(r"\s+", " ", text).strip()

def create_context_snippet(full_text: str, char_start: int, char_end: int, pad: int = CONTEXT_PADDING) -> str:
    """Return a compact context snippet with the span in **bold**."""
    if not isinstance(full_text, str) or full_text == "":
        return ""
    if not isinstance(char_start, int) or not isinstance(char_end, int):
        # Char offsets might come in as numpy types
        try:
            char_start = int(round(char_start))
            char_end = int(round(char_end))
        except Exception:
            return ""

    text_len = len(full_text)
    char_start = max(0, min(char_start, text_len))
    char_end   = max(0, min(char_end, text_len))
    if not (0 <= char_start < char_end <= text_len):
        return ""

    snippet_start = max(0, char_start - pad)
    snippet_end   = min(text_len, char_end + pad)

    text_before = normalize_whitespace(full_text[snippet_start:char_start])
    text_middle = normalize_whitespace(full_text[char_start:char_end])
    text_after  = normalize_whitespace(full_text[char_end:snippet_end])

    return f"{text_before} **{text_middle}** {text_after}"



# Process Pages and Grab Embeddings

Try random sample so it's not all 1500s and EEBO

In [8]:
extracted_records = []

# load everything into a list
all_pages = list(page_iter(CORPUS_PATH))

# choose N random pages
sampled_pages = random.sample(all_pages,  NUM_PAGES_TO_PROCESS)

# iterate by random sample of N pages
for page_record in sampled_pages:

# Iterate by first N number of pages
# for page_record in islice(page_iter(CORPUS_PATH), NUM_PAGES_TO_PROCESS):
    page_text = page_record.get("text")
    page_id = page_record.get("id")
    work_id = page_record.get("work_id")

    if not page_text or work_id not in metadata_index:
        continue

    work_meta = metadata_index[work_id]  # dict with title, author, pub_year, etc.

    for usage in extract_usage_representations(page_text, tokenizer, model, device=DEVICE):
        word = usage.get("word")
        if TARGET_WORDS is not None and word not in TARGET_WORDS:
            continue

        char_start = usage.get("char_start")
        char_end   = usage.get("char_end")

        context_snippet = create_context_snippet(page_text, char_start, char_end, pad=CONTEXT_PADDING)

        # Build record: base word info + flattened metadata
        record = {
            "word": word,
            "usage_vector": (usage.get("vector").tolist()
                             if hasattr(usage.get("vector"), "tolist")
                             else usage.get("vector")),
            "char_start": char_start,
            "char_end": char_end,
            "id": page_id,
            "work_id": work_id,
            "context_snippet": context_snippet,
        }

        # Merge in metadata fields
        record.update({
            "title": work_meta.get("title"),
            "author": work_meta.get("author"),
            "pub_year": work_meta.get("pub_year"),
            "pub_place": work_meta.get("pub_place"),
            "publisher": work_meta.get("publisher"),
            "source_id": work_meta.get("source_id"),
            "source_url": work_meta.get("source_url"),
            "source": work_meta.get("source"),
            "work_type": work_meta.get("work_type"),
            "collections": work_meta.get("collections"),
            "cluster_id": work_meta.get("cluster_id"),
        })

        extracted_records.append(record)

Make dataframe

In [9]:
word_usage_df = pd.DataFrame(extracted_records)


In [10]:
word_usage_df

Unnamed: 0,word,usage_vector,char_start,char_end,id,work_id,context_snippet,title,author,pub_year,pub_place,publisher,source_id,source_url,source,work_type,collections,cluster_id
0,place,"[-60.580474853515625, 55.52336883544922, 7.747...",3.0,9,mdp.39015021774883.00000011,mdp.39015021774883,THE **PLACE** OF LITERATURE. 7 contemplation o...,Literature in school:,"Scudder, Horace Elisha, 1838-1902",1888,Boston,"Houghton, Mifflin and company",mdp.39015021774883,https://hdl.handle.net/2027/mdp.39015021774883,HathiTrust,full-work,[Linguistic],mdp.39015021774883
1,of,"[-17.78848648071289, -35.76409912109375, -16.6...",9.0,12,mdp.39015021774883.00000011,mdp.39015021774883,THE PLACE **OF** LITERATURE. 7 contemplation o...,Literature in school:,"Scudder, Horace Elisha, 1838-1902",1888,Boston,"Houghton, Mifflin and company",mdp.39015021774883,https://hdl.handle.net/2027/mdp.39015021774883,HathiTrust,full-work,[Linguistic],mdp.39015021774883
2,literature.ċ7ċcontemplation,"[-46.980201721191406, -22.354145050048828, -10...",12.0,40,mdp.39015021774883.00000011,mdp.39015021774883,THE PLACE OF **LITERATURE. 7 contemplation** o...,Literature in school:,"Scudder, Horace Elisha, 1838-1902",1888,Boston,"Houghton, Mifflin and company",mdp.39015021774883,https://hdl.handle.net/2027/mdp.39015021774883,HathiTrust,full-work,[Linguistic],mdp.39015021774883
3,of,"[29.396236419677734, -44.41532897949219, 61.92...",40.0,43,mdp.39015021774883.00000011,mdp.39015021774883,THE PLACE OF LITERATURE. 7 contemplation **of*...,Literature in school:,"Scudder, Horace Elisha, 1838-1902",1888,Boston,"Houghton, Mifflin and company",mdp.39015021774883,https://hdl.handle.net/2027/mdp.39015021774883,HathiTrust,full-work,[Linguistic],mdp.39015021774883
4,those,"[61.783241271972656, 4.421317100524902, -34.86...",43.0,49,mdp.39015021774883.00000011,mdp.39015021774883,THE PLACE OF LITERATURE. 7 contemplation of **...,Literature in school:,"Scudder, Horace Elisha, 1838-1902",1888,Boston,"Houghton, Mifflin and company",mdp.39015021774883,https://hdl.handle.net/2027/mdp.39015021774883,HathiTrust,full-work,[Linguistic],mdp.39015021774883
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56443,at,"[-49.87699890136719, -67.54403686523438, 50.85...",1221.0,1224,mdp.49015000900655.00000197,mdp.49015000900655,"calling us by name, Struggling with failing br...",The science and art of elocution;,"Fenno, Frank Honywell, 1857-",1878,Philadelphia,J. E. Porter & company,mdp.49015000900655,https://hdl.handle.net/2027/mdp.49015000900655,HathiTrust,full-work,"[Linguistic, Typographically Unique]",fennoscience
56444,postċwhere,"[-47.702911376953125, -55.42826461791992, -53....",1228.0,1239,mdp.49015000900655.00000197,mdp.49015000900655,"g us by name, Struggling with failing breath T...",The science and art of elocution;,"Fenno, Frank Honywell, 1857-",1878,Philadelphia,J. E. Porter & company,mdp.49015000900655,https://hdl.handle.net/2027/mdp.49015000900655,HathiTrust,full-work,"[Linguistic, Typographically Unique]",fennoscience
56445,glory,"[-78.4246826171875, -5.383387565612793, -28.63...",1239.0,1245,mdp.49015000900655.00000197,mdp.49015000900655,"e, Struggling with failing breath To keep thei...",The science and art of elocution;,"Fenno, Frank Honywell, 1857-",1878,Philadelphia,J. E. Porter & company,mdp.49015000900655,https://hdl.handle.net/2027/mdp.49015000900655,HathiTrust,full-work,"[Linguistic, Typographically Unique]",fennoscience
56446,strove,"[-15.757892608642578, -37.12873077392578, 54.3...",1245.0,1252,mdp.49015000900655.00000197,mdp.49015000900655,uggling with failing breath To keep their ship...,The science and art of elocution;,"Fenno, Frank Honywell, 1857-",1878,Philadelphia,J. E. Porter & company,mdp.49015000900655,https://hdl.handle.net/2027/mdp.49015000900655,HathiTrust,full-work,"[Linguistic, Typographically Unique]",fennoscience


# Transform embeddings to two dimensions so we can plot and compare them

In [11]:
# 1. pick out rows with real vectors
valid_indices = [i for i, v in word_usage_df["usage_vector"].items() if isinstance(v, (list, np.ndarray))]

# 2. stack them into a 2D array
vectors = [np.asarray(word_usage_df.at[i, "usage_vector"], dtype=float) for i in valid_indices]
vector_matrix = np.stack(vectors)

# 3. run UMAP
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine", random_state=42)
coords_2d = reducer.fit_transform(vector_matrix)   # shape (len(valid_indices), 2)

# 4. assign back into dataframe
word_usage_df.loc[valid_indices, "x"] = coords_2d[:, 0]
word_usage_df.loc[valid_indices, "y"] = coords_2d[:, 1]

  warn(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
word_usage_df

Unnamed: 0,word,usage_vector,char_start,char_end,id,work_id,context_snippet,title,author,pub_year,pub_place,publisher,source_id,source_url,source,work_type,collections,cluster_id,x,y
0,place,"[-60.580474853515625, 55.52336883544922, 7.747...",3.0,9,mdp.39015021774883.00000011,mdp.39015021774883,THE **PLACE** OF LITERATURE. 7 contemplation o...,Literature in school:,"Scudder, Horace Elisha, 1838-1902",1888,Boston,"Houghton, Mifflin and company",mdp.39015021774883,https://hdl.handle.net/2027/mdp.39015021774883,HathiTrust,full-work,[Linguistic],mdp.39015021774883,0.717142,1.577780
1,of,"[-17.78848648071289, -35.76409912109375, -16.6...",9.0,12,mdp.39015021774883.00000011,mdp.39015021774883,THE PLACE **OF** LITERATURE. 7 contemplation o...,Literature in school:,"Scudder, Horace Elisha, 1838-1902",1888,Boston,"Houghton, Mifflin and company",mdp.39015021774883,https://hdl.handle.net/2027/mdp.39015021774883,HathiTrust,full-work,[Linguistic],mdp.39015021774883,19.264975,4.040833
2,literature.ċ7ċcontemplation,"[-46.980201721191406, -22.354145050048828, -10...",12.0,40,mdp.39015021774883.00000011,mdp.39015021774883,THE PLACE OF **LITERATURE. 7 contemplation** o...,Literature in school:,"Scudder, Horace Elisha, 1838-1902",1888,Boston,"Houghton, Mifflin and company",mdp.39015021774883,https://hdl.handle.net/2027/mdp.39015021774883,HathiTrust,full-work,[Linguistic],mdp.39015021774883,0.830571,1.336475
3,of,"[29.396236419677734, -44.41532897949219, 61.92...",40.0,43,mdp.39015021774883.00000011,mdp.39015021774883,THE PLACE OF LITERATURE. 7 contemplation **of*...,Literature in school:,"Scudder, Horace Elisha, 1838-1902",1888,Boston,"Houghton, Mifflin and company",mdp.39015021774883,https://hdl.handle.net/2027/mdp.39015021774883,HathiTrust,full-work,[Linguistic],mdp.39015021774883,-1.072388,13.947363
4,those,"[61.783241271972656, 4.421317100524902, -34.86...",43.0,49,mdp.39015021774883.00000011,mdp.39015021774883,THE PLACE OF LITERATURE. 7 contemplation of **...,Literature in school:,"Scudder, Horace Elisha, 1838-1902",1888,Boston,"Houghton, Mifflin and company",mdp.39015021774883,https://hdl.handle.net/2027/mdp.39015021774883,HathiTrust,full-work,[Linguistic],mdp.39015021774883,2.899079,11.380541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56443,at,"[-49.87699890136719, -67.54403686523438, 50.85...",1221.0,1224,mdp.49015000900655.00000197,mdp.49015000900655,"calling us by name, Struggling with failing br...",The science and art of elocution;,"Fenno, Frank Honywell, 1857-",1878,Philadelphia,J. E. Porter & company,mdp.49015000900655,https://hdl.handle.net/2027/mdp.49015000900655,HathiTrust,full-work,"[Linguistic, Typographically Unique]",fennoscience,4.210336,12.682718
56444,postċwhere,"[-47.702911376953125, -55.42826461791992, -53....",1228.0,1239,mdp.49015000900655.00000197,mdp.49015000900655,"g us by name, Struggling with failing breath T...",The science and art of elocution;,"Fenno, Frank Honywell, 1857-",1878,Philadelphia,J. E. Porter & company,mdp.49015000900655,https://hdl.handle.net/2027/mdp.49015000900655,HathiTrust,full-work,"[Linguistic, Typographically Unique]",fennoscience,1.935579,-0.815725
56445,glory,"[-78.4246826171875, -5.383387565612793, -28.63...",1239.0,1245,mdp.49015000900655.00000197,mdp.49015000900655,"e, Struggling with failing breath To keep thei...",The science and art of elocution;,"Fenno, Frank Honywell, 1857-",1878,Philadelphia,J. E. Porter & company,mdp.49015000900655,https://hdl.handle.net/2027/mdp.49015000900655,HathiTrust,full-work,"[Linguistic, Typographically Unique]",fennoscience,6.181747,1.575766
56446,strove,"[-15.757892608642578, -37.12873077392578, 54.3...",1245.0,1252,mdp.49015000900655.00000197,mdp.49015000900655,uggling with failing breath To keep their ship...,The science and art of elocution;,"Fenno, Frank Honywell, 1857-",1878,Philadelphia,J. E. Porter & company,mdp.49015000900655,https://hdl.handle.net/2027/mdp.49015000900655,HathiTrust,full-work,"[Linguistic, Typographically Unique]",fennoscience,4.311954,0.501458


In [13]:
pd.options.display.max_rows = 400

In [14]:
word_usage_df['word'].value_counts()[:300]

word
of           2434
to           1599
a            1411
in           1363
is            817
as            547
it            529
i             478
or            437
be            436
by            391
which         374
ċ             280
an            243
on            237
all           232
we            224
one           216
at            204
ċġċ           165
him           150
may           149
who           136
what          128
s             127
more          126
when          121
so            121
if            120
no            120
my            110
other         110
there         106
them          102
been          101
any            99
into           93
words          91
ii             88
us             82
me             79
man            77
such           76
only           76
thou           75
v              75
these          74
than           74
do             73
great          73
two            73
some           72
word           71
like           70
now            69
first

In [15]:
alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

In [16]:
# Avoid special 'title' key in tooltip
# word_usage_df = word_usage_df.rename(columns={"work_title": "title"})


In [None]:
chart = (
    alt.Chart(word_usage_df[:25000], title=f"PPA Word Embeddings ({NUM_PAGES_TO_PROCESS} Pages)")
    .mark_circle(size=60, opacity=0.8)
    .encode(

        x=alt.X("x:Q", scale=alt.Scale(zero=False)),
        y=alt.Y("y:Q", scale=alt.Scale(zero=False)),

        color="word:N",  

        tooltip=[
            alt.Tooltip("word:N"),
            alt.Tooltip("title"),
            alt.Tooltip("author"),
            alt.Tooltip("pub_year"),
            alt.Tooltip("id:N"),
             alt.Tooltip("context_snippet"),
            alt.Tooltip("work_id:N"),
            alt.Tooltip("char_start:Q"),
            alt.Tooltip("char_end:Q"),
        ],
    )
    .interactive()
    .properties(width=600, height=600)
)

chart

In [49]:
chart.save(f'PPA-{NUM_PAGES_TO_PROCESS}-random-pages.html')


# Selected Words

In [42]:
# selected_words = "ballad|ballade|ballades|epic|\bode\b|\bodes\b|sonnet|poem|poetry|haiku|prose|verse|villanelle"
selected_words = "ballad|ballade|ballades|epic|\bode\b|\bodes\b|sonnet|haiku|villanelle"

In [43]:
selected_words_df = word_usage_df[word_usage_df['word'].str.contains(selected_words)]

In [44]:
selected_words_df[:100]

Unnamed: 0,word,usage_vector,char_start,char_end,id,work_id,context_snippet,title,author,pub_year,pub_place,publisher,source_id,source_url,source,work_type,collections,cluster_id,x,y
10109,ballads,"[6.527761936187744, -18.135896682739258, 51.18...",61.0,69,uc1.b3861089.00000502,uc1.b3861089,"470 SONG, SENTIMENT, AND FANCY. Toru Dutt: 185...",Victorian poets;,"Stedman, Edmund Clarence, 1833-1908",1915,Boston,"Houghton, Mifflin",uc1.b3861089,https://hdl.handle.net/2027/uc1.b3861089,HathiTrust,full-work,[Literary],stedmanvictorian,3.414447,1.680198
10263,balladist,"[-33.23130798339844, 6.127866268157959, 22.657...",1334.0,1345,uc1.b3861089.00000502,uc1.b3861089,he world were colder. The later songs of Marzi...,Victorian poets;,"Stedman, Edmund Clarence, 1833-1908",1915,Boston,"Houghton, Mifflin",uc1.b3861089,https://hdl.handle.net/2027/uc1.b3861089,HathiTrust,full-work,[Literary],stedmanvictorian,3.6459,0.861837
24649,"hastings,ċballade","[11.652135848999023, 10.932976722717285, 113.0...",65.0,83,CW0116299725.0323,CW0116299725,"THE POETICAL WORKS Ов THOMAS CHATTERTON. ELLA,...",A complete edition of the poets of Great Brita...,,1792,London,"John and Arthur Arch, and for Bell and Bradfut...",CW0116299725,https://link.gale.com/apps/doc/CW0116299725/EC...,Gale,full-work,[Literary],CW0116299725,13.836024,3.527303
26446,sonnets,"[-46.92252731323242, 37.66505813598633, -23.86...",1588.0,1597,mdp.39015017644959.00000630,mdp.39015017644959,"chnical point of view, may be counted irreproa...",The English poets;,"Ward, Thomas Humphry, 1845-1926",1883,London,Macmillan and co.,mdp.39015017644959,https://hdl.handle.net/2027/mdp.39015017644959,HathiTrust,full-work,[Literary],wardenglish4,2.903377,1.004287
26452,herċsonnets,"[-1.5658226013183594, -27.013193130493164, -25...",1632.0,1644,mdp.39015017644959.00000630,mdp.39015017644959,"oachable, may, if we except the Sonnets, almos...",The English poets;,"Ward, Thomas Humphry, 1845-1926",1883,London,Macmillan and co.,mdp.39015017644959,https://hdl.handle.net/2027/mdp.39015017644959,HathiTrust,full-work,[Literary],wardenglish4,2.987388,0.91464
26467,sonnetsċfrom,"[-24.603464126586914, -16.128515243530273, -20...",1756.0,1769,mdp.39015017644959.00000630,mdp.39015017644959,produced. Perhaps indeed her greatest poetic s...,The English poets;,"Ward, Thomas Humphry, 1845-1926",1883,London,Macmillan and co.,mdp.39015017644959,https://hdl.handle.net/2027/mdp.39015017644959,HathiTrust,full-work,[Literary],wardenglish4,2.92155,0.836014
26468,"portuguese,-sonnets","[-52.13423538208008, 31.61355972290039, -44.25...",1773.0,1794,mdp.39015017644959.00000630,mdp.39015017644959,s indeed her greatest poetic success is to be ...,The English poets;,"Ward, Thomas Humphry, 1845-1926",1883,London,Macmillan and co.,mdp.39015017644959,https://hdl.handle.net/2027/mdp.39015017644959,HathiTrust,full-work,[Literary],wardenglish4,2.912977,0.977733
26487,theċ'sonnet's,"[-27.3944091796875, -26.132200241088867, -32.6...",1951.0,1965,mdp.39015017644959.00000630,mdp.39015017644959,aintly disguised presentment of the writer's m...,The English poets;,"Ward, Thomas Humphry, 1845-1926",1883,London,Macmillan and co.,mdp.39015017644959,https://hdl.handle.net/2027/mdp.39015017644959,HathiTrust,full-work,[Literary],wardenglish4,2.955377,0.301105
29970,epic,"[-70.58683776855469, -11.152405738830566, -31....",226.0,231,njp.32101013516610.00000205,njp.32101013516610,"sculpture, and regards the images of statesmen...",Specimens of the forms of discourse.,"Lewis, Edwin Herbert, 1866-1938",1900,New York,H. Holt and company,njp.32101013516610,https://hdl.handle.net/2027/njp.32101013516610,HathiTrust,full-work,"[Linguistic, Literary]",njp.32101013516610,4.185784,1.388377
34666,epic,"[-6.0251970291137695, 18.181289672851562, -7.3...",123.0,128,njp.32101055576415.00000176,njp.32101055576415,"is is correct, the first syllable in ἱερὸν is ...",A treatise on Greek tragic metres:,"Linwood, William, 1817-1878",1855,London,"Longman, Brown, Green and Longmans",njp.32101055576415,https://hdl.handle.net/2027/njp.32101055576415,HathiTrust,full-work,[Literary],njp.32101055576415,3.134956,1.368462


In [45]:
chart2 = (
    alt.Chart(selected_words_df, title="PPA Word Usage (Sonnet, Ballad, Epic)")
    .mark_circle(size=60, opacity=0.8)
    .encode(
        x=alt.X("x:Q", scale=alt.Scale(zero=False)),
        y=alt.Y("y:Q", scale=alt.Scale(zero=False)),
        color="word:N",  # or cluster labels if too many unique words
        tooltip=[
            alt.Tooltip("title:N"),

            alt.Tooltip("word:N"),
            alt.Tooltip("context_snippet"),
            alt.Tooltip("pub_year"),
             alt.Tooltip("author"),

            # alt.Tooltip("id:N"),
            # alt.Tooltip("work_id:N"),
            # alt.Tooltip("char_start:Q"),
            # alt.Tooltip("char_end:Q"),
        ],
    )
    .interactive()
    .properties(width=600, height=600)
)

chart2

In [46]:
chart2.save('PPA-random-pages_sonnet-ballad-epic.html')
