In [54]:

import json
import gzip
from io import BytesIO
import boto3
import random
import pandas as pd
from tqdm import tqdm
import numpy as np
import altair as alt
import ast


In [55]:
# Parse the "embedding" column from its JSON text into Python lists while
# reading (json.loads is slightly faster than ast.literal_eval).
embedding_converters = {"embedding": json.loads}
df_embeddings = pd.read_csv("embeddings.csv", converters=embedding_converters)

In [None]:
import umap
# 2D projection of the embeddings with UMAP

# 1. rows to project (currently every row of the dataframe)
valid_indices = df_embeddings.index

# 2. turn each row's embedding into a float vector and stack them into a 2D array
embedding_column = df_embeddings["embedding"]
vectors = [np.asarray(embedding_column.at[row], dtype=float) for row in valid_indices]
vector_matrix = np.stack(vectors)

# 3. fit UMAP on cosine distance; random_state is fixed so reruns use the same seed
umap_settings = dict(n_neighbors=15, min_dist=0.1, metric="cosine", random_state=1)
reducer = umap.UMAP(**umap_settings)
coords_2d = reducer.fit_transform(vector_matrix)

# 4. assign the first two output columns back into the dataframe as "x" and "y"
for axis, column in enumerate(("x", "y")):
    df_embeddings.loc[valid_indices, column] = coords_2d[:, axis]

  warn(


In [None]:
import altair as alt
from IPython.display import display

#generate charts



# Maximum number of points to plot per form
MAX_POINTS = 2000

# Forms to chart, one chart per form. This is a fixed list: the earlier
# `df_embeddings["form"].unique()` assignment was overwritten immediately by
# this list and never used, so it has been removed.
forms = ["Epic", "Ode", "Hymn", "Ballad", "Sonnet", "Pastoral", "Elegy"]

for form in forms:
    # Filter for this form
    subset = df_embeddings[df_embeddings["form"] == form]

    # Sample if too many points (fixed seed so the same rows are drawn on rerun)
    if len(subset) > MAX_POINTS:
        subset = subset.sample(n=MAX_POINTS, random_state=42)

    # Scatter of the 2D projection ("x"/"y"), coloured by publication year.
    # NOTE(review): the tooltip lists a "cluster" column that is not created in
    # the visible cells -- presumably it comes from embeddings.csv; confirm.
    chart = (
        alt.Chart(subset, title=f"PPA Word Embeddings: '{form}'")
        .mark_circle(size=60, opacity=0.8)
        .encode(
            x=alt.X("x:Q", scale=alt.Scale(zero=False)),
            y=alt.Y("y:Q", scale=alt.Scale(zero=False)),
            color=alt.Color(
                "pub_year:Q",
                scale=alt.Scale(scheme="blues", reverse=True),  # lighter→darker
                legend=alt.Legend(title="Publication Year")
            ),
            tooltip=[
                "form:N", "title", "author", "pub_year:Q",
                "context", "work_id:N", "char_start:Q", "char_end:Q", "cluster:N"
            ],
        )
        .interactive()
        .properties(width=600, height=600)
    )

    # Show the chart inline and also write a standalone HTML copy per form
    display(chart)
    chart.save(f'PPA-embeddings-2k-{form}.html')


In [None]:
import altair as alt
from IPython.display import display

# Cap on the number of points drawn in the combined chart
MAX_POINTS = 2000

# Only these two forms are compared here
forms = ["Ballad", "Sonnet"]

# Rows belonging to either form; down-sampled (fixed seed) when over the cap
subset = df_embeddings[df_embeddings["form"].isin(forms)]
if len(subset) > MAX_POINTS:
    subset = subset.sample(n=MAX_POINTS, random_state=42)

# Encodings are built separately so the chart expression below stays short
x_axis = alt.X("x:Q", scale=alt.Scale(zero=False))
y_axis = alt.Y("y:Q", scale=alt.Scale(zero=False))
form_color = alt.Color(
    "form:N",
    scale=alt.Scale(scheme="category10"),   # qualitative scheme
    legend=alt.Legend(title="Form"),
)
tooltip_fields = [
    "form:N", "title", "author", "pub_year:Q",
    "context", "work_id:N", "char_start:Q", "char_end:Q", "cluster:N",
]

points = alt.Chart(subset, title="PPA Word Embeddings: Ballad vs Sonnet").mark_circle(
    size=60, opacity=0.8
)
chart = (
    points.encode(x=x_axis, y=y_axis, color=form_color, tooltip=tooltip_fields)
    .interactive()
    .properties(width=600, height=600)
)

# Show inline and write a standalone HTML copy
display(chart)
chart.save("PPA-embeddings-2k-balladvssonnet.html")


In [60]:
def assign_broad_period(year):
    """Map a publication year to a broad literary-period label.

    Each period is an inclusive [first, last] year range. Any year that falls
    in none of them (including NaN, for which every comparison is False) gets
    the label "Other / Out of Range".
    """
    broad_periods = (
        (1532, 1659, "Early Modern"),
        (1660, 1784, "Restoration & 18th C"),
        (1785, 1829, "Romantic"),
        (1830, 1889, "Victorian"),
        (1890, 1929, "Modernist / Early 20th"),
    )
    for first_year, last_year, label in broad_periods:
        if first_year <= year <= last_year:
            return label
    return "Other / Out of Range"


# -----------------------------------------------
# 2. Focus periods (the two 40-year windows: 1790–1829 and 1890–1929)
# -----------------------------------------------
def assign_40yr_period(year):
    """Return the label of the 40-year focus window containing ``year``.

    The two windows are inclusive year ranges. Years outside both windows
    (including NaN) return None.
    """
    focus_windows = {
        "1790–1829": (1790, 1829),
        "1890–1929": (1890, 1929),
    }
    matches = (
        label
        for label, (first_year, last_year) in focus_windows.items()
        if first_year <= year <= last_year
    )
    return next(matches, None)

# -----------------------------------------------
# 3. 10-year tranches inside each 40-year period
# -----------------------------------------------
def assign_10yr_period(year):
    """Return the 10-year tranche label for ``year``, or None.

    Tranches exist only inside the two 40-year focus windows (1790–1829 and
    1890–1929). Each tranche is an inclusive year range; a year that falls in
    none of them (including NaN or a fractional year between two tranches)
    returns None.
    """
    tranches = (
        # Romantic focus window
        (1790, 1799, "1790–1799"),
        (1800, 1809, "1800–1809"),
        (1810, 1819, "1810–1819"),
        (1820, 1829, "1820–1829"),
        # Modernist focus window
        (1890, 1899, "1890–1899"),
        (1900, 1909, "1900–1909"),
        (1910, 1919, "1910–1919"),
        (1920, 1929, "1920–1929"),
    )
    for first_year, last_year, label in tranches:
        if first_year <= year <= last_year:
            return label
    return None  # not in a 10-year tranche


# -----------------------------------------------
# Apply the mappings to your embeddings dataframe
# -----------------------------------------------
# One new column per period scheme, each derived from "pub_year"
period_assigners = {
    "period_broad": assign_broad_period,
    "period_40yr": assign_40yr_period,
    "period_10yr": assign_10yr_period,
}
for period_column, assigner in period_assigners.items():
    df_embeddings[period_column] = df_embeddings["pub_year"].apply(assigner)


In [None]:
# Row count per 40-year window; rows outside both windows hold None and are
# left out by value_counts' default dropna=True.
df_embeddings.loc[:, "period_40yr"].value_counts()

period_40yr
1890–1929    111333
1790–1829     25474
Name: count, dtype: int64

In [61]:
# Trying to identify paratext uses of the forms.
# Heuristic: the same (work_id, form, char_start, char_end) key occurring many
# times is treated as paratext (presumably repeated page furniture such as
# running headers -- TODO confirm against a few examples).
PARATEXT_KEYS = ['work_id', 'form', 'char_start', 'char_end']
PARATEXT_MIN_COUNT = 5  # keys occurring MORE than this many times are flagged

# Occurrences per key, most frequent first
paratext_uses = (
    df_embeddings.groupby(PARATEXT_KEYS)
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)

# keep only the keys with counts > PARATEXT_MIN_COUNT
paratext_uses = paratext_uses[paratext_uses['counts'] > PARATEXT_MIN_COUNT]

# Exclude every row whose key is in paratext_uses (an anti-join).
# This does exclude rather than include: the left merge with indicator=True
# marks matched rows as 'both' and unmatched rows as 'left_only', and only the
# 'left_only' rows are kept. The keys in paratext_uses are unique (they come
# from a groupby), so the merge cannot duplicate rows of df_embeddings.
df_filtered = df_embeddings.merge(paratext_uses[PARATEXT_KEYS], on=PARATEXT_KEYS, how='left', indicator=True)
df_filtered = df_filtered[df_filtered['_merge'] == 'left_only'].drop(columns=['_merge'])

In [None]:
# Inspect one flagged key. The values below match this row, which looks copied
# from the paratext_uses table (index, work_id, form, char_start, char_end, counts):
# 126257	njp.32101073250001	Epic	20	26	21
key_mask = (
    df_embeddings['work_id'].eq('njp.32101073250001')
    & df_embeddings['form'].eq('Epic')
    & df_embeddings['char_start'].eq(20)
    & df_embeddings['char_end'].eq(26)
)
df_embeddings[key_mask]

# df_embeddings[(df_embeddings['work_id']=='uc2.ark:/13960/t9765d284') & (df_embeddings['form']=='Ballad') & (df_embeddings['char_start']==7) & (df_embeddings['char_end']==14)]

In [None]:
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

# The two 40-year windows to compare
period1 = "1790–1829"
period2 = "1890–1929"

forms = ['Ballad', 'Epic', 'Sonnet', 'Hymn', 'Ode', 'Pastoral', 'Elegy']
results = []

for form in forms:
    # per-row embeddings of this form in each period
    is_form = df_embeddings["form"] == form
    vecs1 = df_embeddings.loc[is_form & (df_embeddings["period_40yr"] == period1), "embedding"].values
    vecs2 = df_embeddings.loc[is_form & (df_embeddings["period_40yr"] == period2), "embedding"].values

    # Skip if either period is empty. This check has to come BEFORE stacking:
    # np.vstack raises ValueError on an empty sequence, so checking afterwards
    # (as the cell originally did) could never skip anything.
    if len(vecs1) == 0 or len(vecs2) == 0:
        continue

    emb1 = np.vstack(vecs1)
    emb2 = np.vstack(vecs2)

    # cosine APD: mean cosine distance over every (period1, period2) pair
    apd = pairwise_distances(emb1, emb2, metric="cosine").mean()
    results.append({"form": form, "apd": apd, "n_romantic": len(emb1), "n_modernist": len(emb2)})

# Convert to DataFrame. The columns are named explicitly so that the sort below
# still works when every form was skipped and `results` is empty.
apd_df = pd.DataFrame(results, columns=["form", "apd", "n_romantic", "n_modernist"])
apd_df.sort_values("apd", ascending=False)


Unnamed: 0,form,apd,n_romantic,n_modernist
4,Ode,0.145577,5906,14078
1,Epic,0.140155,7836,21170
3,Hymn,0.138071,1797,27675
2,Sonnet,0.130614,1399,16048
6,Elegy,0.127438,2594,6219
0,Ballad,0.122727,3391,17092
5,Pastoral,0.120982,2394,6978


In [43]:


def compute_normalized_apd(df, form_col="form", period_col="period_40yr",
                           emb_col="embedding", period1="1790–1829", period2="1890–1929",
                           metric="cosine", random_state=1, forms=None):
    """
    Average pairwise distance (APD) within and between two periods, per form.

    For each form this computes:
      - within-period APD for each period (mean distance over distinct pairs)
      - between-period APD (mean distance over every cross-period pair)
      - normalized shift = between / mean(within1, within2)

    Relies on ``pairwise_distances`` (sklearn.metrics.pairwise) and on ``np`` /
    ``pd`` already being imported in the notebook.

    Parameters
    ----------
    df : DataFrame with one row per embedding.
    form_col, period_col, emb_col : names of the form, period and embedding
        columns. Each embedding cell must be a 1-D sequence of numbers.
    period1, period2 : the two period labels to compare.
    metric : distance metric passed to ``pairwise_distances``.
    random_state : accepted for backward compatibility; currently unused
        because nothing in the computation is random.
    forms : iterable of form names to evaluate. Defaults to the seven forms
        used throughout the notebook.

    Returns
    -------
    DataFrame with columns form, n_romantic, n_modernist, within_romantic,
    within_modernist, between_periods, normalized_shift. A form with no rows
    in either period is skipped (np.vstack would raise on it). A period with a
    single row has no pairs, so its within-period APD is NaN.
    """
    if forms is None:
        forms = ['Ballad', 'Epic', 'Sonnet', 'Hymn', 'Ode', 'Pastoral', 'Elegy']

    columns = ["form", "n_romantic", "n_modernist", "within_romantic",
               "within_modernist", "between_periods", "normalized_shift"]

    def _within_apd(emb):
        # Mean over the strict upper triangle, i.e. each unordered pair once.
        # Doing this in a helper frees the n x n matrix before the next one is
        # built, instead of holding both periods' matrices at the same time.
        n = len(emb)
        if n < 2:
            return np.nan
        dist = pairwise_distances(emb, emb, metric=metric)
        return dist[np.triu_indices(n, k=1)].mean()

    results = []

    for form in forms:
        # get embeddings for each period
        is_form = df[form_col] == form
        emb1_list = df.loc[is_form & (df[period_col] == period1), emb_col].values
        emb2_list = df.loc[is_form & (df[period_col] == period2), emb_col].values

        # np.vstack raises ValueError on an empty sequence, so skip such forms
        if len(emb1_list) == 0 or len(emb2_list) == 0:
            continue

        emb1 = np.vstack(emb1_list)
        emb2 = np.vstack(emb2_list)
        n1, n2 = len(emb1), len(emb2)

        # within-period APD
        within1 = _within_apd(emb1)
        within2 = _within_apd(emb2)

        # between-period APD
        between = pairwise_distances(emb1, emb2, metric=metric).mean()

        # normalized shift
        norm_shift = between / ((within1 + within2) / 2)

        results.append({
            "form": form,
            "n_romantic": n1,
            "n_modernist": n2,
            "within_romantic": within1,
            "within_modernist": within2,
            "between_periods": between,
            "normalized_shift": norm_shift
        })

    return pd.DataFrame(results, columns=columns)


In [62]:
# Normalized APD on the paratext-filtered rows, with the function's default periods
compute_normalized_apd(df=df_filtered)

Unnamed: 0,form,n_romantic,n_modernist,within_romantic,within_modernist,between_periods,normalized_shift
0,Ballad,3039,16386,0.112976,0.112854,0.117241,1.038316
1,Epic,7328,18971,0.112857,0.135392,0.132861,1.070384
2,Sonnet,1390,15179,0.129271,0.114724,0.128971,1.05716
3,Hymn,1797,27361,0.133339,0.099731,0.137703,1.181641
4,Ode,5662,13860,0.143032,0.136267,0.143997,1.03113
5,Pastoral,2275,6903,0.120438,0.107829,0.118541,1.038616
6,Elegy,2475,5978,0.129202,0.112384,0.124373,1.029639
