We use the McBride and Wark biographies of Acker to construct a detailed composite biography of Acker's life up to the publication of Black Tarantula. This constructed composite biography can then be distilled into data points based around distinct "events" in Acker's life, each with a description of the event, a mention of all actors involved, and a record of which sentences from the original biographies were used to inform and define the event.

This can be visualised as a network exploration in itself, but it can also be compared against extracts from Black Tarantula to identify biographical influences on the texts, in the same way more apparent literary sources are identified.

In [2]:
import xml.etree.ElementTree as ET
import os
import json
import nltk
import re

# Build a {chapter number: plain text} mapping from the TEI-encoded McBride
# biography and save it as JSON next to the source XML.

TEI_NS = '{http://www.tei-c.org/ns/1.0}'


def _paragraph_text(p):
    """Return all text inside paragraph *p* as one whitespace-normalised string.

    ``itertext()`` walks the paragraph in document order and yields the text
    and tail of every descendant. Reading only ``.text`` of <p>, <quote>,
    <lg> and <l> drops any prose that follows a closing tag (the element
    "tail"), glues verse lines together without a separator, and appends
    nested quotes more than once.
    """
    # split()/join collapses the indentation whitespace that sits between
    # child elements in the XML source into single spaces.
    return " ".join("".join(p.itertext()).split())


def _chapter_text(div):
    """Return the text of every paragraph in chapter *div*, space-separated."""
    paragraphs = div.findall(f'.//{TEI_NS}p')
    # A <p> nested inside another <p> (e.g. within a <quote>) is already
    # covered by its ancestor's itertext(); skip it so it is not duplicated.
    nested = {
        inner
        for outer in paragraphs
        for inner in outer.iter(f'{TEI_NS}p')
        if inner is not outer
    }
    texts = (_paragraph_text(p) for p in paragraphs if p not in nested)
    # Empty paragraphs contribute nothing rather than stray spaces.
    return " ".join(text for text in texts if text)


# Dictionary mapping each chapter's "n" attribute to its text
chapters_dict = {}

# Parse XML
tree = ET.parse('xml/bio/mcbride.xml')
root = tree.getroot()

# Iterate over all divs with type chapter; a chapter div without an "n"
# attribute raises KeyError, as before.
for div in root.findall(f'.//{TEI_NS}div[@type="chapter"]'):
    chapters_dict[div.attrib['n']] = _chapter_text(div)

# Write the dictionary to a JSON file and save to the xml folder
with open('xml/bio/mcbride_chapters.json', 'w', encoding='utf-8') as fp:
    json.dump(chapters_dict, fp, indent=2)

In [5]:
import torch
from transformers import pipeline
import json

# Summarise every chapter with the pszemraj/led-large-book-summary model and
# store the summary alongside the chapter text in the same JSON file.
summarizer = pipeline(
    "summarization",
    "pszemraj/led-large-book-summary",
    device=0 if torch.cuda.is_available() else -1,  # first GPU if present, else CPU
)

CHAPTERS_PATH = 'xml/bio/mcbride_chapters.json'

# Load the JSON file with the chapter texts
with open(CHAPTERS_PATH, 'r', encoding='utf-8') as fp:
    chapters_dict = json.load(fp)

# Iterate through the chapter texts and summarize each one
for chapter, entry in chapters_dict.items():
    # This cell overwrites its own input file, so after one successful run
    # each entry is already a {"text", "summary"} dict rather than a plain
    # string. Accept both shapes so the cell can be re-run safely.
    text = entry["text"] if isinstance(entry, dict) else entry
    result = summarizer(
        text,
        min_length=256,
        max_length=1024,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=3.5,
        num_beams=4,
        early_stopping=True,
    )
    summary = result[0]["summary_text"]
    # Replace the entry with the text plus its summary. Only existing keys
    # are reassigned, so mutating the dict during iteration is safe.
    chapters_dict[chapter] = {"text": text, "summary": summary}

# Write the updated dictionary to the JSON file
with open(CHAPTERS_PATH, 'w', encoding='utf-8') as fp:
    json.dump(chapters_dict, fp, indent=2)


Downloading (…)lve/main/config.json: 100%|██████████| 1.44k/1.44k [00:00<00:00, 2.62MB/s]
'(ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Read timed out. (read timeout=10.0)"), '(Request ID: 815a50c7-bf2e-420c-8135-18e840b537a9)')' thrown while requesting GET https://cdn-lfs.huggingface.co/pszemraj/led-large-book-summary/0ee872e86edb7c76ea6631b672e2cd3df5df6a749cfea7c8412a319123c87976?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1699545329&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5OTU0NTMyOX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9wc3plbXJhai9sZWQtbGFyZ2UtYm9vay1zdW1tYXJ5LzBlZTg3MmU4NmVkYjdjNzZlYTY2MzFiNjcyZTJjZDNkZjVkZjZhNzQ5Y2ZlYTdjODQxMmEzMTkxMjNjODc5NzY%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=NP2jK45uc7pmrsVktK%7EjnQgOgz7eFSEzE4Do-7JfNRlBvaAWRYcoRgh54LKnljxC15rYnSJpi%7EYjtm

KeyboardInterrupt: 