In [1]:
import pickle
from neo4j import GraphDatabase
from dotenv import dotenv_values
from spacy import displacy
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

from hp_nlp_graph.scraper import Chapter, Character
from hp_nlp_graph.coreference import (
    coref_resolve_and_get_characters_matches_in_chapter,
    get_interactions,
    get_matcher,
)
from hp_nlp_graph.language import (
    get_coref_resolver_nlp,
    add_entity_ruler,
    FastCoref,
)

In [2]:
# Load the pickled per-chapter records. Each record is used below through its
# `.chapter` and `.characters` attributes.
# NOTE: pickle can execute arbitrary code while loading; only unpickle files
# that were produced by this project.
with open("data/chapter_characters.pkl", "rb") as pickle_file:
    chapter_records = pickle.load(pickle_file)

# Flat list of every character over all records (duplicates are kept).
characters = [
    character for record in chapter_records for character in record.characters
]

# Lookup from each record's `chapter` value to that record's characters.
# Later cells index this with 1-based integers, so the keys are presumably
# chapter numbers -- TODO confirm against the scraper output.
chapter_characters = {
    record.chapter: record.characters for record in chapter_records
}

# Split the book on the "CHAPTER " marker; the slice drops whatever text
# precedes the first marker. The context manager closes the file handle
# instead of leaving it to the garbage collector.
with open("./data/books/1 Sorcerer's Stone.txt", "r") as book_file:
    chapters = book_file.read().split("CHAPTER ")[1:]

In [3]:
# Build the two pipelines returned by get_coref_resolver_nlp and attach an
# entity ruler for the known characters to each of them, in the same order
# (base pipeline first, then the resolver pipeline).
# NOTE(review): "cuda:0" presumably requires a CUDA-capable GPU -- confirm
# before running this on a CPU-only machine.
base_nlp, nlp = (
    add_entity_ruler(pipeline, characters)
    for pipeline in get_coref_resolver_nlp(device="cuda:0")
)

# Coreference wrapper built from both pipelines; its `resolve` method is what
# the per-chapter processing uses.
coref = FastCoref(base_nlp, nlp)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

10/21/2023 21:56:45 - INFO - 	 missing_keys: []
10/21/2023 21:56:45 - INFO - 	 unexpected_keys: []
10/21/2023 21:56:45 - INFO - 	 mismatched_keys: []
10/21/2023 21:56:45 - INFO - 	 error_msgs: []
10/21/2023 21:56:45 - INFO - 	 Model Parameters: 590.0M, Transformer: 434.6M, Coref head: 155.4M


In [4]:
def get_character_dict(chapter_characters, chapter):
    """Collect the characters of chapters 1 through ``chapter`` (inclusive).

    Despite the name, the result is a flat list, not a dict.

    Args:
        chapter_characters: Mapping indexed by the integers 1..``chapter``;
            each value is an iterable of characters.
        chapter: Last chapter number to include.

    Returns:
        A new list with every item of each selected chapter, in chapter
        order. Duplicates are kept.

    Raises:
        KeyError: If a chapter number in 1..``chapter`` is missing from a
            dict-like ``chapter_characters``.
    """
    return [
        character
        for chapter_number in range(1, chapter + 1)
        for character in chapter_characters[chapter_number]
    ]

In [None]:
# For every chapter: resolve coreferences, match characters, and store the
# resulting interactions keyed by the 1-based chapter number.
interactions_by_chapter = {}
for chapter, chapter_text in enumerate(chapters, start=1):
    # Candidates are all characters listed for chapters 1..chapter.
    known_characters = get_character_dict(chapter_characters, chapter)
    result, resolved_doc = coref_resolve_and_get_characters_matches_in_chapter(
        base_nlp=base_nlp,
        nlp=nlp,
        chapter_text=chapter_text,
        chapter_characters=known_characters,
        coref_resolver=coref.resolve,
    )
    # NOTE(review): 14 is an unexplained constant, presumably a distance or
    # window threshold -- confirm against get_interactions.
    chapter_interactions = get_interactions(result, 14)
    interactions_by_chapter[chapter] = dict(chapter_interactions)

In [6]:
# Persist the per-chapter interactions so they can be reloaded without
# re-running the per-chapter processing.
with open("data/interactions_by_chapter.pkl", "wb") as output_file:
    pickle.dump(interactions_by_chapter, output_file)