In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import sys
import os

sys.path.append(os.path.join(os.path.abspath(""), "..")) 

In [6]:
from parliament.utils import load_json
from parliament.tags import initial_parse

report_links = load_json("../output/meta/all_report_links.json")

test_report = report_links[5]

preface, html_tags, tag_names, tags_flat = initial_parse(
    test_report[0], test_report[1])

tag_names[:2]
  

[['BeginningOfDay', 'a', 'Debatealone', 'Speech', 'EndOfSection'],
 ['Debatealone',
  'Speech',
  'Speech',
  'Speech',
  'Speech',
  'list',
  'Speech',
  'EndOfSection']]

In [7]:
starter_tags = load_json("../output/meta/starter_tags.json")
tag_counts = load_json("../output/meta/tag_counts.json")

{ k: v for k, v in tag_counts.items() if k not in starter_tags}


{'CentredEndingBold': 97,
 'CentredEndingItalics': 64,
 'ClauseAlone': 448,
 'ContinueSpeech': 3917,
 'DebateDebate': 1,
 'Default': 21,
 'Incorporation': 3,
 'IndentMargin': 94,
 'QOAEnd': 58,
 'StyleIndentMarginaloneArial10ptLeft127cmFirstlin': 19,
 'StyleIndentMarginaloneArial10ptLeft1cmFirstline': 1,
 'SubsAnswer': 982,
 'SupAnswer': 4121,
 'SupQuestion': 4316,
 'Urgency': 18,
 'VoteCount': 1126,
 'VoteResult': 561,
 'VoteText': 1122,
 'list': 215,
 'note': 14}

In [13]:
# properties the config needs
GLOBAL_START = "global_start"
VALID_TAGS = "valid_tags"
VALIDATION_GROUPS = "validation_groups"
END_ON = "end_on"
EOS_START = "eos_start"
IGNORE_START = "ignore_start"
OVERRIDE_GROUP = "override_group"

# global starters
BILL_DEBATE = "BillDebate"
Q_AND_A = "QAndA"
BEGINNING_OF_DAY = "BeginningOfDay"

# local starters
CONT_SPEECH = "ContSpeech"
DEBATE = "Debate"
EOS = "EOS"
MARGIN = "Margin"
MARGIN_NEXT = "MarginNext"
INTERJECTION = "Interjection"
QUESTION = "Question"
ANSWER = "Answer"
QA_END = "QandAEnd"
VOTE = "vote"
NOTE = "note"
SPEECH = "Speech"
FLAG = "Flag"
ENDING = "Ending"


tag_lookup = {
    BEGINNING_OF_DAY: ["BeginningOfDay"],
    SPEECH: ["Speech"],
    CONT_SPEECH: ["a", "ContinueSpeech", "list"],
    BILL_DEBATE: ["BillDebate"],
    DEBATE: ["Debate", "Debatealone", "SubDebate", "DebateDebate"],
    EOS: ["EndOfSection"],
    MARGIN: ["MarginHeading", "IndentMarginalone", "IndentMargin"],
    MARGIN_NEXT: ["IndentMarginTextFollowing"],
    INTERJECTION: ["Interjection", "Intervention", "Urgency"],
    Q_AND_A: ["QOA"],
    QUESTION: ["QType", "QTypealone", "SubsQuestion", "SupQuestion"],
    ANSWER: ["SubsAnswer", "SupAnswer"],
    QA_END: ["QOAEnd"],
    VOTE: ["VoteReason", "VoteCount", "VoteResult", "VoteText"],
    NOTE: ["note"],
    FLAG: ["Incorporation", "Default", "ClauseAlone"],
    ENDING: ["CentredEndingBold", "CentredEndingItalics"],
    EOS: ["EndOfSection"],
}


tag_lookup_flat = { t: k for k, tags in tag_lookup.items() for t in tags }


class_order = {
    BILL_DEBATE: {
        GLOBAL_START: True,
        VALID_TAGS: [DEBATE, SPEECH, CONT_SPEECH, INTERJECTION, VOTE],
        OVERRIDE_GROUP: [DEBATE],
    },
    BEGINNING_OF_DAY: {
        GLOBAL_START: True
    },
    DEBATE: {
        EOS_START: True,
        IGNORE_START: [BILL_DEBATE],
        VALID_TAGS: [
            BILL_DEBATE, DEBATE, SPEECH, CONT_SPEECH, INTERJECTION, NOTE, VOTE
        ],
    },
    Q_AND_A: {
        GLOBAL_START: True,
        OVERRIDE_GROUP: [QUESTION],
        VALID_TAGS: [QUESTION, SPEECH, CONT_SPEECH, INTERJECTION],
        END_ON: [QA_END],
    },
    QUESTION: {
        GLOBAL_START: True,
        IGNORE_START: [Q_AND_A],
        VALID_TAGS: [Q_AND_A, QUESTION, SPEECH, CONT_SPEECH, INTERJECTION],
        END_ON: [QA_END],
    },
    MARGIN: {
        EOS_START: True,
        OVERRIDE_GROUP: [MARGIN_NEXT],
    },
    MARGIN_NEXT: {
        EOS_START: True
    },
    SPEECH: {
        EOS_START: True,
        IGNORE_START: [BILL_DEBATE],
        VALID_TAGS: [SPEECH, CONT_SPEECH],
    },
    ENDING: {
        EOS_START: True
    }
}

In [29]:
from pprint import pprint


class State():
    def __init__(self, starting_tag, ignore_validation=False):
        tag_group = self._lookup_tag_type(starting_tag)

        self._tag_raw = starting_tag
        self.group = tag_group
        self.current = tag_group
        self.group_config = class_order.get(tag_group, {})
        self.was_eos = False
        self.end_next = False
        self.previous = None
        self.validate = not ignore_validation


    def get_state(self):
        return {
            "group": self.group,
            "current": self.current,
            "raw_tag": self._tag_raw,
            "previous": self.previous,
            "was_eos": self.was_eos,
            "config": self.group_config,
            "ignore_validation": not self.validate,
        }


    def print_state(self):
        pprint(
            f"Current Group: {self.group}" +
            f"    Current Tag: {self.current}" +
            f"    Previous Tag: {self.previous}" +
            f"    Was EOS: {self.was_eos}"
        )


    def _lookup_tag_type(self, tag_name):
        tag_class = tag_lookup_flat.get(tag_name,)
        if tag_class is None:
            print(f"Coudnt find: `{tag_name}`, returning Note")
            tag_class = NOTE

        return tag_class


    def _eval_group_property(self, new_group, config_property, config=None):
        if config is None:
            config = self.group_config
        config_property = config.get(config_property, [])
        return config_property and new_group in config_property


    def _is_new_state(self, new_conf):
        ignore_start = new_conf.get(IGNORE_START, False)
        global_start = new_conf.get(GLOBAL_START, False)
        eos_start = new_conf.get(EOS_START, False) and self.was_eos

        return not ignore_start and (global_start or eos_start)


    def update_state(self, next_tag):
        new_group = self._lookup_tag_type(next_tag)

        if new_group == EOS:
            self.was_eos = True
            return

        if self._eval_group_property(new_group, VALID_TAGS) and self.validate:
            self.print_state()
            raise RuntimeError(
                f"Unallowed tag: `{new_group}` for group: `{self.group}`")

        new_conf = class_order.get(new_group, {})

        # happen regardless
        self._tag_raw = next_tag
        self.previous = self.current
        self.current = new_group

        state_change = self._is_new_state(new_conf) or self.end_next

        # start state here
        if state_change:
            self.group = new_group
            self.group_config = new_conf

        if not self.end_next:
            self.end_next = self._eval_group_property(new_group, END_ON)
        else:
            self.end_next = False

        self.was_eos = False
        return state_change




In [30]:
curr_state = State(tags_flat[0], ignore_validation=True)
curr_state.print_state()

new_groupings = []
curr_grouping = []
for i, next_tag in enumerate(tags_flat[1:]):
    if curr_state.update_state(next_tag):
        new_groupings.append(curr_grouping)
        curr_grouping = []
    curr_grouping.append(next_tag)

pprint(new_groupings)

('Current Group: BeginningOfDay    Current Tag: BeginningOfDay    Previous '
 'Tag: None    Was EOS: False')
Coudnt find: `Ending`, returning Note
[['a',
  'Debatealone',
  'Speech',
  'EndOfSection',
  'Debatealone',
  'Speech',
  'Speech',
  'Speech',
  'Speech',
  'list',
  'Speech',
  'EndOfSection'],
 ['QOA',
  'QType',
  'SubsQuestion',
  'SubsAnswer',
  'SupQuestion',
  'SupAnswer',
  'Interjection',
  'SupQuestion',
  'SupAnswer',
  'Interjection',
  'Interjection',
  'Interjection',
  'Interjection',
  'Interjection',
  'Interjection',
  'Interjection',
  'Interjection',
  'Interjection',
  'EndOfSection',
  'SubsQuestion',
  'SubsAnswer',
  'SupQuestion',
  'SupAnswer',
  'SupQuestion',
  'SupAnswer',
  'SupQuestion',
  'SupAnswer',
  'SupQuestion',
  'SupAnswer',
  'SupQuestion',
  'SupAnswer',
  'SupQuestion',
  'SupAnswer',
  'SupQuestion',
  'SupAnswer',
  'SupQuestion',
  'SupAnswer',
  'Interjection',
  'Interjection',
  'EndOfSection',
  'SubsQuestion',
  'SubsAnswer',