In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os


sys.path.append(os.path.join(os.path.abspath(""), "..")) 

In [3]:
data_dir = "..\\output\\reports"

available_reports = [os.path.join(data_dir, fn) for fn in os.listdir(data_dir)]
available_reports[:2]

['..\\output\\reports\\debate_20210623_20210623.json',
 '..\\output\\reports\\debate_20210624_20210624.json']

In [4]:
from parliament.utils import load_json
from pprint import pprint

report = load_json(available_reports[3])[1:]

tag_classes = [[tag.get("class") for tag in section] for section in report]
debates = [section for section in tag_classes if any([tag in ["Debate", "BillDebate"] for tag in section])]

debates = []
for section in report:
    if any([tag.get("class") in ["Debate", "BillDebate"] for tag in section]):
        debates.append(section)

pprint(debates[0])

[{'class': 'BillDebate',
  'debates': [{'class': 'SubDebate',
               'speeches': [{'class': 'Speech',
                             'date': '30/06/2021',
                             'speaker': "Anahila Kanongata'A-Suisuiki",
                             'speaker_subtitle': 'Labour',
                             'tag_id': 'Speech',
                             'text': 'I move, That the District Court '
                                     '(Protection of Judgment Debtors with '
                                     'Disabilities) Amendment Bill be now read '
                                     'a third time.',
                             'time': '16:04:50'},
                            {'class': 'a',
                             'filled_speaker': True,
                             'speaker': "Anahila Kanongata'A-Suisuiki",
                             'speaker_subtitle': 'Labour',
                             'tag_id': 'ContSpeech',
                             'text': 'It is a

In [5]:
import pandas as pd


talking_tags = ["Speech", "ContSpeech", "Intervention", "Interjection"]

# parsing json structure to be more usable via pandas
# note: i know this is hacky
def bill_debate_to_pandas(bill_debate):
    if bill_debate["tag_id"] != "BillDebate":
        raise RuntimeError("Not a bill debate")
    debates = bill_debate.get("debates",)
    if debates is None:
        raise RuntimeError("Bill Debate has no debates...")

    output = []
    select_keys = ["speaker", "tag_id", "class", "speaker_subtitle", "text"]
    global_seq = 1
    for debate_id, debate in enumerate(debates):
        speeches = debate.get("speeches")
        speech_output = []
        current_speaker = None
        speaker_seq = 1
        for speech_seq, speech in enumerate(speeches):
            speech_sel = { key: speech.get(key) for key in select_keys }

            is_relevant = speech_sel["tag_id"] in talking_tags
            if not is_relevant:
                continue
            
            speaker = speech_sel.get("speaker",)
            if speaker != current_speaker:
                current_speaker = speaker
                speaker_seq += 1
            speech_sel["speaker_seq"] = speaker_seq
            speech_sel["speech_seq"] = speech_seq + 1
            speech_sel["debate_seq"] = debate_id + 1
            speech_sel["global_seq"] = global_seq

            global_seq += 1
            speech_output.append(speech_sel)
        output += speech_output

    return pd.DataFrame(output)

pd_bill_debate = bill_debate_to_pandas(debates[0][0])
pd_bill_debate

Unnamed: 0,speaker,tag_id,class,speaker_subtitle,text,speaker_seq,speech_seq,debate_seq,global_seq
0,Anahila Kanongata'A-Suisuiki,Speech,Speech,Labour,"I move, That the District Court (Protection of...",2,1,1,1
1,Anahila Kanongata'A-Suisuiki,ContSpeech,a,Labour,It is an honour and a privilege to have been c...,2,2,1,2
2,Anahila Kanongata'A-Suisuiki,ContSpeech,a,Labour,Mr 'Epalahame 'Una Tanginoa has given his appr...,2,3,1,3
3,Anahila Kanongata'A-Suisuiki,ContSpeech,a,Labour,I would like to thank the submitters whose vie...,2,4,1,4
4,Anahila Kanongata'A-Suisuiki,ContSpeech,a,Labour,"At present, the principal Act requires good ju...",2,5,1,5
...,...,...,...,...,...,...,...,...,...
140,Hon Simon Bridges,Interjection,Interjection,,I want to pass Anahila Suisuiki's bill.,47,141,1,141
141,Dr Emily Henderson,ContSpeech,ContinueSpeech,,"I'm sorry, Mr Bridges, that you don't find thi...",48,142,1,142
142,Dr Emily Henderson,ContSpeech,a,,"Mr Speaker, you referred earlier in the House ...",48,143,1,143
143,Speaker,Interjection,Interjection,,Two hearing aids.,49,144,1,144


In [6]:
def get_party(speaker_subtitle, missing="Not Known"):
    if speaker_subtitle is None:
        return missing

    parties = ["National", "Labour", "Green", "Act"]
    for party in parties:
        if party.lower() in speaker_subtitle.lower():
            return party
    return missing


def concat_speaker(pd_debate):
    speakers = {}
    new_rows = []
    seq = 1
    for _, speaker_grouping in pd_debate.groupby("speaker_seq"):
        text_concat = ""
        for text_group in speaker_grouping.text:
            if text_concat:
                text_concat += "\n"
            text_concat += text_group

        speaker = speaker_grouping.speaker.values[0]
        speaker_subtitle = speaker_grouping.speaker_subtitle.values[0]

        if speaker == "Assistant Speaker":
            tmp = speaker
            speaker = speaker_subtitle
            speaker_subtitle = tmp

        if speaker_subtitle is not None:
            speakers.setdefault(speaker, speaker_subtitle)

        new_rows.append({
            "text": text_concat,
            "speaker": speaker,
            "speaker_subtitle": speaker_subtitle,
            "seq": seq
        })
        seq += 1
    fill_speaker = (
        lambda r:
            r["speaker_subtitle"] if r["speaker_subtitle"] is not None
            else speakers.get(r["speaker"],))

    out_pd = pd.DataFrame(new_rows)
    out_pd["speaker_subtitle"] = out_pd.apply(fill_speaker, axis=1)
    out_pd["party"] = out_pd["speaker_subtitle"].apply(get_party)

    return out_pd

text_concat = concat_speaker(pd_bill_debate)
text_concat

Unnamed: 0,text,speaker,speaker_subtitle,seq,party
0,"I move, That the District Court (Protection of...",Anahila Kanongata'A-Suisuiki,Labour,1,Labour
1,The question is that the motion be agreed to.,Hon Jacqui Dean,Assistant Speaker,2,Not Known
2,"Thank you, Madam Speaker. Well, if National wa...",Hon Simon Bridges,National—Tauranga,3,National
3,Away you go.,Ginny Andersen,Labour—Hutt South,4,Labour
4,Well—no. So I'm not going to go through all of...,Hon Simon Bridges,National—Tauranga,5,National
5,"Thank you very much, Madam Speaker, for the op...",Ginny Andersen,Labour—Hutt South,6,Labour
6,Order! The member will use the member's full n...,Hon Jacqui Dean,Assistant Speaker,7,Not Known
7,"Anahila Kanongata'a-Suisuiki—apologies, Madam ...",Ginny Andersen,Labour—Hutt South,8,Labour
8,Order! Order! The member will use the member's...,Hon Jacqui Dean,Assistant Speaker,9,Not Known
9,"Hon Ruth Dyson—sorry, did I say ""Ruth""? Ruth D...",Ginny Andersen,Labour—Hutt South,10,Labour


In [7]:
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dallym\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
from nltk.tokenize import sent_tokenize


col_lookup = text_concat.drop("text", axis=1).to_dict(orient="index")

reordered = []
for idx, text in enumerate(text_concat.text):
    row_meta = col_lookup[idx]
    sentences = sent_tokenize(text)

    for sentence in sentences:
        row_meta_copy = row_meta.copy()
        row_meta_copy["sentence"] = sentence
        reordered.append(row_meta_copy)

sentence_pd = pd.DataFrame(reordered)
sentence_pd

Unnamed: 0,speaker,speaker_subtitle,seq,party,sentence
0,Anahila Kanongata'A-Suisuiki,Labour,1,Labour,"I move, That the District Court (Protection of..."
1,Anahila Kanongata'A-Suisuiki,Labour,1,Labour,It is an honour and a privilege to have been c...
2,Anahila Kanongata'A-Suisuiki,Labour,1,Labour,This bill started with a conversation with Mr ...
3,Anahila Kanongata'A-Suisuiki,Labour,1,Labour,"In 2019, Mr Foliaki shared with me the experie..."
4,Anahila Kanongata'A-Suisuiki,Labour,1,Labour,A story about a disabled person from whom the ...
...,...,...,...,...,...
557,Dr Emily Henderson,Labour—Whangārei,49,Labour,I might even have difficulty leaving the Chamb...
558,Dr Emily Henderson,Labour—Whangārei,49,Labour,"But with my glasses, I'm a fully functioning m..."
559,Dr Emily Henderson,Labour—Whangārei,49,Labour,"This model, this definition, is about changing..."
560,Dr Emily Henderson,Labour—Whangārei,49,Labour,"I fully, fully, fully commend it to the House."


In [10]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

sentence_pd["embedding"] = sentence_pd.sentence.apply(model.encode)
sentence_pd

Unnamed: 0,speaker,speaker_subtitle,seq,party,sentence,embedding
0,Anahila Kanongata'A-Suisuiki,Labour,1,Labour,"I move, That the District Court (Protection of...","[0.050410632, -0.5815163, -0.62931275, -0.1696..."
1,Anahila Kanongata'A-Suisuiki,Labour,1,Labour,It is an honour and a privilege to have been c...,"[-0.04680791, 0.39081562, -0.27951756, -0.2462..."
2,Anahila Kanongata'A-Suisuiki,Labour,1,Labour,This bill started with a conversation with Mr ...,"[-0.11029028, 0.14122818, -0.68386066, 0.15657..."
3,Anahila Kanongata'A-Suisuiki,Labour,1,Labour,"In 2019, Mr Foliaki shared with me the experie...","[-0.005941511, 0.31058884, -0.32434732, -0.252..."
4,Anahila Kanongata'A-Suisuiki,Labour,1,Labour,A story about a disabled person from whom the ...,"[-0.13790677, 0.11944741, 0.029917857, -0.1501..."
...,...,...,...,...,...,...
557,Dr Emily Henderson,Labour—Whangārei,49,Labour,I might even have difficulty leaving the Chamb...,"[0.36336792, 0.17817284, 0.6802208, -0.0376960..."
558,Dr Emily Henderson,Labour—Whangārei,49,Labour,"But with my glasses, I'm a fully functioning m...","[0.37706763, -0.26160085, 0.34292844, 0.015676..."
559,Dr Emily Henderson,Labour—Whangārei,49,Labour,"This model, this definition, is about changing...","[-0.17279601, 0.12556158, -0.057518795, -0.323..."
560,Dr Emily Henderson,Labour—Whangārei,49,Labour,"I fully, fully, fully commend it to the House.","[-0.11378023, 0.504831, 0.1445723, -0.42350313..."


In [11]:
e_list = [e for e in sentence_pd["embedding"].values]

In [12]:
import umap


mapper = umap.UMAP().fit(e_list)
mapper

UMAP(dens_frac=0.0, dens_lambda=0.0)

In [41]:
import umap
import plotly.express as px


def run_umap(embeddings, labels):
    """
    Runs umap on an array of vectors
        and returns a dataframe with the embeddings and class labels.
    """
    umap_model = umap.UMAP(n_components=3)
    umap_comps = umap_model.fit_transform(embeddings)

    return(
        pd.DataFrame({
            "component_1": umap_comps[:, 0],
            "component_2": umap_comps[:, 1],
            "component_3": umap_comps[:, 2],
            "class": labels,
            "row_id": range(len(umap_comps)),
        })
    )


def vis_components(
    components_df,
    xcol="component_1",
    ycol="component_2",
    zcol="component_3",
    colour_col="class",
    height=780,
    width=1366):
    """
    Plots a component dataframe in an interactive 3d plotly plot.
    Returns the plotly figure.
    """
    fig = px.scatter(
        components_df,
        x=xcol,
        y=ycol,
        # z=zcol,
        color=colour_col,
        hover_name="sentence")

    fig.update_traces(
        marker=dict(size=5),
        selector=dict(mode="markers"))

    fig.update_layout(
        margin=dict(l=20, r=20, t=20, b=20),
        height=height, width=width
    )

    return(fig)

# umap_components = run_umap(e_list, sentence_pd["party"].values)
umap_components["sentence"] = sentence_pd["sentence"].values
umap_components["speaker"] = sentence_pd["speaker"].values
umap_components["party"] = sentence_pd["party"].values
umap_components["seq"] = sentence_pd["seq"].values

vis_components(umap_components, colour_col="seq")

In [42]:
vis_components(umap_components, colour_col="party")