In [1]:
import pandas as pd

# Import a model to verify the setup
from twfy_vector_explorer import notebook_setup as notebook_setup
from vector_explorer.models import ParagraphVector

pd.set_option("display.max_colwidth", None)

# Agreement search

Here we're looking for text that signifies decisions taken without a vote.

In [2]:
alts = [
    "question put and agreed to",
    "main question accordingly put and agreed to",
    "question put and agreed",
    "question agreed to",
    "Question put and agree d to",
    "Main Question put accordingly and agreed to",
    "Question put (Standing Order No. 23) and agreed to",
    "Main Question, as amended, put and agreed to",
    "Main Question, as amended, put forthwith and agreed to",
    "Question agreed to.",
    "read the First and Second time, and added to the Bill.",
    "Brought up, read the First and Second time, and added to the Bill",
    "Brought up, read the First Time and Second Time and added to the Bill",
    "Question put and agreed to.",
    "Question put (Standing Order No.23) and agreed to.",
    "question put and agreed to",
]

false_positives = [
    "Question accordingly agreed to.",
    "That the Bill be now read a second time.",
    "Bill accordingly read a Second Time",
]


def in_alts(x: str):
    lx = x.lower().replace("  ", " ")
    for alt in alts + false_positives:
        if alt.lower() in lx:
            return True
    return False


search_query = alts[0]

all_dfs = []

for a in alts:
    df = (
        ParagraphVector.objects.all()
        .search_distance(a, threshold=0.3)
        .df()
        .assign(text=lambda df: df["text"].str.strip())
        .drop_duplicates(subset="text", keep="first")
        .drop(columns=["embedding", "id", "source_file"])[
            lambda df: ~df["text"].apply(in_alts)
        ]
    )
    all_dfs.append(df)

df = pd.concat(all_dfs).drop_duplicates(subset="text", keep="first")
df = df.sort_values("distance").reset_index(drop=True)


# no matches that are an exact match - want to find similar items
df.head(30)

Unnamed: 0,speech_id,text,transcript_type,chamber_type,distance
0,uk.org.publicwhip/debate/2011-09-14b.1155.4#b.1155.4/1,Question accordingly agreed to,debates,uk_commons,0.089226
1,uk.org.publicwhip/debate/2011-02-15a.889.1#a.889.1/1,Question accordingly agreed to .,debates,uk_commons,0.092655
2,uk.org.publicwhip/debate/2023-11-29b.1007.1#b1007.1/3,"New schedule 1 read a Second time, and added to the Bill.",debates,uk_commons,0.098849
3,uk.org.publicwhip/debate/2011-03-31c.560.1#c.560.1/6,"New clause 1 read a Second time, and added to the Bill.",debates,uk_commons,0.099363
4,uk.org.publicwhip/debate/2018-01-29c.634.0#c634.0/11,New clause 1 read a Second time and added to the Bill.,debates,uk_commons,0.102575
...,...,...,...,...,...
95,uk.org.publicwhip/debate/2019-05-08a.572.1#a572.1/22,"Bill read the First time; to be read a Second time tomorrow, and to be printed (Bill 389).",debates,uk_commons,0.152964
96,uk.org.publicwhip/debate/2011-03-02a.402.5#a.402.5/3,"Bill read the First time; to be read a Second time tomorrow, and to be printed (Bill 153).",debates,uk_commons,0.152995
97,uk.org.publicwhip/debate/2013-07-03a.1025.3#a.1025.3/3,"Bill read the First time; to be read a Second time tomorrow, and to be printed (Bill 86).",debates,uk_commons,0.152998
98,uk.org.publicwhip/debate/2011-06-14c.680.1#c.680.1/3,"New clause 12 accordingly read a Second time, and added to the Bill.",debates,uk_commons,0.153255
