In [1]:
import pandas as pd
from tqdm import tqdm

# Import a model to verify the setup
from twfy_vector_explorer import notebook_setup as notebook_setup
from vector_explorer.models import ParagraphVector

pd.set_option("display.max_colwidth", None)

# One line questions

Here we're looking for text that signals we're starting and giving the whole motion in one line (and the parser shouldn't advance for me).

In [3]:
alts = [
    "the Bill be now read a Second time.",
    "the Bill be now read the Third time.",
    "That the House sit in private.",
    "That the Bill be read the Third time.",
    "That the clause be read a Second time.",
    "Main Question again proposed.",
    "Motion made, That the Bill be read be now read a Second time.",
    "Main Question again proposed .",
    "Question again proposed",
    "Motion made, That the Bill be now read a Secondtime.",
    "Question put, That the Bill be read a Second time.",
    "Question put, That the clause be a Second time.",
    "That the Bill be now read a second time",
    "That the Bill will be now read a second time.",
    "That the Bill now be read a third time.",
    "That the Bill be now read a third time.",
    "That the Bill be now read a second time",
]

false_positives = []


def in_alts(x: str):
    lx = x.lower().replace("  ", " ")
    for alt in alts + false_positives:
        if alt.lower() in lx:
            return True
    return False


search_query = alts[0]

all_dfs = []

for a in tqdm(alts):
    df = (
        ParagraphVector.objects.all()
        .search_distance(a, threshold=0.2)
        .df()
        .assign(text=lambda df: df["text"].str.strip())
        .drop_duplicates(subset="text", keep="first")
        .drop(columns=["embedding", "id", "source_file"])[
            lambda df: ~df["text"].apply(in_alts)
        ]
    )
    all_dfs.append(df)

df = pd.concat(all_dfs).drop_duplicates(subset="text", keep="first")
df = df.sort_values("distance").reset_index(drop=True)


# no matches that are an exact match - want to find similar items
df.head(30)

100%|██████████| 17/17 [00:43<00:00,  2.59s/it]


Unnamed: 0,speech_id,text,transcript_type,chamber_type,distance
0,uk.org.publicwhip/debate/2010-01-13d.669.2#d.669.2/1,"Motion made, That the Bill be read a Second time.",debates,uk_commons,0.05182
1,uk.org.publicwhip/debate/2010-01-06b.174.1#b.174.1/6,2. When the Bill has been read a second time-,debates,uk_commons,0.053653
2,uk.org.publicwhip/debate/2012-12-05a.919.1#a.919.1/8,2. When the Bill has been read a second time—,debates,uk_commons,0.055618
3,uk.org.publicwhip/debate/2018-07-09c.750.1#c750.1/9,(2) When the Bill has been read a second time:,debates,uk_commons,0.059468
4,uk.org.publicwhip/debate/2011-01-10b.121.5#b.121.5/5,Bill accordingly read a Second time.,debates,uk_commons,0.060133
5,uk.org.publicwhip/debate/2012-09-03a.123.3#a.123.3/11,Bill accordingly read a Second time .,debates,uk_commons,0.060133
6,uk.org.publicwhip/debate/2013-11-25a.125.6#a.125.6/6,Bill accordingly read a Second Time.,debates,uk_commons,0.060133
7,uk.org.publicwhip/debate/2015-01-27a.796.1#a.796.1/8,Bill accordingly read a second time.,debates,uk_commons,0.060133
8,uk.org.publicwhip/debate/2010-02-05a.571.1#a.571.1/7,Bill accordingly read a Second time.,debates,uk_commons,0.060133
9,uk.org.publicwhip/debate/2012-10-30a.177.1#a.177.1/10,(3) When the Bill has been read a second time—,debates,uk_commons,0.067286
