In [1]:
import pandas as pd

# Import a model to verify the setup
from twfy_vector_explorer import notebook_setup as notebook_setup
from vector_explorer.models import ParagraphVector

pd.set_option("display.max_colwidth", None)

# Motion start text

Here we're looking for text that is used to signal the *start* of a motion.

In [6]:
alts = [
    "I beg to move",
    "I beg move to move",
    "I therefore beg to move,",
    "Motion made, and Question proposed",
    "Amendment proposed: at the end of the Question to add:",
    "Amendment proposed : at the end of the Question to add:",
    "Motion made, and Question put",
    "Question again proposed",
    "Motion made, and Question p roposed",
    "Amendment proposed: (h), at the end of the Question to add:",
    "Motion made, Question put forthwith (Standing Order No. 83A(7)",
    "Amendment proposed (31 January):",
    "Amendment proposed (7 September):",
    "Amendment proposed: (a),",
    "Motion made , and Question proposed",
    "Motion made and Question proposed,",
    "Motion made and question proposed,",
    "Amendment proposed to new clause 1: (a),",
    "With this we will consider the following motion:",
    "With this we shall discuss the following motion:",
    "With this we will also consider the following motion:",
    "With this it will be convenient to discuss amendment (a),",
    "Motion mad e, and Question p roposed ,",
    "Motion made and Question put forthwith",
    "Motion made , and Question put f orthwith",
    "With this we shall consider the following motion:",
]

false_positives = ["Amendment proposed:"]


def in_alts(x: str):
    lx = x.lower().replace("  ", " ")
    for alt in alts + false_positives:
        if alt.lower() in lx:
            return True
    return False


search_query = alts[0]

all_dfs = []

for a in alts:
    df = (
        ParagraphVector.objects.all()
        .search_distance(a, threshold=0.2)
        .df()
        .assign(text=lambda df: df["text"].str.strip())
        .drop_duplicates(subset="text", keep="first")
        .drop(columns=["embedding", "id", "source_file"])[
            lambda df: ~df["text"].apply(in_alts)
        ]
    )
    all_dfs.append(df)

df = pd.concat(all_dfs).drop_duplicates(subset="text", keep="first")
df = df.sort_values("distance").reset_index(drop=True)


# no matches that are an exact match - want to find similar items
df.head(30)

Unnamed: 0,speech_id,text,transcript_type,chamber_type,distance
0,uk.org.publicwhip/debate/2015-06-24a.975.3#a.975.3/1,What about the motion?,debates,uk_commons,0.104393
1,uk.org.publicwhip/debate/2010-03-04a.1096.1#a.1096.1/4,"Amendment (a), at end add",debates,uk_commons,0.114491
2,uk.org.publicwhip/debate/2012-04-18a.473.8#a.473.8/1,"Motion made , and Question put f orthwith (Standing Order No. 119 ( 11 )),",debates,uk_commons,0.116503
3,uk.org.publicwhip/debate/2016-02-09c.1460.0#c1460.0/3,The motion talks about,debates,uk_commons,0.117889
4,uk.org.publicwhip/debate/2017-02-21b.979.1#b979.1/1,With this we shall consider the following motion:,debates,uk_commons,0.124117
5,uk.org.publicwhip/debate/2012-04-18a.473.2#a.473.2/1,"Motion made , and Question put forthwith (Standing Order No. 118(6)),",debates,uk_commons,0.124977
6,uk.org.publicwhip/debate/2012-03-12a.102.1#a.102.1/2,"Amendment made: (a), at end, add",debates,uk_commons,0.126253
7,uk.org.publicwhip/debate/2010-03-04a.1096.1#a.1096.1/6,"Amendment (d), at end add",debates,uk_commons,0.129599
8,uk.org.publicwhip/debate/2012-02-23a.1081.1#a.1081.1/2,"Amendment (b), at end add",debates,uk_commons,0.133083
9,uk.org.publicwhip/debate/2012-02-23a.1081.1#a.1081.1/4,"Amendment (c), at end add",debates,uk_commons,0.136563
