In [1]:
from discovery_utils.getters import hansard
from datetime import datetime, timedelta
import pandas as pd

In [2]:
# Create the Hansard getter and load the two inputs used by the whole notebook:
# the enriched debates table and the keyword label store.
Hansard = hansard.HansardGetter()
debates_df = Hansard.get_debates_parquet()
labelstore_df = Hansard.get_labelstore()
# Latest debate date in the downloaded data (sanity check on data freshness)
debates_df.date.max()

2024-12-16 18:21:01,660 - discovery_utils.getters.hansard - INFO - Downloading debates parquet file: data/policy_scanning_data/enriched/HansardDebates.parquet
2024-12-16 18:22:14,760 - discovery_utils.getters.hansard - INFO - Attempting to download label store: data/policy_scanning_data/enriched/HansardDebates_LabelStore_keywords.csv


'2024-12-12'

In [3]:
# People metadata; its 'organizations' and 'memberships' entries are used further
# down by people_party_memberships() to attach a party to each speaker.
people_dict = Hansard.get_people_metadata()

2024-12-16 18:23:48,465 - discovery_utils.getters.hansard - INFO - Downloading people metadata
2024-12-16 18:23:51,407 - discovery_utils.getters.hansard - INFO - Successfully downloaded and saved people metadata


## Simple pipeline

In [4]:
import importlib
from src import synthesis_utils
importlib.reload(synthesis_utils);
import numpy as np
from typing import Literal, Tuple, List, Dict

from src import logging
from discovery_utils.utils import keywords

import re

In [5]:
from slack_sdk.webhook import WebhookClient
import os
# Webhook URL is read from the environment (raises KeyError if SLACK_WEBHOOK_URL_TESTING is unset)
slack_webhook = WebhookClient(os.environ["SLACK_WEBHOOK_URL_TESTING"])

In [13]:
from typing import List, Dict

def mission_header(mission: str) -> Dict:
    """Build the section block that titles one mission's part of the message.

    Args:
        mission (str): Mission code, one of "ASF", "AFS" or "AHL".

    Returns:
        Dict: A mrkdwn section block containing the mission emoji and bold title.

    Raises:
        ValueError: If `mission` is not one of the three known codes.
    """
    known_titles = (
        ("ASF", ":potted_plant: *A Sustainable Future*"),
        ("AFS", ":hatched_chick: *A Fairer Start*"),
        ("AHL", ":mending_heart: *A Healthier Life*"),
    )
    for code, title in known_titles:
        if mission == code:
            return {"type": "section", "text": {"type": "mrkdwn", "text": title}}
    raise ValueError(f"Invalid mission: {mission}")

def message_header(message_date:str, data_start_date:str, data_end_date:str) -> List[Dict]:
    """Build the two blocks that open the Slack message.

    The three date strings are inserted into the text verbatim; no parsing or
    reformatting is applied to them.

    Args:
        message_date (str): Date shown in the headline ("Policy update <date>")
        data_start_date (str): First day of the period covered by the data
        data_end_date (str): Last day of the period covered by the data

    Returns:
        List[Dict]: A plain-text header block followed by a context block that
            names the data source and the covered date range
    """
    headline = f"Policy update {message_date}"
    coverage = f"House of Commons debates ({data_start_date} - {data_end_date})"
    return [
        {"type": "header", "text": {"type": "plain_text", "text": headline}},
        {"type": "context", "elements": [{"type": "mrkdwn", "text": coverage}]},
    ]
    

def divider() -> Dict:
    """Return a new divider block dictionary (a fresh object on every call)"""
    return dict(type="divider")

def _bullet_point_string(points: List[str]) -> str:
    """Construct a string of bullet points from a list of strings"""
    return "\n".join([f"• {point}" for point in points])     

def debate_summary(debate: Dict) -> List[Dict]:
    """Construct the blocks for a single debate summary

    Args:
        debate (Dict): Dictionary with keys "title", "url", "date", "purpose",
            "positives", "negatives", and "next_steps".
            For example: {
                "title": "Title of the debate",
                "url": "https://www.theyworkforyou.com/debates/?id=...",
                "date": "2024-12-04",
                "purpose": "Summary of the debate",
                "positives": ["Positive point 1", "Positive point 2"],
                "negatives": ["Negative point 1", "Negative point 2"],
                "next_steps": ["Next step 1", "Next step 2"],
            }

    Returns:
        List[Dict]: Five blocks in order: the linked title with date and purpose,
            the positives, the negatives, the next steps, and a closing divider.

    Raises:
        KeyError: If any of the keys listed above is missing from `debate`.
    """
    def _mrkdwn_section(text: str) -> Dict:
        # All four content blocks share the same section/mrkdwn envelope
        return {"type": "section", "text": {"type": "mrkdwn", "text": text}}

    blocks = [
        _mrkdwn_section(
            f"<{debate['url']}|*{debate['title']}*> ({debate['date']})\n{debate['purpose']}"
        )
    ]
    # One bulleted section per list-valued field, in display order
    bulleted_fields = (
        ("Positives", "positives"),
        ("Negatives", "negatives"),
        ("Next Steps", "next_steps"),
    )
    for label, key in bulleted_fields:
        blocks.append(_mrkdwn_section(f"*{label}*\n{_bullet_point_string(debate[key])}"))
    blocks.append(divider())
    return blocks

def quote_block(quote: Dict) -> Dict:
    """Build a single section block that attributes and displays one quote

    Args:
        quote (Dict): Must provide "name", "party", "category", "debate" and "text";
            the first four form the attribution line, "text" is shown as a block quote.

    Returns:
        Dict: A mrkdwn section block.
    """
    body = f"*{quote['name']}* ({quote['party']}) mentioned *{quote['category']}* in *{quote['debate']}*\n\n> {quote['text']}"
    return {"type": "section", "text": {"type": "mrkdwn", "text": body}}



In [7]:

def people_party_memberships(people_dict: dict) -> pd.DataFrame:
    """Get the most recent party membership for each person

    Args:
        people_dict: The dictionary of people metadata from the Hansard data.
            Must contain 'organizations' (records with 'id' and 'name') and
            'memberships' (records with 'person_id', 'post_id', 'start_date',
            'start_reason' and 'on_behalf_of_id').

    Returns:
        A DataFrame with one row per person and the columns 'person_id',
        'post_id', 'start_date', 'start_reason' and 'name_org', where 'name_org'
        is the name of the organisation the membership is held on behalf of
        (missing when no organisation matches).
    """
    # Rename the organisation columns explicitly. Relying on merge suffixes only
    # produces 'name_org' when the memberships table also has a 'name' column.
    orgs_df = (
        pd.DataFrame(people_dict['organizations'])[['id', 'name']]
        .rename(columns={'id': 'id_org', 'name': 'name_org'})
    )
    # Newest membership first, so drop_duplicates keeps the latest one per person
    latest_memberships_df = (
        pd.DataFrame(people_dict['memberships'])
        .sort_values('start_date', ascending=False)
        .drop_duplicates('person_id')
    )
    return (
        latest_memberships_df
        .merge(orgs_df, left_on='on_behalf_of_id', right_on='id_org', how='left')
    )[['person_id', 'post_id', 'start_date', 'start_reason', 'name_org']]

def get_weekly_start_date(end_date: str, weeks:int=1) -> str:
    """Step back a whole number of weeks from a given date

    Args:
        end_date: Last day of the period, formatted as "YYYY-MM-DD"
        weeks: How many weeks before end_date the period begins

    Returns:
        The date `weeks` weeks before end_date, formatted as "YYYY-MM-DD"
    """
    date_format = "%Y-%m-%d"
    period_end = datetime.strptime(end_date, date_format)
    return (period_end - timedelta(weeks=weeks)).strftime(date_format)


def get_speeches_for_period(
    debates_df: pd.DataFrame,
    labelstore_df: pd.DataFrame,
    start_date: str,
    end_date: str,
    parties_df: pd.DataFrame = None,
) -> pd.DataFrame:
    """Get the speeches for a given period, with labels, headings and speaker party

    Args:
        debates_df: The DataFrame with debates
        labelstore_df: The DataFrame with labels
        start_date: The start date of the period, in the format "YYYY-MM-DD"
        end_date: The end date of the period, in the format "YYYY-MM-DD"
        parties_df: DataFrame with 'person_id' and 'name_org' columns (as produced
            by people_party_memberships). If omitted, the module-level variable
            `parties_df` is used, which is how this function behaved before the
            parameter was added.

    Returns:
        The speeches dated within [start_date, end_date] (inclusive), left-joined
        with 'mission_labels' and 'topic_labels', de-duplicated on speaker and
        speech text, with a combined 'headings' column and the speaker's
        'name_org' attached.

    Raises:
        NameError: If parties_df is not passed and no module-level `parties_df` exists.
    """
    if parties_df is None:
        # Backward compatibility: fall back to the notebook-level variable that
        # the function previously read implicitly.
        parties_df = globals().get("parties_df")
        if parties_df is None:
            raise NameError("name 'parties_df' is not defined; pass parties_df explicitly")
    return (
            debates_df
            .query("date >= @start_date and date <= @end_date")
            .merge(labelstore_df[["id", "mission_labels", "topic_labels"]], left_on="speech_id", right_on="id", how='left')
            .drop_duplicates(subset=["speakername", "speech"])
            .assign(
                headings=lambda df: np.where(
                    df.minor_heading.notna() & (df.minor_heading != ""),  # Check if minor_heading is not empty
                    df.major_heading.fillna("") + ": " + df.minor_heading,  # Combine both
                    df.major_heading.fillna("")  # Use only major_heading
                )
            )
            .merge(parties_df[['person_id', 'name_org']], on='person_id', how='left', suffixes=('', '_person'))
        )

def get_debates_headings(debates_df: pd.DataFrame) -> pd.DataFrame:
    """Count speeches per (date, major heading, minor heading, combined heading)

    Missing major/minor headings are replaced by empty strings first, so those
    rows are still counted. The result is ordered by date and has a 'counts' column.
    """
    group_keys = ['date', 'major_heading', 'minor_heading', 'headings']
    filled_df = debates_df.fillna({"major_heading": "", "minor_heading": ""})
    counted_df = filled_df.groupby(group_keys).agg(counts=('speech_id', 'count'))
    return counted_df.sort_values('date').reset_index()

def get_debates_major_headings(debates_df: pd.DataFrame) -> pd.DataFrame:
    """Count speeches per (date, major heading), ordered by date

    Returns a DataFrame with the columns 'date', 'major_heading' and 'counts'.
    """
    per_heading_df = (
        debates_df
        .groupby(["date", "major_heading"])
        .agg(counts=("speech_id", "count"))
    )
    return per_heading_df.sort_values("date").reset_index()


def get_debate_text_and_date(debates_df: pd.DataFrame, debate_title: str) -> Tuple[str, str]:
    """Assemble the transcript of one debate and list the dates it took place

    Args:
        debates_df: The DataFrame with debates; rows are matched on 'major_heading'
        debate_title: The title (major heading) of the debate

    Returns:
        A tuple of (transcript, dates). The transcript starts with the title and
        lists every matching speech in 'speech_id' order as "Speaker (Party)"
        followed by the speech text, separated by "-----" lines. The dates are the
        distinct 'date' values of the matching rows, in their original row order,
        joined by ", ".
    """
    matching_df = debates_df.query("major_heading == @debate_title")
    dates = ", ".join(matching_df.date.unique())

    pieces = [debate_title + "\n-----\n"]
    for row in matching_df.sort_values('speech_id').to_dict(orient='records'):
        pieces.append(
            f"{row['speakername']} ({row['name_org']})" + "\n"
            + row['speech'] + "\n"
            + "-----" + "\n"
        )
    return "".join(pieces), dates


def relevance_check(df: pd.DataFrame, threshold: int = 10, filter: Literal[None, 'relevant', 'not relevant']=None) -> pd.DataFrame:
    """Flag rows whose 'counts' reach the threshold and optionally keep one side

    Adds a boolean 'relevant' column (counts >= threshold). With filter='relevant'
    only flagged rows are returned, with filter='not relevant' only unflagged
    rows, and with any other value the full flagged frame is returned.
    """
    flagged_df = df.assign(relevant=lambda frame: frame['counts'] >= threshold)
    if filter == 'relevant':
        selection = "relevant"
    elif filter == 'not relevant':
        selection = "not relevant"
    else:
        return flagged_df
    return flagged_df.query(selection)

In [14]:
def get_keyword_hits(speech: str, keywords_dict: dict) -> pd.DataFrame:
    """Get keywords and sentences where they appear in a speech

    Args:
        speech: Full text of one speech
        keywords_dict: Mapping of category name to a list of keywords. A category
            whose name contains 'general terms' acts as a filter: a sentence only
            counts as a hit if it also matches at least one of that category's
            keywords. (Each keyword appears to be an iterable of terms, since it
            is iterated term by term when marking - TODO confirm against
            keywords.get_keywords.)

    Returns:
        A DataFrame with one row per matching sentence and the columns 'sentence',
        'category' (list of matched categories), 'keyword' (list of matched
        keywords) and 'marked_sentence' (the sentence with matched words wrapped
        in asterisks). Hits from the 'general terms' category itself are excluded.
    """

    hits_keywords = []
    hits_sentences = []
    hits_categories = []
    marked_sentences = []
    # Replace "Hon." with "Hon" (presumably so the sentence splitter does not
    # break on the abbreviation - TODO confirm)
    speech = speech.replace("Hon.", "Hon").replace("hon.", "hon")
    sents = keywords.split_sentences([speech], ids=[0])[0]

    # Fetch general filtering keywords (if several categories match, the last one wins)
    keywords_general = None
    for cat in keywords_dict:
        if 'general terms' in cat:
            keywords_general = keywords_dict[cat]
            general_cat = cat
    if keywords_general is not None:
        # Per-sentence mask: True where any general keyword matches the sentence
        general_hits = np.array([keywords.find_keyword_hits(kw, sents) for kw in keywords_general]).any(axis=0)
    else:
        # No general category: every sentence passes the filter
        general_hits = [True] * len(sents)
        general_cat = "not specified"

    for cat in keywords_dict:
        for kw in keywords_dict[cat]:
            hits = keywords.find_keyword_hits(kw, sents)
            for i, hit in enumerate(hits):
                # Keep the hit only if the sentence also passes the general filter
                if (hit and general_hits[i]):
                    hits_keywords.append(kw)
                    hits_sentences.append(sents[i])
                    hits_categories.append(cat)

                    # Add asterisks around the full words containing the matched keyword
                    marked_sentence = sents[i]
                    for keyword in kw:
                        # Regex to find substrings and expand to full words
                        pattern = r'\b(\S*' + re.escape(keyword) + r'\S*)\b'
                        marked_sentence = re.sub(pattern, r'*\1*', marked_sentence)
                    
                    marked_sentences.append(marked_sentence)   
                
    df = (
        pd.DataFrame({
            'category': hits_categories,
            'keyword': hits_keywords,
            'sentence': hits_sentences,
            'marked_sentence': marked_sentences,
        })
        # Drop hits that belong to the general filtering category itself
        .query("category != @general_cat")
        .groupby('sentence')
        # Collect all categories and keywords per sentence (lists may contain
        # repeats); keep the first marked version of the sentence
        .agg(category=('category', list), keyword=('keyword', list), marked_sentence=('marked_sentence', 'first'))
        .reset_index()
    )
    return df

In [9]:
from datetime import datetime, timedelta

def get_all_fridays_last_month(today):
    """List every Friday of the calendar month preceding the given date

    Args:
        today: Reference date as a "YYYY-MM-DD" string

    Returns:
        The Fridays of the previous month as "YYYY-MM-DD" strings, in ascending order
    """
    reference = datetime.strptime(today, "%Y-%m-%d")

    # The day before the 1st of this month is the last day of the previous month
    prev_month_end = reference.replace(day=1) - timedelta(days=1)
    prev_month_start = prev_month_end.replace(day=1)

    n_days = (prev_month_end - prev_month_start).days + 1
    month_days = (prev_month_start + timedelta(days=offset) for offset in range(n_days))

    # weekday() == 4 corresponds to Friday
    return [day.strftime("%Y-%m-%d") for day in month_days if day.weekday() == 4]

# Example: all Fridays of the month before 2024-11-03, i.e. October 2024.
# These serve as candidate message dates for back-filling weekly updates.
fridays_last_month = get_all_fridays_last_month("2024-11-03")

# Display the result
fridays_last_month

['2024-10-04', '2024-10-11', '2024-10-18', '2024-10-25']

In [10]:
# Latest party per person. get_speeches_for_period() reads this notebook-level
# variable, so this cell must run before any call to that function.
parties_df = people_party_memberships(people_dict)

In [11]:
# Keyword sets per mission code, keyed by 'ASF', 'AFS' and 'AHL'
keywords_dict = {}
for mission in ['ASF', 'AFS', 'AHL']:
    keywords_dict[mission] = keywords.get_keywords(mission)

In [110]:
# for message_date in fridays_last_month:
#     mission = 'AHL'
#     # message_date = datetime.now().strftime("%Y-%m-%d")
#     # message_date = "2024-10-10"
#     # message_date = "2024-11-10"
#     data_end_date = message_date
#     data_start_date = get_weekly_start_date(data_end_date, weeks=1)

#     # Get the speeches of the preceding week
#     weekly_speeches_df = get_speeches_for_period(
#         debates_df=debates_df,
#         labelstore_df=labelstore_df,
#         start_date=data_start_date,
#         end_date=data_end_date
#     )
#     # Select only debates related to one of the missions
#     mission_debates_df = weekly_speeches_df.query("mission_labels == @mission")
#     # Get the major headings
#     mission_debates_major_headings_df = get_debates_major_headings(mission_debates_df)
#     # Filter the debates by relevance (simple threshold)
#     debates_to_summarise_df = relevance_check(mission_debates_major_headings_df, filter='relevant')
#     # Get unique debate titles
#     debate_titles = debates_to_summarise_df.major_heading.unique()

#     print(message_date)
#     print(debate_titles)

In [15]:
def check_more_robust_keywords(mission_debates_df, keywords_dict):
    """Keep only speeches that contain at least one keyword-matching sentence

    Runs get_keyword_hits on every speech and adds the columns 'sentence',
    'category' and 'keyword' (one list per speech) plus 'n_sentences' (number of
    matching sentences). The input frame is not modified.

    Args:
        mission_debates_df: DataFrame of speeches with a 'speech' text column
        keywords_dict: Category -> keywords mapping for a single mission

    Returns:
        A copy of the input restricted to rows with n_sentences > 0
    """
    hit_frames = [
        get_keyword_hits(speech, keywords_dict)
        for speech in mission_debates_df['speech']
    ]
    tagged_df = mission_debates_df.copy()
    for column in ('sentence', 'category', 'keyword'):
        tagged_df[column] = [hits[column].to_list() for hits in hit_frames]
    tagged_df = tagged_df.assign(n_sentences=lambda df: df['sentence'].apply(len))
    return tagged_df.query("n_sentences > 0")

In [829]:
# Mission to report on; switch by uncommenting one of the alternatives
mission = 'ASF'
# mission = 'AFS'
# mission = 'AHL'
# The message date doubles as the end of the one-week data window.
# To reproduce a past update, replace it with a fixed "YYYY-MM-DD" string, e.g.:
message_date = datetime.now().strftime("%Y-%m-%d")
# message_date = "2024-11-08"
# message_date = "2024-11-22"
# message_date = "2024-12-09"
# message_date = "2024-10-10"
# message_date = "2021-05-28"
data_end_date = message_date
data_start_date = get_weekly_start_date(data_end_date, weeks=1)

# Get the speeches of the preceding week
weekly_speeches_df = get_speeches_for_period(
    debates_df=debates_df,
    labelstore_df=labelstore_df,
    start_date=data_start_date,
    end_date=data_end_date
)
# Select speeches related to the mission by matching the mission keywords against
# ALL speeches of the week (the earlier approach, filtering on the label store's
# mission_labels first, is kept below for reference)
# mission_debates_df = weekly_speeches_df.query("mission_labels == @mission")
# mission_debates_df = check_more_robust_keywords(mission_debates_df, keywords_dict[mission])
mission_debates_df = check_more_robust_keywords(weekly_speeches_df, keywords_dict[mission])
# Count keyword-matching speeches per date and major heading
mission_debates_major_headings_df = get_debates_major_headings(mission_debates_df)
# Debates with at least 5 keyword-matching speeches are candidates for a full summary
debates_to_summarise_df = relevance_check(mission_debates_major_headings_df, threshold = 5, filter='relevant')
# Get unique debate titles
debate_titles = debates_to_summarise_df.major_heading.unique()
debates_to_summarise_df

Unnamed: 0,date,major_heading,counts,relevant
4,2024-12-04,End of Radio Teleswitch Service: Rural Areas,8,True


In [816]:
importlib.reload(synthesis_utils);

In [830]:
# Summarise each candidate debate that the relevance classifier accepts.
# debate_dicts collects the summaries; not_relevant_titles collects the rejected titles.
debate_dicts = []
not_relevant_titles = []
for debate_title in debate_titles:
    # Prepare the debate text from ALL of the week's speeches under this heading,
    # not only the keyword-matching ones
    debate_text, debate_date = get_debate_text_and_date(weekly_speeches_df, debate_title)
    debate_instance = synthesis_utils.Debate(heading=debate_title, content=debate_text)  
    # Check relevance of the whole debate to the mission
    relevant = synthesis_utils.classify_relevance(debate_text, mission).relevant 
    if relevant:
        # Call LLM
        logging.info(f"Summarising debate: {debate_title}")
        result = synthesis_utils.summarise_debate_with_structure(debate_instance)
        # Get debate id: take the first matching speech_id, keep the part after the
        # last "/", and replace its final dot-separated component with 0
        debate_id = weekly_speeches_df.query("major_heading == @debate_title").speech_id.iloc[0]
        debate_id = debate_id.split("/")[-1]
        debate_id = f'{".".join(debate_id.split(".")[0:-1])}.0'        
        debate_url = f"https://www.theyworkforyou.com/debates/?id={debate_id}"
        # Prepare the debate dictionary: summary fields plus title, date and url,
        # which are the keys debate_summary() reads
        debate_dict = result.model_dump()
        debate_dict['title'] = debate_title
        debate_dict['date'] = debate_date
        debate_dict['url'] = debate_url
        debate_dicts.append(debate_dict)
    else:
        not_relevant_titles.append(debate_title)


In [831]:
not_relevant_titles

['End of Radio Teleswitch Service:  Rural Areas']

In [16]:
def get_speech_context(debates_df: pd.DataFrame, speech_dict: dict) -> str: 
    """Format a speech together with the speeches immediately before and after it

    Neighbours are taken by row position in debates_df. When the speech is the
    first (or last) row there is no previous (or next) row, so the speech itself
    is used in that slot.

    Args:
        debates_df: DataFrame of speeches with the columns 'speech_id',
            'speakername', 'name_org' and 'speech'
        speech_dict: Record of the speech of interest with the keys 'speech_id',
            'keyword' and 'sentence'

    Returns:
        A text with three sections: "# PREVIOUS SPEECH", "# SPEECH WITH KEYWORDS"
        and "# NEXT SPEECH"

    Raises:
        IndexError: If no row of debates_df has the given speech_id
    """
    # Locate the speech by row POSITION, not index label, because the neighbours
    # are fetched with iloc and the two only coincide for a default RangeIndex.
    id_matches = (debates_df['speech_id'] == speech_dict['speech_id']).to_numpy(dtype=bool, na_value=False)
    positions = id_matches.nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"speech_id not found: {speech_dict['speech_id']}")
    position = int(positions[0])

    current_speech = debates_df.iloc[position]
    # Explicit bounds checks: iloc[-1] does not raise, it wraps to the last row,
    # so the first speech would otherwise get the last speech as its "previous".
    prev_speech = debates_df.iloc[position - 1] if position > 0 else current_speech
    next_speech = debates_df.iloc[position + 1] if position < len(debates_df) - 1 else current_speech

    # Format and return the speech context
    return (
        f"# PREVIOUS SPEECH\n"
        f"Speaker: {prev_speech.speakername} ({prev_speech.name_org})\n"
        f"Full speech: {prev_speech.speech}\n"
        f"# SPEECH WITH KEYWORDS\n"
        f"Keywords: {speech_dict['keyword']}\n"
        f"Sentences with keywords: {speech_dict['sentence']}\n"
        f"Speaker: {current_speech.speakername} ({current_speech.name_org})\n"
        f"Full speech: {current_speech.speech}\n"
        f"# NEXT SPEECH\n"
        f"Speaker: {next_speech.speakername} ({next_speech.name_org})\n"
        f"Full speech: {next_speech.speech}\n"
    )

In [834]:
# Debates with few mentions
# Debates with few mentions: fewer than 5 keyword-matching speeches, i.e. the
# complement of debates_to_summarise_df
debates_not_relevant_df = relevance_check(mission_debates_major_headings_df, threshold=5, filter='not relevant')
debates_not_relevant_df

Unnamed: 0,date,major_heading,counts,relevant
0,2024-12-02,Grenfell Tower Inquiry,3,False
1,2024-12-02,"Housing, Communities and Local Government",3,False
2,2024-12-03,National Insurance Contributions (Secondary Cl...,3,False
3,2024-12-03,UK Supply Chains: Uyghur Forced Labour,2,False
5,2024-12-04,Farming and Inheritance Tax,4,False
6,2024-12-04,Scotland,2,False
7,2024-12-05,Business of the House,2,False
8,2024-12-05,Cabinet Office,3,False
9,2024-12-05,Improving Public Transport,1,False
10,2024-12-06,Spray Foam Insulation: Property Value,2,False


In [836]:
# Classify every keyword-matching speech individually and summarise the relevant ones.
speeches_to_report = []
# NOTE(review): despite its name this holds ALL major headings with keyword hits,
# not only the low-count ones in debates_not_relevant_df, so speeches from debates
# that were already summarised in full are classified here as well - confirm intended.
major_headings_not_relevant = mission_debates_major_headings_df.major_heading.unique()
for major_heading in major_headings_not_relevant:
    speech_dicts = mission_debates_df.query("major_heading == @major_heading").to_dict(orient='records')
    n_speeches = len(speech_dicts)  # not used below
    for speech_dict in speech_dicts:
        # Text given to the classifier: the speech plus its neighbouring speeches
        text = get_speech_context(weekly_speeches_df, speech_dict)
        _speech_dict = speech_dict.copy()
        _speech_dict['text'] = text
        relevant = synthesis_utils.classify_relevance(text, mission).relevant
        if relevant:
            # Only relevant speeches are summarised and kept
            summary = synthesis_utils.summarise_quote(text).summary
            _speech_dict['relevant'] = relevant
            _speech_dict['summary'] = summary
            speeches_to_report.append(_speech_dict)            


In [801]:
# for major_heading in major_headings_not_relevant:
#     speech_dicts = mission_debates_df.query("major_heading == @major_heading").to_dict(orient='records')
#     for speech_dict in speech_dicts:
#         print(speech_dict['speakername'])
#         print(speech_dict['speech'])
#         print('---')

In [839]:
# Group the reported speeches by debate heading into the structure that
# quote_debate_block() expects: {"heading", "date", "quotes": [{"summary", "url"}]}
df = pd.DataFrame(speeches_to_report)
if len(df) > 0:
    unique_headings = df.major_heading.unique()
    quote_dicts = []
    for heading in unique_headings:
        df_report = df.query("major_heading == @heading")
        speech_dicts = df_report.to_dict(orient='records')
        quotes = []
        for speech_dict in speech_dicts:
            # Link to the individual speech: the id is the part of speech_id after the last "/"
            quote_id = speech_dict['speech_id'].split("/")[-1]
            quote_url = f"https://www.theyworkforyou.com/debates/?id={quote_id}"
            quotes.append({"summary": speech_dict['summary'], "url": quote_url})
        quote_dict = {
            "heading": heading,
            # All distinct dates on which reported speeches under this heading were made
            "date": "; ".join(df_report.date.unique()),
            "quotes": quotes,
        }
        quote_dicts.append(quote_dict)
else:
    # Nothing to report: an empty frame has no 'major_heading' column to group on
    quote_dicts = []

In [17]:
def quote_debate_block(quote_dict: Dict) -> List[Dict]:
    """Build the blocks that list the highlighted speeches of one debate

    Args:
        quote_dict (Dict): Dictionary with keys "heading" (debate title), "date"
            (date text shown after the title) and "quotes", a list of dictionaries
            that each provide a "summary" and a "url".

    Returns:
        List[Dict]: A heading section, one section per quote (the summary followed
            by a "source" link), and a closing divider.
    """
    heading_block = {
        "type": "section",
        "text": {
            "type": "mrkdwn",
            "text": f"*{quote_dict['heading']}*: Highlights on {quote_dict['date']}."
        }
    }
    highlight_blocks = [
        {
            "type": "section",
            "text": {
                "type": "mrkdwn",
                "text": f"{quote['summary']} (<{quote['url']}|source>)"
            }
        }
        for quote in quote_dict['quotes']
    ]
    return [heading_block, *highlight_blocks, divider()]

In [528]:
# quotes = []
# for debate_title in debates_not_relevant_df.major_heading.to_list():
#     df = mission_debates_df.query("major_heading == @debate_title")
#     for _, row in df.iterrows():
#         cat_hits, kw_hits, sentences = get_keyword_hits(row['speech'], keywords_dict[mission])
#         if len(kw_hits) == 0:
#             continue
#         text = " .. ".join(sentences)

#         quote = {
#             "name": row['speakername'],
#             "party": row['name_org'],
#             "category": row['topic_labels'],
#             "debate": debate_title,
#             "text": text,
#             "keywords": kw_hits
#         }
#         quotes.append(quote)



In [842]:
# Each helper returns a list of blocks per debate, so the nested lists are flattened
debate_blocks = [debate_summary(debate) for debate in debate_dicts]
debate_blocks = [item for sublist in debate_blocks for item in sublist]

# quote_blocks = [quote_block(quote) for quote in quotes]
quote_blocks = [quote_debate_block(quote_dict) for quote_dict in quote_dicts]
# unnest
quote_blocks = [item for sublist in quote_blocks for item in sublist]

# Final message order: header + context, mission title, full summaries, highlights
blocks = message_header(message_date, data_start_date, data_end_date)
blocks += [mission_header(mission)]
blocks += debate_blocks
blocks += quote_blocks

In [844]:
blocks

[{'type': 'header',
  'text': {'type': 'plain_text', 'text': 'Policy update 2024-12-09'}},
 {'type': 'context',
  'elements': [{'type': 'mrkdwn',
    'text': 'House of Commons debates (2024-12-02 - 2024-12-09)'}]},
 {'type': 'section',
  'text': {'type': 'mrkdwn', 'text': ':potted_plant: *A Sustainable Future*'}},
 {'type': 'section',
  'text': {'type': 'mrkdwn',
   'text': '*Cabinet Office*: Highlights on 2024-12-05.'}},
 {'type': 'section',
  'text': {'type': 'mrkdwn',
   'text': 'Onn (Lab) asked about incorporating *green skills* to create new jobs in North East Lincolnshire, Grimsby, and Cleethorpes. (<https://www.theyworkforyou.com/debates/?id=2024-12-05b.440.5|source>)'}},
 {'type': 'divider'},
 {'type': 'section',
  'text': {'type': 'mrkdwn',
   'text': '*Spray Foam Insulation: Property Value*: Highlights on 2024-12-06.'}},
 {'type': 'section',
  'text': {'type': 'mrkdwn',
   'text': 'Gordon (Lib Dem) highlighted issues with *insulation* and *energy efficiency* from the green ho

In [584]:
# # pretty print the blocks
# import json

# print(json.dumps(blocks, indent=2))

In [813]:
importlib.reload(synthesis_utils);

In [826]:
# Send the message with blocks
# Posts the single-mission message to the testing webhook; check response.status_code afterwards
response = slack_webhook.send(blocks=blocks)

In [553]:
response.status_code

200

# All missions combined

In [18]:
# Build the Slack blocks for all three missions over the same one-week window.
# The reporting window and the week's speeches do not depend on the mission, so
# they are computed once here instead of once per mission.
message_date = datetime.now().strftime("%Y-%m-%d")
data_end_date = message_date
data_start_date = get_weekly_start_date(data_end_date, weeks=1)

# Get the speeches of the preceding week
weekly_speeches_df = get_speeches_for_period(
    debates_df=debates_df,
    labelstore_df=labelstore_df,
    start_date=data_start_date,
    end_date=data_end_date
)

# One list of blocks per mission, in the order ASF, AFS, AHL. Every list starts
# with the same two header blocks produced by message_header().
mission_blocks = []
for mission in ['ASF', 'AFS', 'AHL']:
    # Select speeches related to the mission by matching its keywords against
    # all speeches of the week
    mission_debates_df = check_more_robust_keywords(weekly_speeches_df, keywords_dict[mission])
    # Get the major headings
    mission_debates_major_headings_df = get_debates_major_headings(mission_debates_df)
    # Filter the debates by relevance (simple threshold)
    debates_to_summarise_df = relevance_check(mission_debates_major_headings_df, threshold = 5, filter='relevant')
    # Get unique debate titles
    debate_titles = debates_to_summarise_df.major_heading.unique()

    # --- Full summaries for debates with many keyword-matching speeches ---
    debate_dicts = []
    not_relevant_titles = []
    for debate_title in debate_titles:
        # Prepare the debate text
        debate_text, debate_date = get_debate_text_and_date(weekly_speeches_df, debate_title)
        debate_instance = synthesis_utils.Debate(heading=debate_title, content=debate_text)
        # Check relevance
        relevant = synthesis_utils.classify_relevance(debate_text, mission).relevant
        if relevant:
            # Call LLM
            logging.info(f"Summarising debate: {debate_title}")
            result = synthesis_utils.summarise_debate_with_structure(debate_instance)
            # Get debate id: first matching speech_id, part after the last "/",
            # with its final dot-separated component replaced by 0
            debate_id = weekly_speeches_df.query("major_heading == @debate_title").speech_id.iloc[0]
            debate_id = debate_id.split("/")[-1]
            debate_id = f'{".".join(debate_id.split(".")[0:-1])}.0'
            debate_url = f"https://www.theyworkforyou.com/debates/?id={debate_id}"
            # Prepare the debate dictionary
            debate_dict = result.model_dump()
            debate_dict['title'] = debate_title
            debate_dict['date'] = debate_date
            debate_dict['url'] = debate_url
            debate_dicts.append(debate_dict)
        else:
            not_relevant_titles.append(debate_title)

    # Debates with few mentions (kept for inspection; not used for selection below)
    debates_not_relevant_df = relevance_check(mission_debates_major_headings_df, threshold=5, filter='not relevant')

    # --- Speech-level highlights ---
    # NOTE(review): despite its name, major_headings_not_relevant covers ALL
    # headings with keyword hits, including debates summarised in full above -
    # confirm that reporting those speeches twice is intended.
    speeches_to_report = []
    major_headings_not_relevant = mission_debates_major_headings_df.major_heading.unique()
    for major_heading in major_headings_not_relevant:
        speech_dicts = mission_debates_df.query("major_heading == @major_heading").to_dict(orient='records')
        for speech_dict in speech_dicts:
            text = get_speech_context(weekly_speeches_df, speech_dict)
            _speech_dict = speech_dict.copy()
            _speech_dict['text'] = text
            relevant = synthesis_utils.classify_relevance(text, mission).relevant
            if relevant:
                summary = synthesis_utils.summarise_quote(text).summary
                _speech_dict['relevant'] = relevant
                _speech_dict['summary'] = summary
                speeches_to_report.append(_speech_dict)

    # Group the reported speeches by debate heading
    df = pd.DataFrame(speeches_to_report)
    if len(df) > 0:
        unique_headings = df.major_heading.unique()
        quote_dicts = []
        for heading in unique_headings:
            df_report = df.query("major_heading == @heading")
            speech_dicts = df_report.to_dict(orient='records')
            quotes = []
            for speech_dict in speech_dicts:
                quote_id = speech_dict['speech_id'].split("/")[-1]
                quote_url = f"https://www.theyworkforyou.com/debates/?id={quote_id}"
                quotes.append({"summary": speech_dict['summary'], "url": quote_url})
            quote_dict = {
                "heading": heading,
                "date": "; ".join(df_report.date.unique()),
                "quotes": quotes,
            }
            quote_dicts.append(quote_dict)
    else:
        # An empty frame has no 'major_heading' column to group on
        quote_dicts = []

    # --- Assemble this mission's blocks ---
    debate_blocks = [debate_summary(debate) for debate in debate_dicts]
    debate_blocks = [item for sublist in debate_blocks for item in sublist]

    # quote_blocks = [quote_block(quote) for quote in quotes]
    quote_blocks = [quote_debate_block(quote_dict) for quote_dict in quote_dicts]
    # unnest
    quote_blocks = [item for sublist in quote_blocks for item in sublist]

    blocks = message_header(message_date, data_start_date, data_end_date)
    blocks += [mission_header(mission)]
    blocks += debate_blocks
    blocks += quote_blocks

    mission_blocks.append(blocks)

2024-12-16 18:27:00,656 - root - INFO - Summarising debate: SEND Provision: Autism and ADHD


In [48]:
# Combine the missions into one message in the order ASF, AHL, AFS. The [2:] slices
# drop the two header blocks (header + context) that each mission list repeats.
_mission_blocks = mission_blocks[0] + mission_blocks[2][2:] + mission_blocks[1][2:]

In [44]:
mission_blocks[0] + mission_blocks[1][2:]

[{'type': 'header',
  'text': {'type': 'plain_text', 'text': 'Policy update 2024-12-16'}},
 {'type': 'context',
  'elements': [{'type': 'mrkdwn',
    'text': 'House of Commons debates (2024-12-09 - 2024-12-16)'}]},
 {'type': 'section',
  'text': {'type': 'mrkdwn', 'text': ':potted_plant: *A Sustainable Future*'}},
 {'type': 'section',
  'text': {'type': 'mrkdwn',
   'text': '*Finance Bill*: Highlights on 2024-12-10.'}},
 {'type': 'section',
  'text': {'type': 'mrkdwn',
   'text': 'Yang (Lab) said that the energy profits levy will raise £2.3 billion for funding *Great British Energy*, which will innovate in *green technologies* across the UK. (<https://www.theyworkforyou.com/debates/?id=2024-12-10c.843.1|source>)'}},
 {'type': 'divider'},
 {'type': 'section',
  'text': {'type': 'mrkdwn', 'text': ':hatched_chick: *A Fairer Start*'}},
 {'type': 'section',
  'text': {'type': 'mrkdwn',
   'text': '<https://www.theyworkforyou.com/debates/?id=2024-12-12a.1139.0|*SEND Provision: Autism and ADH

In [40]:
# mission_blocks[0] + mission_blocks[1][2:] # + mission_blocks[2][3:]

In [49]:
# Send the message with blocks
# Send the combined message.
# NOTE(review): the recorded status for this send is 400, i.e. Slack rejected the
# payload. Possibly the combined message exceeds a Block Kit limit (number of
# blocks or section text length) - inspect response.body to confirm.
response = slack_webhook.send(blocks=_mission_blocks)

In [50]:
response.status_code

400