In [1]:
import json

# load libraries
import os
import re
import sys

from typing import List

import pandas as pd

from lxml import etree  # nosec B3410

from discovery_utils import PROJECT_DIR


# Add the root directory to the Python path
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))


# global variables
memberships = None
filtered_meta_data = None  # This variable stores the meta-data for each MP




In [3]:
def open_file(file_path: str) -> list[dict]:
    """
    Open the people.json file from Twfy to retrieve meta-data for the members of parliament

    Args:
        file_path(str): The path to the people.json file

    Returns:
        memberships (list): A list of dictionaries. Each dictionary contains the information related to one MP.
    """

    global memberships  # make global so that we only have to open the file once
    with open(file_path, "r") as file:
        data = json.load(file)
        memberships = data["memberships"]
    return memberships

In [220]:
def find_party(member_id: str, peron_id: str, current_date: str) -> str:
    """
    Retrieve the party affiliation of the MP who delivered the speech using the memberships list.

    Args:
        membership_id List[dict]: List of dictionaries that stores the information members of parliament.

    Returns:
        party(str): the affiliated party of the MP who gave the speech.
    """

    party = None
    for item in filtered_meta_data:
        if member_id != 'NA':
            for item in memberships:
                if item.get("id") == member_id:
                    party = item.get("on_behalf_of_id")
        else :
            new_id = peron_id
            for item in filtered_meta_data:
                if item.get("person_id") == new_id:
                    end_date = item.get('end_date').date()
                    _date = current_date.date()
                    start_date = item.get('start_date').date()
                    if start_date <= _date < end_date:
                          party = item.get("on_behalf_of_id")
    
    return party

In [227]:
from datetime import datetime

In [8]:
#create a list of members and person id to check if the function works xoxox
checker_die_check_check = ['uk.org.publicwhip/person/25429',
                           'uk.org.publicwhip/person/25363',
                           'uk.org.publicwhip/person/25363',
                           'uk.org.publicwhip/member/40616',
                           'uk.org.publicwhip/member/40413',
                           'uk.org.publicwhip/member/40077']

list_of_dates = ['2016-02-11', '2016-07-04', '2016-07-04', '2010-06-28', '2010-06-30', '2010-06-30']

In [9]:
#THIS IS IMPORTANT AND WE WILL USE THIS

membership_dict = {item["id"]: item["on_behalf_of_id"] for item in filtered_meta_data}

In [10]:
#THIS IS IMPORTANT AND WE WILL ALSO USE THIS
person_id_dict = {}
for item in filtered_meta_data:
    person_id = item['person_id']  # Corrected assignment
    if person_id in person_id_dict:
        person_id_dict[person_id].append(item)  # Append item if key exists
    else:
        person_id_dict[person_id] = [item]  # Create a new list if key doesn't exist

In [10]:
person_id_dict

{'uk.org.publicwhip/person/10001': [{'end_date': '2001-05-14',
   'end_reason': 'general_election',
   'id': 'uk.org.publicwhip/member/1',
   'identifiers': [{'identifier': '4', 'scheme': 'historichansard_id'}],
   'on_behalf_of_id': 'labour',
   'person_id': 'uk.org.publicwhip/person/10001',
   'post_id': 'uk.org.publicwhip/cons/263',
   'start_date': '1997-05-01',
   'start_reason': 'general_election'},
  {'end_date': '2005-04-11',
   'end_reason': 'general_election_standing',
   'id': 'uk.org.publicwhip/member/687',
   'identifiers': [{'identifier': '4', 'scheme': 'historichansard_id'}],
   'on_behalf_of_id': 'labour',
   'person_id': 'uk.org.publicwhip/person/10001',
   'post_id': 'uk.org.publicwhip/cons/263',
   'start_date': '2001-06-07',
   'start_reason': 'general_election'},
  {'end_date': '2010-04-12',
   'end_reason': 'general_election_standing',
   'id': 'uk.org.publicwhip/member/1604',
   'on_behalf_of_id': 'labour',
   'person_id': 'uk.org.publicwhip/person/10001',
   'po

In [12]:
count = 0
list_id = []
for key, value in person_id_dict.items():
    for item in value:
        item['end_date'] = pd.to_datetime(item['end_date'],format ='%Y-%m-%d').date()
        item['start_date'] = pd.to_datetime(item['start_date'],format ='%Y-%m-%d').date()
        
#         lenght_check = str(item['start_date'])
#         if len(lenght_check) == 4:
#             list_id.append(item['id'])
#             count += 1
#             # print(item)
# print(count)

        
        
        # item['end_date'] = pd.to_datetime(item['end_date'],format ='%Y-%m-%d').date()
        # print(item['start_date'])
        # print(item)
        # item['start_date'] = pd.to_datetime(item['start_date'],format ='%Y-%m-%d').date()
        # print(item)
    
    
    

In [13]:
member_id = 'uk.org.publicwhip/member/40616'
person_id = 'uk.org.publicwhip/person/25363'

In [283]:
membership_dict

{'uk.org.publicwhip/member/1': 'labour',
 'uk.org.publicwhip/member/687': 'labour',
 'uk.org.publicwhip/member/1604': 'labour',
 'uk.org.publicwhip/member/40289': 'labour',
 'uk.org.publicwhip/member/2': 'sinn-fein',
 'uk.org.publicwhip/member/90001': 'sinn-fein',
 'uk.org.publicwhip/member/688': 'sinn-fein',
 'uk.org.publicwhip/member/90124': 'sinn-fein',
 'uk.org.publicwhip/member/1403': 'sinn-fein',
 'uk.org.publicwhip/member/90247': 'sinn-fein',
 'uk.org.publicwhip/member/40071': 'sinn-fein',
 'uk.org.publicwhip/member/3': 'labour',
 'uk.org.publicwhip/member/689': 'labour',
 'uk.org.publicwhip/member/4': 'labour',
 'uk.org.publicwhip/member/690': 'labour',
 'uk.org.publicwhip/member/1486': 'labour',
 'uk.org.publicwhip/member/5': 'labour',
 'uk.org.publicwhip/member/691': 'labour',
 'uk.org.publicwhip/member/1516': 'labour',
 'uk.org.publicwhip/member/40185': 'labour',
 'uk.org.publicwhip/member/6': 'conservative',
 'uk.org.publicwhip/member/692': 'conservative',
 'uk.org.publicwh

In [284]:
start_date = datetime.strptime(entry['start_date'], '%Y-%m-%d')
end_date = datetime.strptime(entry['end_date'], '%Y-%m-%d')

In [14]:
def test_function(member_id, person_id, current_date):
    party = None
    debate_date = pd.to_datetime(current_date, format ='%Y-%m-%d').date()

    if member_id != 'NA':
        party = membership_dict[member_id]
        print("This is a member and this is the party", party)
        return
    else:
        list_possibilities = person_id_dict[person_id]
        for possibility in list_possibilities:
            print(possibility['start_date'])
            print(possibility['end_date'])
            if possibility['start_date'] <= debate_date <= possibility['end_date']:
                party = possibility['on_behalf_of_id']
                print(possibility)
                print("This is a person and this is the party", party)
                return


In [17]:
test_function('NA','uk.org.publicwhip/person/25429', '2016-02-11')

2015-05-08
2017-05-03
{'end_date': datetime.date(2017, 5, 3), 'end_reason': 'dissolution', 'id': 'uk.org.publicwhip/member/41313', 'on_behalf_of_id': 'labour', 'person_id': 'uk.org.publicwhip/person/25429', 'post_id': 'uk.org.publicwhip/cons/16', 'start_date': datetime.date(2015, 5, 8), 'start_reason': 'general_election'}
This is a person and this is the party labour


In [288]:
party = None
for index, item in enumerate(checker_die_check_check):
    debate_date = pd.to_datetime(list_of_dates[index], format ='%Y-%m-%d').date()

    for item in filtered_meta_data:
        if member_id != 'NA':
            party = membership_dict[member_id]
            print(index)
            print("This is a member and this is the party", party)
        else :
            list_possibilities = person_id_dict[person_id]
            for possibility in list_possibilities:
                start_date = pd.to_datetime(possibility['start_Date'],format ='%Y-%m-%d').date()
                end_date = pd.to_datetime(possibility['end_Date'],format ='%Y-%m-%d').date()      
            # Check if debate date is within the start and end dates
            if start_date <= debate_date <= end_date:
                party = possibility['on_behalf_of_id']
                print(index)
                print("This is a person and this is the party", party)
                
    


0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this is the party conservative
0
This is a member and this 

In [287]:
party

'conservative'

In [275]:
len(membership_dict)

7416

In [273]:
person_id_dict['uk.org.publicwhip/person/25429']

[{'end_date': '2017-05-03',
  'end_reason': 'dissolution',
  'id': 'uk.org.publicwhip/member/41313',
  'on_behalf_of_id': 'labour',
  'person_id': 'uk.org.publicwhip/person/25429',
  'post_id': 'uk.org.publicwhip/cons/16',
  'start_date': '2015-05-08',
  'start_reason': 'general_election'},
 {'end_date': '2019-11-06',
  'end_reason': 'dissolution',
  'id': 'uk.org.publicwhip/member/41738',
  'name': {'family_name': 'Rayner', 'given_name': 'Angela'},
  'on_behalf_of_id': 'labour',
  'person_id': 'uk.org.publicwhip/person/25429',
  'post_id': 'uk.org.publicwhip/cons/16',
  'start_date': '2017-06-09',
  'start_reason': 'general_election'},
 {'end_date': '2024-05-30',
  'end_reason': 'dissolution',
  'id': 'uk.org.publicwhip/member/42423',
  'on_behalf_of_id': 'labour',
  'person_id': 'uk.org.publicwhip/person/25429',
  'post_id': 'uk.org.publicwhip/cons/16',
  'start_date': '2019-12-13',
  'start_reason': 'general_election'}]

In [272]:
person_id_dict.keys()

dict_keys(['uk.org.publicwhip/person/10001', 'uk.org.publicwhip/person/10002', 'uk.org.publicwhip/person/10003', 'uk.org.publicwhip/person/10004', 'uk.org.publicwhip/person/10005', 'uk.org.publicwhip/person/10006', 'uk.org.publicwhip/person/10007', 'uk.org.publicwhip/person/10008', 'uk.org.publicwhip/person/10009', 'uk.org.publicwhip/person/10010', 'uk.org.publicwhip/person/10011', 'uk.org.publicwhip/person/10012', 'uk.org.publicwhip/person/10013', 'uk.org.publicwhip/person/10014', 'uk.org.publicwhip/person/10015', 'uk.org.publicwhip/person/10016', 'uk.org.publicwhip/person/10017', 'uk.org.publicwhip/person/10018', 'uk.org.publicwhip/person/10019', 'uk.org.publicwhip/person/10020', 'uk.org.publicwhip/person/10021', 'uk.org.publicwhip/person/10022', 'uk.org.publicwhip/person/10023', 'uk.org.publicwhip/person/10024', 'uk.org.publicwhip/person/10025', 'uk.org.publicwhip/person/10026', 'uk.org.publicwhip/person/10027', 'uk.org.publicwhip/person/10028', 'uk.org.publicwhip/person/10029', 'uk

In [263]:
person_id_list = []
person_id = 'uk.org.publicwhip/person/25429'
person_id_dict = {}
for item in filtered_meta_data:
    if item['person_id'] == person_id:
        person_id_list.append(item)
    person_id_dict[person_id] = person_id_list




In [267]:
entries = person_id_dict.get('uk.org.publicwhip/person/25429', [])

debate_date = datetime.strptime('2016-02-11', '%Y-%m-%d')
    
for entry in entries:
    print(entry)
    start_date = datetime.strptime(entry['start_date'], '%Y-%m-%d')
    end_date = datetime.strptime(entry['end_date'], '%Y-%m-%d')
        
    # Check if debate date is within the start and end dates
    if start_date <= debate_date <= end_date:
        print(entry['on_behalf_of_id'])
        break

{'end_date': '2017-05-03', 'end_reason': 'dissolution', 'id': 'uk.org.publicwhip/member/41313', 'on_behalf_of_id': 'labour', 'person_id': 'uk.org.publicwhip/person/25429', 'post_id': 'uk.org.publicwhip/cons/16', 'start_date': '2015-05-08', 'start_reason': 'general_election'}
labour


In [264]:
person_id_dict

{'uk.org.publicwhip/person/25429': [{'end_date': '2017-05-03',
   'end_reason': 'dissolution',
   'id': 'uk.org.publicwhip/member/41313',
   'on_behalf_of_id': 'labour',
   'person_id': 'uk.org.publicwhip/person/25429',
   'post_id': 'uk.org.publicwhip/cons/16',
   'start_date': '2015-05-08',
   'start_reason': 'general_election'},
  {'end_date': '2019-11-06',
   'end_reason': 'dissolution',
   'id': 'uk.org.publicwhip/member/41738',
   'name': {'family_name': 'Rayner', 'given_name': 'Angela'},
   'on_behalf_of_id': 'labour',
   'person_id': 'uk.org.publicwhip/person/25429',
   'post_id': 'uk.org.publicwhip/cons/16',
   'start_date': '2017-06-09',
   'start_reason': 'general_election'},
  {'end_date': '2024-05-30',
   'end_reason': 'dissolution',
   'id': 'uk.org.publicwhip/member/42423',
   'on_behalf_of_id': 'labour',
   'person_id': 'uk.org.publicwhip/person/25429',
   'post_id': 'uk.org.publicwhip/cons/16',
   'start_date': '2019-12-13',
   'start_reason': 'general_election'}]}

In [262]:
person_id_list

[{'end_date': '2017-05-03',
  'end_reason': 'dissolution',
  'id': 'uk.org.publicwhip/member/41313',
  'on_behalf_of_id': 'labour',
  'person_id': 'uk.org.publicwhip/person/25429',
  'post_id': 'uk.org.publicwhip/cons/16',
  'start_date': '2015-05-08',
  'start_reason': 'general_election'},
 {'end_date': '2019-11-06',
  'end_reason': 'dissolution',
  'id': 'uk.org.publicwhip/member/41738',
  'name': {'family_name': 'Rayner', 'given_name': 'Angela'},
  'on_behalf_of_id': 'labour',
  'person_id': 'uk.org.publicwhip/person/25429',
  'post_id': 'uk.org.publicwhip/cons/16',
  'start_date': '2017-06-09',
  'start_reason': 'general_election'},
 {'end_date': '2024-05-30',
  'end_reason': 'dissolution',
  'id': 'uk.org.publicwhip/member/42423',
  'on_behalf_of_id': 'labour',
  'person_id': 'uk.org.publicwhip/person/25429',
  'post_id': 'uk.org.publicwhip/cons/16',
  'start_date': '2019-12-13',
  'start_reason': 'general_election'}]

In [None]:
for item in filtered_meta_data:
    membership_dict[item["id"]].append(item["on_behalf_of_id"])

In [254]:
filtered_meta_data[0]

{'end_date': '2001-05-14',
 'end_reason': 'general_election',
 'id': 'uk.org.publicwhip/member/1',
 'identifiers': [{'identifier': '4', 'scheme': 'historichansard_id'}],
 'on_behalf_of_id': 'labour',
 'person_id': 'uk.org.publicwhip/person/10001',
 'post_id': 'uk.org.publicwhip/cons/263',
 'start_date': '1997-05-01',
 'start_reason': 'general_election'}

In [257]:
for item in filtered_meta_data:
    if item['person_id'] == 'uk.org.publicwhip/person/25429':
        print(item)

{'end_date': '2017-05-03', 'end_reason': 'dissolution', 'id': 'uk.org.publicwhip/member/41313', 'on_behalf_of_id': 'labour', 'person_id': 'uk.org.publicwhip/person/25429', 'post_id': 'uk.org.publicwhip/cons/16', 'start_date': '2015-05-08', 'start_reason': 'general_election'}
{'end_date': '2019-11-06', 'end_reason': 'dissolution', 'id': 'uk.org.publicwhip/member/41738', 'name': {'family_name': 'Rayner', 'given_name': 'Angela'}, 'on_behalf_of_id': 'labour', 'person_id': 'uk.org.publicwhip/person/25429', 'post_id': 'uk.org.publicwhip/cons/16', 'start_date': '2017-06-09', 'start_reason': 'general_election'}
{'end_date': '2024-05-30', 'end_reason': 'dissolution', 'id': 'uk.org.publicwhip/member/42423', 'on_behalf_of_id': 'labour', 'person_id': 'uk.org.publicwhip/person/25429', 'post_id': 'uk.org.publicwhip/cons/16', 'start_date': '2019-12-13', 'start_reason': 'general_election'}


In [252]:
person_id_dict = {item["person_id"]: item[["on_behalf_of_id", "start_date", "end_date"]] for item in filtered_meta_data}

TypeError: unhashable type: 'list'

In [250]:
person_id_dict

{'uk.org.publicwhip/person/10001': 'labour',
 'uk.org.publicwhip/person/10002': 'sinn-fein',
 'uk.org.publicwhip/person/10003': 'labour',
 'uk.org.publicwhip/person/10004': 'labour',
 'uk.org.publicwhip/person/10005': 'labour',
 'uk.org.publicwhip/person/10006': 'conservative',
 'uk.org.publicwhip/person/10007': 'liberal-democrat',
 'uk.org.publicwhip/person/10008': 'labour',
 'uk.org.publicwhip/person/10009': 'conservative',
 'uk.org.publicwhip/person/10010': 'conservative',
 'uk.org.publicwhip/person/10011': 'labour',
 'uk.org.publicwhip/person/10012': 'labour',
 'uk.org.publicwhip/person/10013': 'conservative',
 'uk.org.publicwhip/person/10014': 'labour',
 'uk.org.publicwhip/person/10015': 'liberal-democrat',
 'uk.org.publicwhip/person/10016': 'labour',
 'uk.org.publicwhip/person/10017': 'labour',
 'uk.org.publicwhip/person/10018': 'labour',
 'uk.org.publicwhip/person/10019': 'conservative',
 'uk.org.publicwhip/person/10020': 'conservative',
 'uk.org.publicwhip/person/10021': 'labou

In [None]:
party = []
for index,item in enumerate(checker_die_check_check):
    if member_id != 'NA':
        # Create a dictionary for fast lookups of memberships by ID
        membership_dict = {item["id"]: item["on_behalf_of_id"] for item in memberships}

        # return membership_dict.get(member_id)

    # If member_id is 'NA', check filtered_meta_data for person_id
    for item in filtered_meta_data:
        if item.get("person_id") == person_id:
            start_date = item.get('start_date').date()
            end_date = item.get('end_date').date()

            if start_date <= current_date_obj < end_date:
                # return item.get("on_behalf_of_id")

In [None]:
from datetime import datetime

def find_party(member_id: str, person_id: str, current_date: str, memberships: List[Dict], filtered_meta_data: List[Dict]) -> Optional[str]:
# Convert current_date string to a date object
    current_date_obj = datetime.strptime(current_date, '%Y-%m-%d').date()

    if member_id != 'NA':
        # Create a dictionary for fast lookups of memberships by ID
        membership_dict = {item["id"]: item["on_behalf_of_id"] for item in memberships}
        return membership_dict.get(member_id)

    # If member_id is 'NA', check filtered_meta_data for person_id
    for item in filtered_meta_data:
        if item.get("person_id") == person_id:
            start_date = item.get('start_date').date()
            end_date = item.get('end_date').date()

            if start_date <= current_date_obj < end_date:
                return item.get("on_behalf_of_id")

    return None

In [221]:

# --------------------------------------------------------------------------------------------------------------------
# Define functions:

def FilePaths(directory: str) -> List[str]:
    """
    Create a list that stores the path files to the debates

    Args:
        directory (str): The filepath that leads you to where you have stored the debate transcripts.


    Returns:
        list: list of filepaths to the debates.
    """

    filenamelist = []
    for path, _, filenames in os.walk(directory):
        for file in filenames:
            filenamelist.append(os.path.join(path, file))
    return filenamelist


def select_debates_per_year(filelist: List[str], current_year: int) -> List[str]:
    """
    From the list that stores the path files to the debates, select the ones that occurred in the year you are processing.

    Args:
        filelist (list): the list that has all the debatefiles stored
        current_year (int): current_year is the year of which you want to retrieve the debates

    Returns:
        year_list list[str]: list of the debate files for the current_year.
    """
    year_list = []
    for i in filelist:
        match = re.search(str(current_year), i)
        if match:
            year_list.append(i)
    return year_list


def extract_text(elem: object) -> str:
    """
    Retrieve the text of the speeches from the lxml boject.

    Args:
        elem (object): xml element with elem.tag 'speech'

    Returns:
      speech_text (str): The text of the speech.

    """
    speech_text = ""
    # Replace <br> tags with newlines
    for br in elem.xpath(".//br"):
        br.tail = " " + (br.tail if br.tail else "")
    # Extract the full text content
    speech_text = "".join(elem.xpath(".//text()"))
    return speech_text


def clean_text(text: str) -> str:
    """
    Clean the speeches by removing white spaces and white lines.

    Args:
        text (str): the text of the speech

    Returns:
        connect_speech (str): the cleaned version of the speech
    """
    cleaned_text = text.strip()  # removes spaces, tabs, newlines
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    cleaned_text = cleaned_text.replace("\t", "")
    lines = cleaned_text.splitlines()  # split the text into seperate lines
    non_empty_lines = [line for line in lines if line.strip()]  # remove all the blank lines in the speech
    connect_speech = " ".join(non_empty_lines)  # concatenate all the non-blank lines from the speech.
    return connect_speech


# def find_party(membership_id: str) -> str:
#     """
#     Retrieve the party affiliation of the MP who delivered the speech using the memberships list.

#     Args:
#         membership_id str: membership id you retrieved from theyworkforyou.

#     Returns:
#         party(str): the affiliated party of the MP who gave the speech.
#     """
#     party = None
#     for item in memberships:
#         if item.get("id") == membership_id:
#             party = item.get("on_behalf_of_id")
#     return party


# Retrieve meta-data from MPs to assign each speaker to their party
# def open_file(file_path: str) -> list[dict]:
#     """
#     Open the people.json file from Twfy to retrieve meta-data for the members of parliament

#     Args:
#         file_path(str): The path to the people.json file

#     Returns:
#         memberships (list): A list of dictionaries. Each dictionary contains the information related to one MP.
#     """

#     global memberships  # make global so that we only have to open the file once
#     with open(file_path, "r") as file:
#         data = json.load(file)
#         memberships = data["memberships"]
#     return memberships


def get_speeches(temp_list: list, current_year: int) -> List[dict]:
    """
    Parse the debatefile and retrieve the following information form the file:

        - Major_heading: Major debate heading
        - Minor_heading:  sub title of a debate
        - Speeches: The text of the speech
        - speakername = The name of the MP who gave the speech
        - speaker_id = The member_id given to the MP by Twfy (mysociety)
        - party_speaker = The party the MP who gave the speech belongs to
        - person_id = The person_id given to the MP by Twfy (mysociety)
        - speech_type = The type of speech given.


    Args:
        temp_list (list): list that stores the debates to analyse.
        current_year (int): passes the year that is being processed

    Returns:
        data (list): stores the dictionaries that contain the information of every speech.
                    Every dictionary contains the information of one speech.
    """

    data = []
    for item in temp_list:
        parser = etree.XMLParser(dtd_validation=False)
        tree = etree.parse(item, parser)  # nosec B320
        condition = bool(tree.xpath("//minor-heading"))

        if condition:
            root = tree.getroot()
            context = root.findall("./")

            debate_year = str(current_year)
            if len(item) == 98:
                date_current = item[-15:-5]
            else:
                date_current = item[-14:-4]
            
            current_major_heading = None
            current_minor_heading = None

            # Iterate over the elements
            for elem in context:
                party_speaker = None

                if elem.tag == "major-heading":
                    store_text = clean_text(elem.text)
                    # Update major heading and reset minor heading if it's a new major heading
                    if store_text != current_major_heading:
                        current_major_heading = store_text
                        current_minor_heading = None
                elif elem.tag == "minor-heading":
                    current_minor_heading = clean_text(elem.text)
                elif elem.tag == "speech":
                    # get attributes
                    sp_id = elem.get("id")
                    speakername = elem.get("speakername", "NA")
                    speaker_id = elem.get("speakerid", "NA")
                    person_id = elem.get("person_id", "NA")
                    party_speaker = find_party(speaker_id, person_id, date_current)
                    # if speaker_id == 'NA':
                    #     party_speaker = find_party(person_id, date_current)
                    # else:
                    #     party_speaker = find_party(speaker_id, date_current)
                        
                    speech_type = elem.get("type", "NA")

                    # get actual speech
                    speechtext = extract_text(elem)
                    clean_speech = clean_text(speechtext)
                    data.append(
                        {
                            "major_heading": current_major_heading,
                            "minor_heading": current_minor_heading,
                            "speech": clean_speech,
                            "speech_id": sp_id,
                            "speakername": speakername,
                            "speaker_id": speaker_id,
                            "person_id": person_id,
                            "party_speaker": party_speaker,
                            "speech_type": speech_type,
                            "year": debate_year,
                            "date": date_current,
                        }
                    )
     

    return data


# # getthedata
# if __name__ == "__main__":

#     # Retrieve meta-data from MPs to assign each speaker to their party.
#     path = PROJECT_DIR / "policy_scanning/data/parlparse/members/people.json"
#     meta_data = open_file(path)

#     # Select the paths to the debate files
#     data_path = PROJECT_DIR / "policy_scanning/data/scrapedxml/debates"
#     list_of_datafiles = sorted(FilePaths(data_path))

#     list_of_df_debates = []  # Every dataframe in this list represents the debates held in one year.

#     for year in range(2005, 2025):
#         print("processing:", year)  # noqa:<T001>

#         # Get all the debates that belong to one year
#         one_year_list = select_debates_per_year(list_of_datafiles, year)

#         # Get the speeches
#         debates = get_speeches(one_year_list, year)
#         df_one_year = pd.DataFrame(debates)
#         list_of_df_debates.append(df_one_year)

#     # Concatenate all DataFrames in the list
#     combined_df = pd.concat(list_of_df_debates, ignore_index=True)

#     folder_path = PROJECT_DIR / "policy_scanning/data"
#     os.makedirs(folder_path, exist_ok=True)
#     file_parquet = os.path.join(folder_path, "HansardDebates.parquet")
#     # parquet_file
#     combined_df.to_parquet(file_parquet, engine="pyarrow")

In [4]:
# Retrieve meta-data from MPs to assign each speaker to their party.
path = PROJECT_DIR / "policy_scanning/data/parlparse/members/people.json"
meta_data = open_file(path)

In [5]:
from typing import List, Dict

def clean_meta_data(meta_data: List[Dict]) -> List[Dict]:
    global filtered_meta_data
    only_2000 = []

    start_date = None
    
    for item in meta_data:
        # Check if 'start_date' and 'end_date' are present
        if 'start_date' in item:
            start_date = str(item['start_date']) 
            if len(start_date) > 4:
                if 'end_date' in item:
                    if item['end_date'][:-8] == '20':
                        if 'on_behalf_of_id' in item:
                            only_2000.append(item)
            # if and 'end_date' in item:
            # # Check if the year in 'end_date' is '2000'
            # if item['end_date'][:4] == '2000':  # Assuming end_date is in 'YYYY-MM-DD' format
            #     only_2000.append(item)
    filtered_meta_data = only_2000
    return only_2000

In [6]:
meta_check = clean_meta_data(meta_data)

In [7]:
len(meta_check)

6976

In [225]:
# # Retrieve meta-data from MPs to assign each speaker to their party.
# path = PROJECT_DIR / "policy_scanning/data/parlparse/members/people.json"
# meta_data = open_file(path)

# Select the paths to the debate files
data_path = PROJECT_DIR / "policy_scanning/data/scrapedxml/debates"
list_of_datafiles = sorted(FilePaths(data_path))

list_of_df_debates = []  # Every dataframe in this list represents the debates held in one year.

for year in range(2015, 2016):
    print("processing:", year)  # noqa:<T001>

    # Get all the debates that belong to one year
    one_year_list = select_debates_per_year(list_of_datafiles, year)
    # print(len(list_speakers))

    # Get the speeches
    # list_of_speakers.append(list_speakers)
    debates = get_speeches(one_year_list, year)
    df_one_year = pd.DataFrame(debates)
    list_of_df_debates.append(df_one_year)

# # Concatenate all DataFrames in the list
# combined_df = pd.concat(list_of_df_debates, ignore_index=True)

# folder_path = PROJECT_DIR / "policy_scanning/data"
# os.makedirs(folder_path, exist_ok=True)
# file_parquet = os.path.join(folder_path, "HansardDebates.parquet")
# # parquet_file
# combined_df.to_parquet(file_parquet, engine="pyarrow")

processing: 2015


KeyboardInterrupt: 