In [43]:
from docx import Document
import os
import re
import json
import pandas as pd

In [44]:
def hebrew_to_number(hebrew_text):
    hebrew_numerals = {
        "אחד": 1, "שתיים": 2, "שלוש": 3, "ארבע": 4, "חמש": 5,
        "שש": 6, "שבע": 7, "שמונה": 8, "תשע": 9, "עשר": 10,
        "עשרים": 20, "שלושים": 30, "ארבעים": 40, "חמישים": 50,
        "שישים": 60, "שבעים": 70, "שמונים": 80, "תשעים": 90,
        "מאה": 100, "מאתיים": 200
    }

    suffixes = {
        "מאות": 100, "עשרה" : 10
    }

    total = 0
    parts = re.split(r'-', hebrew_text)
    # by now we should have an array of the hebrew numbers/suffixes

    for i, part in enumerate(parts):
        part = part.strip()
        
        if part.startswith('ו'):
            part = part[1:]

        if part in hebrew_numerals:
            if i + 1 < len(parts) and parts[i+1] in suffixes:
                if parts[i+1] == "מאות":
                    total += hebrew_numerals[part] * suffixes[parts[i+1]]
                elif parts[i+1] == "עשרה":
                    total += hebrew_numerals[part] + suffixes[parts[i+1]]
                continue
            else:
                total += hebrew_numerals[part]
                
    return total

In [45]:
def parse_protocol_metadata(file_path):
    filename = file_path.split('/')[-1]

    ## knesset num ##
    knesset_num = filename.split('_')[0]
    knesset_num = knesset_num.split('\\')[-1]

    ## type ##
    if filename.split('_')[1][2] == 'm':
        protocol_type = 'plenary'
    elif filename.split('_')[1][2] == 'v':
        protocol_type = 'committee'

    ## protocol number ##
    protocol_number = -1

    document = Document(file_path)
    lines = [para.text for para in document.paragraphs]

    for line in lines[:500]:
        if "פרוטוקול מס'" in line:
            match = re.search(r"פרוטוקול מס'\s*(\d+)", line)
            if match:
                protocol_number = match.group(1)
        if re.search(r"הישיבה ה.*? של הכנסת ה.*", line):
            match = re.search(r"הישיבה ה(.*?) של", line)           # get the hebrew number only
            hebrew_number = match.group(1).strip()                  # extract the text
            protocol_number = hebrew_to_number(hebrew_number)       # use a function to determine the number
    
    return knesset_num, protocol_type, protocol_number

In [46]:
def clean_speaker_name(raw_name):
    prefixes_to_remove = [
        r'היו"ר', r'יו"ר', r'ח"כ', r'ד"ר', r'פרופ\'',
        r'מר', r'גברת',
        r'שר', r'שרת', r'השר', r'השרה',
        r'הבינוי והשיכון',
        r'העבודה הרווחה והשירותים החברתיים',
        r'האוצר',
        r'מזכיר המדינה',
        r'סגנית מזכיר הכנסת',
        r'התחבורה',
        r'הפנים',
        r'המשפטים',
        r'החינוך והתרבות',
        r'הכלכלה והתכנון',
        r'לביטחון פנים'
    ]

    # Create a regex pattern to match any of the prefixes
    prefix_pattern = r'\b(?:' + r'|'.join(prefixes_to_remove) + r')\b'

    cleaned_name = re.sub(r'\(.*?\)', '', raw_name)
    cleaned_name = re.sub(prefix_pattern, '', cleaned_name)
    cleaned_name = re.sub(r'[<>]', '', cleaned_name)
    return cleaned_name.strip()

In [None]:
def extract_sentences(file_path):
    data = []
    found_first_speaker = False

    doc = Document(file_path)
    speaker = None
    speech = []

    for para in doc.paragraphs:
        if(not found_first_speaker):    # beggining was not found
            try:
                is_underlined = (
                    para.style.font.underline or 
                    (para.style.base_style and para.style.base_style.font.underline) or 
                    (para.runs and para.runs[0].underline)
                )
            except AttributeError:
                is_underlined = False
            
            if is_underlined and re.search(r'היו"ר .*:', para.text):
                match = re.match(r'(.*?):\s*(.*)', para.text)
                if match:
                    speaker = match.group(1).strip()
                    speaker = clean_speaker_name(speaker)
                    speech = [match.group(2).strip()]
                    if speech != ['']:
                        data.append({"Speaker": speaker, "Speech": " ".join(speech)})
                found_first_speaker = True
                continue
        else:                           # the beggining was found
            if(para.alignment == 1):    # 1:centered  
                continue                # skip centered paragraphs (titles, votes etc...)
            
            try:
                is_underlined = (
                    para.style.font.underline or 
                    (para.style.base_style and para.style.base_style.font.underline) or 
                    (para.runs and para.runs[0].underline)
                )
            except AttributeError:
                is_underlined = False

            if ":" in para.text and is_underlined:      # potential speaker
                match = re.match(r'(.*?):\s*(.*)', para.text)
                if match:
                    if speaker and speech != ['']:
                        data.append({"Speaker": speaker, "Speech": " ".join(speech)})
                        speech = []
                    
                    speaker = match.group(1).strip()
                    speaker = clean_speaker_name(speaker)
                    speech.append(match.group(2).strip())
            elif speaker and speech:
                if "הישיבה ננעלה בשעה" in speech:
                    continue
                text = para.text.strip()
                if text:
                    speech.append(text)
    
    if speaker:
        data.append({"Speaker": speaker, "Speech": " ".join(speech)})
    
    return data


In [48]:
def produce_corpus(dir_path):
    data = []
    
    for curr_file in os.listdir(dir_path):
        file_path = os.path.join(dir_path, os.fsdecode(curr_file))
        knesset_num, protocol_type, protocol_num = parse_protocol_metadata(file_path)
        data = extract_sentences(file_path)

        # write to file
        output_file = os.path.splitext(curr_file)[0] + ".txt"
        output_path = os.path.join("output", output_file)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(f"Knesset Number: {knesset_num}\n")
            f.write(f"Type: {protocol_type}\n")
            f.write(f"Protocol Number: {protocol_num}\n\n")
            for entry in data:
                f.write(f"Speaker: {entry['Speaker']}\n")
                f.write(f"Speech: {entry['Speech']}\n")
                f.write("\n")

In [49]:
dir = 'protocols'
produce_corpus(dir)

AttributeError: 'str' object has no attribute 'append'

In [None]:
# def extract_sentences(file_path):
#     sentences = []
#     current_speaker = None
#     meeting_started = False

#     with open(file_path, 'r', encoding='utf-8') as file:
#         for line in file:
#             line = line.strip()
#             if not line:
#                 continue

#             # check if the meeting has started
#             if re.search(r'היו"ר .*?:', line) or re.search(r'<< דובר >> .*?:', line):
#                 meeting_started = True  # Set the flag to True
#                 if '<< דובר >>' in line:
#                     speaker_raw = line.split('>>', 1)[-1].split(':', 1)[0].strip()
#                 else:
#                     speaker_raw = line.split(':', 1)[0].replace("היו\"ר", "").strip()
#                 current_speaker = clean_speaker_name(speaker_raw)
#                 continue

#             if not meeting_started:
#                 continue

#             match = re.search(r'<<.*?>>\s*(.*?):\s*<<.*?>>', line)
#             if match:
#                 speaker_raw = match.group(1)
#                 current_speaker = clean_speaker_name(speaker_raw)
#                 speech = line.split("<<", 1)[0].split(":", 1)[-1].strip()
#                 if speech:
#                     sentences.append(f"({current_speaker}): {speech}")
#                 continue

#             if ':' in line and is_potential_speaker(line):
#                 parts = line.split(':', 1)
#                 speaker_raw = parts[0].strip()
#                 current_speaker = clean_speaker_name(speaker_raw)
#                 spoken_text = parts[1].strip()
#                 if spoken_text:
#                     sentences.append(f"({current_speaker}): {spoken_text}")
#                 continue  # Skip to the next line

#             if current_speaker:
#                 # Skip lines containing the "<<סיום>>" tag
#                 if '<< סיום >>' in line:
#                     continue
                
#                 # Add the line as part of the current speaker's speech
#                 sentences.append(f"({current_speaker}): {line}")
    
#     return sentences

# def is_potential_speaker(line):
#     return bool(re.match(r'.+:\s*$', line))