In [None]:
from docx import Document
import os
import re
import json

In [None]:
def process_docx(file_path, output_dir):
    """Dump the non-empty paragraphs of a .docx file into a UTF-8 text file.

    The text file is written to ``output_dir`` under the input's base name
    with its final extension replaced by ``.txt``; each non-blank paragraph
    becomes one line.

    Args:
        file_path: Path of the Word document to read.
        output_dir: Directory that receives the text file. It is created
            (including parents) when it does not exist yet.
    """
    # Swap only the real (last) extension. A plain str.replace would also
    # rewrite a '.docx' that happens to appear in the middle of the name.
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    output_text_file = os.path.join(output_dir, stem + '.txt')

    # Load the document first so a bad input does not leave an empty output behind
    document = Document(file_path)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_text_file, 'w', encoding='utf-8') as f:
        for para in document.paragraphs:
            if para.text.strip():  # Write non-empty paragraphs only
                f.write(para.text + '\n')

def process_all_docx_in_directory(directory_path, output_dir='output'):
    """Convert every ``.docx`` file found directly in a directory to text.

    Each matching file is handed to ``process_docx`` and a progress line is
    printed for it. Files are handled in sorted name order so repeated runs
    behave the same.

    Args:
        directory_path: Directory to scan (not recursive).
        output_dir: Destination directory for the text files; defaults to
            ``'output'`` relative to the working directory and is created
            when missing.
    """
    # exist_ok avoids the check-then-create race of an explicit exists() test
    os.makedirs(output_dir, exist_ok=True)

    for file_name in sorted(os.listdir(directory_path)):
        # '~$name.docx' entries are the owner/lock files Word leaves next to
        # an open document; they are not real documents, so skip them.
        if not file_name.endswith('.docx') or file_name.startswith('~$'):
            continue

        file_path = os.path.join(directory_path, file_name)
        if not os.path.isfile(file_path):  # e.g. a directory named '*.docx'
            continue

        process_docx(file_path, output_dir)
        print(f'Processed: {file_path}')

In [None]:
# dir = 'protocols'
# output_file = 'out.txt'
# process_all_docx_in_directory(dir)

In [None]:
def hebrew_to_number(hebrew_text):
    """Convert a spelled-out Hebrew number (e.g. "שלוש-מאות-ושבע") to an int.

    The text is split into words on hyphens, the Hebrew maqaf and whitespace.
    A leading conjunction "ו" is dropped from every word. A known numeral
    followed by "מאות" is multiplied by 100, one followed by "עשרה" has 10
    added, and any other known numeral is added as is. Unknown words are
    ignored.

    Args:
        hebrew_text: The number in words; ``None`` or an empty string is
            accepted.

    Returns:
        The summed value, or 0 when no known numeral was found.
    """
    hebrew_numerals = {
        "אחד": 1, "אחת": 1, "שתיים": 2, "שתים": 2, "שלוש": 3, "ארבע": 4, "חמש": 5,
        "שש": 6, "שבע": 7, "שמונה": 8, "תשע": 9, "עשר": 10,
        "עשרים": 20, "שלושים": 30, "ארבעים": 40, "חמישים": 50,
        "שישים": 60, "שבעים": 70, "שמונים": 80, "תשעים": 90,
        "מאה": 100, "מאתיים": 200
    }

    suffixes = {
        "מאות": 100, "עשרה": 10
    }

    # Normalise every word up front so that the look-ahead below compares
    # cleaned tokens (previously the following word was compared raw, so stray
    # whitespace or a leading "ו" made the suffix go unnoticed).
    tokens = []
    for raw_token in re.split(r'[-\u05BE\s]+', hebrew_text or ''):
        token = raw_token.strip()
        if token.startswith('ו'):
            token = token[1:]
        if token:
            tokens.append(token)

    total = 0
    for i, token in enumerate(tokens):
        value = hebrew_numerals.get(token)
        if value is None:
            # Suffix words and anything unrecognised contribute nothing themselves
            continue

        following = tokens[i + 1] if i + 1 < len(tokens) else None
        if following == "מאות":
            total += value * suffixes[following]    # e.g. שלוש-מאות -> 300
        elif following == "עשרה":
            total += value + suffixes[following]    # e.g. שמונה-עשרה -> 18
        else:
            total += value

    return total

In [None]:
def parse_protocol_metadata(file_path):
    """Extract Knesset number, protocol type and protocol number for a protocol.

    The Knesset number and the type come from the file name, which is expected
    to look like ``<knesset>_<xx><m|v>_<id>.<ext>`` (e.g. ``20_ptm_313902.docx``):
    the third character of the second underscore-separated field selects
    ``'plenary'`` (``m``) or ``'committee'`` (``v``).

    The protocol number is searched for in the first 500 paragraphs/lines of
    the document, either as digits after "פרוטוקול מס'" or as a spelled-out
    Hebrew number in "הישיבה ה... של הכנסת ה...". When several lines match,
    the last one wins.

    Args:
        file_path: Path to a ``.docx`` protocol, or to a ``.txt`` dump of one
            (one paragraph per line, UTF-8). ``/`` and ``\\`` are both
            accepted as path separators when reading the file name.

    Returns:
        A tuple ``(knesset_num, protocol_type, protocol_number)`` where
        ``knesset_num`` is a string, ``protocol_type`` is ``'plenary'``,
        ``'committee'`` or ``None`` when the file name does not say, and
        ``protocol_number`` is an int (``-1`` when none was found).
    """
    # Split on either separator so a Windows-style path is handled the same
    # way, even when a directory name itself contains an underscore.
    filename = re.split(r'[\\/]', file_path)[-1]
    fields = filename.split('_')

    ## knesset num ##
    knesset_num = fields[0]

    ## type ##
    # Default to None instead of leaving the variable unbound for unknown letters
    protocol_type = None
    if len(fields) > 1 and len(fields[1]) > 2:
        protocol_type = {'m': 'plenary', 'v': 'committee'}.get(fields[1][2])

    ## protocol number ##
    protocol_number = -1

    if filename.lower().endswith('.txt'):
        # Plain-text dump: python-docx cannot open it, read the lines directly
        with open(file_path, 'r', encoding='utf-8') as text_file:
            lines = [line.rstrip('\n') for line in text_file]
    else:
        document = Document(file_path)
        lines = [para.text for para in document.paragraphs]

    for line in lines[:500]:
        match = re.search(r"פרוטוקול מס'\s*(\d+)", line)
        if match:
            # Keep the return type an int in every branch
            protocol_number = int(match.group(1))

        if re.search(r"הישיבה ה.*? של הכנסת ה.*", line):
            match = re.search(r"הישיבה ה(.*?) של", line)           # get the hebrew number only
            if match:
                hebrew_number = match.group(1).strip()              # extract the text
                parsed_number = hebrew_to_number(hebrew_number)     # use a function to determine the number
                # hebrew_to_number yields 0 for wording it cannot read; do not
                # let that overwrite the -1 sentinel or an earlier real value.
                if parsed_number:
                    protocol_number = parsed_number

    return knesset_num, protocol_type, protocol_number

In [None]:
file_path = 'protocols/20_ptm_313902.docx'
document = Document(file_path)

# Exploration: show the underline setting of the style attached to each
# non-blank paragraph (this inspects the paragraph style, not its text)
for paragraph in document.paragraphs:
    if not paragraph.text.strip():
        continue  # blank paragraph, nothing to inspect
    print(paragraph.style.font.underline)

In [None]:
def produce_corpus(dir_path):
    """Parse the metadata of every ``.docx`` protocol directly inside a directory.

    Files are visited in sorted name order. Entries that are not regular
    ``.docx`` files, and Word's ``~$`` lock files, are skipped so that they are
    never handed to the docx parser.

    Args:
        dir_path: Directory containing the protocol documents.

    Returns:
        None. NOTE(review): the parsed metadata is not stored or emitted yet.
    """
    for curr_file in sorted(os.listdir(dir_path)):
        if not curr_file.endswith('.docx') or curr_file.startswith('~$'):
            continue

        file_path = os.path.join(dir_path, curr_file)
        if not os.path.isfile(file_path):
            continue

        # Unpacking also checks that a 3-tuple came back
        knesset_num, protocol_type, protocol_num = parse_protocol_metadata(file_path)

In [None]:
# Directory holding the source .docx protocols. Named so that it does not
# rebind the built-in dir() for the rest of the kernel session.
protocols_dir = 'protocols'
produce_corpus(protocols_dir)

In [None]:
# dir_path = 'output'

# for file in os.listdir(dir_path):
#     file_path = os.path.join(dir_path, os.fsdecode(file))
#     print(f'{file}: {parse_txt_metadata(file_path)[2]}')

In [None]:
def extract_sentences(file_path):
    """Read a protocol text file and attribute each line to a speaker.

    Lines are stripped and blank ones are skipped. Nothing is collected until
    the first line that contains a chair heading (``היו"ר ...:``) or a
    ``<< דובר >> ...:`` heading; such lines only switch the current speaker.
    After that, a line is handled by the first rule that applies:

    1. ``<< tag >> name: << tag >>`` switches the speaker; any text found
       before the first ``<<`` (after its first colon) is recorded.
    2. A line containing a colon that ``is_potential_speaker`` accepts
       switches the speaker; text after the first colon is recorded.
    3. Otherwise the line is recorded for the current speaker, unless it
       contains ``<< סיום >>`` or no speaker name is set.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A list of strings formatted as ``"(<speaker>): <text>"``.
    """
    collected = []
    speaker = None
    started = False

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue

            # A chair / tagged-speaker heading starts the meeting and sets the speaker
            opens_turn = (
                re.search(r'היו"ר .*?:', text) is not None
                or re.search(r'<< דובר >> .*?:', text) is not None
            )
            if opens_turn:
                started = True
                if '<< דובר >>' in text:
                    after_tag = text.partition('>>')[2]
                    name_part = after_tag.partition(':')[0].strip()
                else:
                    name_part = text.partition(':')[0].replace('היו"ר', "").strip()
                speaker = clean_speaker_name(name_part)
                continue

            if not started:
                continue

            # Generic "<< tag >> name: << tag >>" heading
            tagged = re.search(r'<<.*?>>\s*(.*?):\s*<<.*?>>', text)
            if tagged:
                speaker = clean_speaker_name(tagged.group(1))
                before_tag = text.split("<<", 1)[0]
                speech = before_tag.split(":", 1)[-1].strip()
                if speech:
                    collected.append(f"({speaker}): {speech}")
                continue

            # Untagged heading: "name:" possibly followed by more text
            if ':' in text and is_potential_speaker(text):
                name_part, _colon, remainder = text.partition(':')
                speaker = clean_speaker_name(name_part.strip())
                remainder = remainder.strip()
                if remainder:
                    collected.append(f"({speaker}): {remainder}")
                continue

            # Ordinary line: belongs to whoever is speaking, end markers excluded
            if speaker and '<< סיום >>' not in text:
                collected.append(f"({speaker}): {text}")

    return collected

def clean_speaker_name(raw_name):
    """Reduce a raw speaker heading to the bare name.

    Removes parenthesised remarks (e.g. a party name), ``<< ... >>`` tags and
    the known titles listed below, then collapses whitespace.

    Titles are removed only when they stand as whole whitespace-delimited
    words. Matching them as plain substrings would damage names that merely
    contain a title's letters (e.g. "מר" inside "תמר" or "עומר").

    Args:
        raw_name: Heading text as cut out of the protocol line.

    Returns:
        The cleaned name with single spaces between words; may be empty.
    """
    prefixes_to_remove = [
        'היו"ר', 'יו"ר', 'ח"כ', 'ד"ר', "פרופ'", 'מר', 'גברת',
        'שר הבינוי והשיכון',
        'שר העבודה, הרווחה והשירותים החברתיים',
    ]

    # Whitespace look-arounds instead of \b: several titles contain or end
    # with a quote character, around which \b does not behave as a word edge.
    alternatives = '|'.join(re.escape(prefix) for prefix in prefixes_to_remove)
    prefix_pattern = r'(?<!\S)(?:' + alternatives + r')(?!\S)'

    # Drop remarks and tags first so a title right next to them is still
    # seen as a separate word; replace with a space to keep neighbours apart.
    cleaned_name = re.sub(r'\(.*?\)', ' ', raw_name)
    cleaned_name = re.sub(r'<<.*?>>', ' ', cleaned_name)
    cleaned_name = re.sub(prefix_pattern, ' ', cleaned_name)

    # Removing pieces leaves runs of spaces behind; normalise them
    return ' '.join(cleaned_name.split())

def is_potential_speaker(line):
    """Tell whether ``line`` looks like a speaker heading.

    A heading here is at least one character followed by a colon that closes
    the line; only whitespace may come after that colon.
    """
    heading = re.match(r'.+:\s*$', line)
    return heading is not None

In [None]:
file_path = 'output/20_ptv_490139.txt'

# parse_protocol_metadata opens its argument with python-docx, so hand it the
# source document rather than the text dump.
# Assumes the dump was produced from protocols/<same base name>.docx.
docx_path = os.path.join(
    'protocols',
    os.path.splitext(os.path.basename(file_path))[0] + '.docx',
)
knesset_num, protocol_type, protocol_number = parse_protocol_metadata(docx_path)

print(f'Knesset Number: {knesset_num}')
print(f'Protocol Type: {protocol_type}')
print(f'Protocol Number: {protocol_number}')

In [None]:
sens = extract_sentences(file_path)

# Persist the attributed sentences, one per line
sens_file = 'sens.txt'
with open(sens_file, 'w', encoding='utf-8') as f:
    for sen in sens:
        print(sen, file=f)