In [1]:
from docx import Document
import os
import re
import json

In [None]:
def process_docx(file_path, output_dir):
    """Write the non-empty paragraphs of a .docx file to a UTF-8 text file.

    The output lands in ``output_dir`` under the input's base name with its
    extension swapped for ``.txt`` (``a/b/report.docx`` ->
    ``<output_dir>/report.txt``). One paragraph is written per line.

    Args:
        file_path: Path of the .docx document to read.
        output_dir: Directory that receives the .txt file; created if missing.

    Note:
        Only ``document.paragraphs`` is read. NOTE(review): per the
        python-docx documentation this presumably excludes text inside
        tables -- confirm whether the protocols rely on table content.
    """
    # splitext swaps only the real extension; str.replace would also rewrite
    # any other '.docx' occurrence inside the file name.
    stem, _ = os.path.splitext(os.path.basename(file_path))
    output_text_file = os.path.join(output_dir, stem + '.txt')

    # Load first so an unreadable document does not leave an empty output file.
    document = Document(file_path)

    if output_dir:  # '' means the current directory, which needs no creation
        os.makedirs(output_dir, exist_ok=True)

    with open(output_text_file, 'w', encoding='utf-8') as f:
        for para in document.paragraphs:
            if para.text.strip():  # skip blank / whitespace-only paragraphs
                f.write(para.text + '\n')


def process_all_docx_in_directory(directory_path, output_dir='output'):
    """Convert every ``.docx`` file directly inside a directory to text.

    Each matching file is passed to ``process_docx`` and a ``Processed: ...``
    line is printed for it. Sub-directories are not descended into.

    Args:
        directory_path: Directory whose ``.docx`` files are converted.
        output_dir: Directory that receives the ``.txt`` files. Defaults to
            ``'output'`` (relative to the current working directory).
    """
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir order is arbitrary; sorting keeps the log reproducible.
    for file_name in sorted(os.listdir(directory_path)):
        if file_name.endswith('.docx'):
            file_path = os.path.join(directory_path, file_name)
            process_docx(file_path, output_dir)
            print(f'Processed: {file_path}')

In [None]:
# Convert every .docx protocol in the input directory to plain text.
# The variable is not called `dir` so the builtin dir() stays usable
# for the rest of the kernel session.
protocols_dir = 'protocols'
output_file = 'out.txt'  # NOTE(review): not read by any visible cell
process_all_docx_in_directory(protocols_dir)

In [None]:
# def process_sentences(file_path):
#     """
#     Process the file to extract sentences and associate them with speakers.
#     """
#     with open(file_path, 'r', encoding='utf-8') as f:
#         lines = f.readlines()
    
#     sentences = []
#     current_speaker = None

#     for line in lines:
#         line = line.strip()
#         if not line:
#             continue
        
#         # Check for speaker lines (heuristic: contains ":" at the start)
#         if line.startswith("<< דובר") or line.startswith("<< יור") or line.startswith("<< אורח"):
#             match = re.search(r'>> (.*?): <<', line)
#             if match:
#                 current_speaker = match.group(1)
#         else:
#             # Tokenize sentences
#             for sentence in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', line):
#                 if sentence.strip():
#                     sentences.append((current_speaker, sentence.strip()))

#     return sentences

# def save_to_jsonl(file_path, output_file):
#     """
#     Save the extracted data to a JSONL file.
#     """
#     name_protocol, number_knesset, type_protocol, number_protocol = parse_metadata(file_path)
#     sentences = process_sentences(file_path)

#     with open(output_file, 'w', encoding='utf-8') as f:
#         for speaker, sentence in sentences:
#             json_line = {
#                 "name_protocol": name_protocol,
#                 "number_knesset": number_knesset,
#                 "type_protocol": type_protocol,
#                 "number_protocol": number_protocol,
#                 "name_speaker": speaker,
#                 "text_sentence": sentence
#             }
#             json.dump(json_line, f, ensure_ascii=False)
#             f.write('\n')

# file_path = 'output/25_ptv_3841247.txt'
# output_file = 'corpus.jsonl'
# save_to_jsonl(file_path, output_file)
# print(f"Processed and saved to {output_file}")


In [2]:
def hebrew_to_number(hebrew_text):
    """Convert a spelled-out Hebrew number phrase to an integer.

    The phrase is split into words on hyphens (ASCII ``-``, en dash, Hebrew
    maqaf) and whitespace. A leading conjunction ``ו`` is dropped from every
    word. Each known numeral contributes its value to the sum, except when
    the next word is a suffix:

    * numeral + ``מאות``  -> numeral * 100 (e.g. ``שלוש-מאות`` = 300)
    * numeral + ``עשרה`` -> numeral + 10  (e.g. ``שלוש-עשרה`` = 13)

    Words that are not in the numeral table are ignored.

    Args:
        hebrew_text: The number phrase, e.g. ``"שלושים-ושלוש"``.

    Returns:
        The integer value; 0 when no word is recognised.
    """
    hebrew_numerals = {
        "אחד": 1, "אחת": 1, "שתיים": 2, "שתים": 2, "שלוש": 3, "ארבע": 4,
        "חמש": 5, "שש": 6, "שבע": 7, "שמונה": 8, "תשע": 9, "עשר": 10,
        "עשרים": 20, "שלושים": 30, "ארבעים": 40, "חמישים": 50,
        "שישים": 60, "שבעים": 70, "שמונים": 80, "תשעים": 90,
        "מאה": 100, "מאתיים": 200
    }

    suffixes = {
        "מאות": 100, "עשרה": 10
    }

    # Normalise every word up front so the look-ahead below compares cleaned
    # words; checking the raw neighbour missed suffixes that carried
    # surrounding whitespace or a leading 'ו'.
    words = []
    for raw_word in re.split(r'[-\u2013\u05BE\s]+', hebrew_text):
        word = raw_word.strip()
        if word.startswith('ו'):
            word = word[1:]
        if word:
            words.append(word)

    total = 0
    for i, word in enumerate(words):
        if word not in hebrew_numerals:
            # Suffix words land here too; they were already consumed by the
            # numeral that precedes them.
            continue

        value = hebrew_numerals[word]
        following = words[i + 1] if i + 1 < len(words) else None

        if following == "מאות":
            total += value * suffixes[following]
        elif following == "עשרה":
            total += value + suffixes[following]
        else:
            total += value

    return total

In [3]:
def parse_txt_metadata(file_path, max_lines=500):
    """Derive Knesset number, protocol type and protocol number for a file.

    The Knesset number and the protocol type come from the file name, which
    is expected to look like ``<knesset>_<xxT>_<id>.txt`` where the third
    character of the middle part is ``m`` (plenary) or ``v`` (committee).
    The protocol number is searched for in the first ``max_lines`` lines:

    * ``פרוטוקול מס' <digits>`` gives the number directly;
    * ``הישיבה ה<words> של הכנסת ה...`` gives it spelled out, converted with
      ``hebrew_to_number``.

    Args:
        file_path: Path of the converted ``.txt`` protocol.
        max_lines: How many leading lines to scan; ``None`` scans the whole
            file.

    Returns:
        ``(knesset_num, protocol_type, protocol_number)`` where
        ``knesset_num`` is the leading file-name part as a string,
        ``protocol_type`` is ``'plenary'``, ``'committee'`` or ``None`` when
        the file name carries no recognised type code, and
        ``protocol_number`` is an ``int`` (``-1`` when nothing was found).
    """
    filename = os.path.basename(file_path)
    name_parts = filename.split('_')

    # knesset num
    knesset_num = name_parts[0]

    # type -- default to None instead of leaving the name unbound when the
    # file name is too short or uses an unknown code.
    type_codes = {'m': 'plenary', 'v': 'committee'}
    protocol_type = None
    if len(name_parts) > 1 and len(name_parts[1]) > 2:
        protocol_type = type_codes.get(name_parts[1][2])

    # protocol number
    protocol_number = -1

    with open(file_path, 'r', encoding='utf-8') as f:
        # Stream the header instead of loading the whole file.
        for line_index, line in enumerate(f):
            if max_lines is not None and line_index >= max_lines:
                break

            # NOTE(review): a later matching line overwrites an earlier one,
            # as before -- confirm whether the first match should win.
            match = re.search(r"פרוטוקול מס'\s*(\d+)", line)
            if match:
                # int() keeps the return type the same on every path.
                protocol_number = int(match.group(1))

            if re.search(r"הישיבה ה.*? של הכנסת ה.*", line):
                match = re.search(r"הישיבה ה(.*?) של", line)   # the spelled-out number only
                if match:
                    hebrew_number = match.group(1).strip()
                    protocol_number = hebrew_to_number(hebrew_number)

    return knesset_num, protocol_type, protocol_number

In [7]:
def extract_sentences(file_path):
    """Attribute each text line of a protocol to the speaker that precedes it.

    Lines are stripped and blank lines skipped. Every remaining line is
    classified as one of:

    * a tagged speaker line, ``<<tag>> name: <<tag>>`` -- sets the current
      speaker; any text after the closing tag is recorded as their speech;
    * a plain speaker line, i.e. one ending with ``:`` -- sets the current
      speaker from the text before the first colon; text between the first
      colon and the end of the line, if any, is recorded as speech;
    * anything else -- recorded under the current speaker, unless no speaker
      has been seen yet or the line carries the ``<<סיום>>`` end marker.

    Args:
        file_path: Path of the UTF-8 text protocol.

    Returns:
        A list of strings formatted as ``"(speaker): text"``.
    """
    sentences = []
    current_speaker = None

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue

            match = re.search(r'<<.*?>>\s*(.*?):\s*<<.*?>>', line)
            if match:
                current_speaker = clean_speaker_name(match.group(1))
                # Speech on the same line follows the closing tag. Taking the
                # text before the first '<<' is always empty for a line that
                # starts with a tag, which silently dropped such speech.
                speech = line[match.end():].strip()
                if speech:
                    sentences.append(f"({current_speaker}): {speech}")
                continue

            # is_potential_speaker already requires a colon in the line.
            if is_potential_speaker(line):
                speaker_raw, _, spoken_text = line.partition(':')
                current_speaker = clean_speaker_name(speaker_raw.strip())
                spoken_text = spoken_text.strip()
                if spoken_text:
                    sentences.append(f"({current_speaker}): {spoken_text}")
                continue

            if current_speaker:
                # Skip the end-of-speech marker, with or without inner spaces.
                if re.search(r'<<\s*סיום\s*>>', line):
                    continue

                # Add the line as part of the current speaker's speech
                sentences.append(f"({current_speaker}): {line}")

    return sentences

def clean_speaker_name(raw_name):
    """Reduce a raw speaker label to the bare name.

    Removes ``<<...>>`` tags, parenthesised remarks such as a party name,
    and honorific titles, then collapses runs of whitespace.

    Titles are removed only when they stand as a whole whitespace-delimited
    word. Matching them anywhere in the string cut letters out of names that
    merely contain a title (``איתמר`` -> ``אית``, ``תמר`` -> ``ת``).

    Args:
        raw_name: The label as it appears before the colon.

    Returns:
        The cleaned name; may be empty when the label held only titles.
    """
    titles = ('היו"ר', 'יו"ר', 'ח"כ', 'ד"ר', "פרופ'", 'מר', 'גברת')
    # (?<!\S) / (?!\S) act as word boundaries that also work for titles
    # ending in a quote character, where \b would not.
    title_pattern = (
        r'(?<!\S)(?:' + '|'.join(re.escape(title) for title in titles) + r')(?!\S)'
    )

    # Replace with a space so neighbouring words never get glued together;
    # the final split/join removes the surplus whitespace.
    cleaned_name = re.sub(r'<<.*?>>', ' ', raw_name)
    cleaned_name = re.sub(r'\(.*?\)', ' ', cleaned_name)
    cleaned_name = re.sub(title_pattern, ' ', cleaned_name)
    return ' '.join(cleaned_name.split())

def is_potential_speaker(line):
    """Tell whether a line looks like a speaker label.

    A line qualifies when it has at least one character followed by a colon
    and nothing but optional whitespace after that colon.

    Returns:
        True for a match, False otherwise.
    """
    speaker_match = re.match(r'.+:\s*$', line)
    return speaker_match is not None

In [8]:
# Parse the metadata of one converted protocol and show its three fields.
file_path = 'output/25_ptv_3841247.txt'
knesset_num, protocol_type, protocol_number = parse_txt_metadata(file_path)

# One print call with a newline separator emits the same three lines.
print(
    f'Knesset Number: {knesset_num}',
    f'Protocol Type: {protocol_type}',
    f'Protocol Number: {protocol_number}',
    sep='\n',
)

Knesset Number: 25
Protocol Type: committee
Protocol Number: 33


In [9]:
# Extract the speaker-attributed lines of the protocol at `file_path`
# and dump them, one per line, to a text file for inspection.
sens = extract_sentences(file_path)

sens_file = 'sens.txt'
with open(sens_file, 'w', encoding='utf-8') as f:
    for sen in sens:
        print(sen, file=f)  # print appends the '\n' terminator