In [3]:
from docx import Document
import os
import re
import json

In [2]:
def process_docx(file_path, output_dir):
    """Dump the non-empty paragraphs of a .docx file into a UTF-8 text file.

    The text file is created inside ``output_dir`` under the input's base
    name with its final extension replaced by ``.txt``.

    Args:
        file_path: Path of the .docx document to read.
        output_dir: Directory that receives the .txt file. It is not created
            here, so it must already exist.
    """
    # splitext swaps only the final extension. str.replace('.docx', '.txt')
    # would rewrite every '.docx' occurrence in the name and would leave a
    # name with a different extension (e.g. '.DOCX') untouched.
    stem, _ = os.path.splitext(os.path.basename(file_path))
    output_text_file = os.path.join(output_dir, stem + '.txt')

    # Process the document and write one paragraph per line
    document = Document(file_path)
    with open(output_text_file, 'w', encoding='utf-8') as f:
        for para in document.paragraphs:
            if para.text.strip():  # skip empty / whitespace-only paragraphs
                f.write(para.text + '\n')


def process_all_docx_in_directory(directory_path, output_dir='output'):
    """Convert every .docx file directly inside ``directory_path`` to text.

    Each matching file is passed to ``process_docx`` and a progress line is
    printed for it. Sub-directories are not traversed.

    Args:
        directory_path: Directory to scan for ``.docx`` files.
        output_dir: Directory that receives the .txt files; created if
            missing. Defaults to ``'output'``.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing (and log) order is the same on every run;
    # os.listdir returns entries in arbitrary order.
    for file_name in sorted(os.listdir(directory_path)):
        if not file_name.endswith('.docx'):
            continue
        # Word creates '~$<name>.docx' lock files next to an open document;
        # they are not valid documents and cannot be parsed.
        if file_name.startswith('~$'):
            continue
        file_path = os.path.join(directory_path, file_name)
        process_docx(file_path, output_dir)
        print(f'Processed: {file_path}')

In [None]:
# Directory holding the source .docx protocols. Deliberately not called
# `dir`, which would shadow the built-in dir() for the rest of the kernel
# session.
protocols_dir = 'protocols'
output_file = 'out.txt'  # not read by this cell
process_all_docx_in_directory(protocols_dir)

In [None]:
# metadata of a file
def parse_metadata(file_path):
    """Collect protocol metadata from a text file's name and its first lines.

    The file name is expected to look like ``<knesset>_<type code>_<id>.txt``
    (e.g. ``25_ptv_3841247.txt``). The third character of the type code
    selects the protocol type: 'm' -> 'plenary', 'v' -> 'committee'. The
    protocol number is taken from a "פרוטוקול מס' <digits>" header found
    within the first 10 lines; if several lines match, the last one wins.

    Args:
        file_path: Path to the UTF-8 text file.

    Returns:
        Tuple ``(name_protocol, number_knesset, type_protocol,
        number_protocol)``. ``name_protocol`` is the file's base name; each
        of the other three is a string, or ``None`` when it cannot be
        determined.
    """
    # basename handles the platform's path separator, unlike split('/').
    name_protocol = os.path.basename(file_path)

    # Knesset number and protocol type are encoded in the file name.
    number_knesset = None
    type_protocol = None
    name_parts = name_protocol.split('_')
    if len(name_parts) >= 2:
        if name_parts[0].isdigit():
            number_knesset = name_parts[0]
        type_code = name_parts[1]
        if len(type_code) >= 3:
            type_protocol = {'m': 'plenary', 'v': 'committee'}.get(type_code[2])

    # Only the header is needed, so read at most 10 lines instead of the
    # whole file. zip() stops as soon as range(10) is exhausted.
    number_protocol = None
    with open(file_path, 'r', encoding='utf-8') as f:
        for _, line in zip(range(10), f):
            match = re.search(r"פרוטוקול מס'\s*(\d+)", line)
            if match:
                number_protocol = match.group(1)

    return name_protocol, number_knesset, type_protocol, number_protocol

def process_sentences(file_path):
    """
    Read a protocol text file and pair every sentence with its speaker.

    A line beginning with one of the speaker tags ("<< דובר", "<< יור",
    "<< אורח") is treated as a speaker header: when it also matches
    ``>> <name>: <<`` the name becomes the current speaker. A header line
    never contributes sentences, whether or not a name was found in it.
    Every other non-blank line is split into sentences at whitespace that
    follows '.' or '?'.

    Returns a list of ``(speaker, sentence)`` tuples in file order; the
    speaker is ``None`` for text that precedes the first recognised header.
    """
    speaker_tags = ("<< דובר", "<< יור", "<< אורח")
    speaker_re = re.compile(r'>> (.*?): <<')
    # Split on whitespace after '.' or '?', except after patterns such as
    # "x.y." or a capitalised two-letter abbreviation like "Mr.".
    boundary_re = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')

    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = [raw.strip() for raw in handle]

    result = []
    speaker = None
    for text in stripped_lines:
        if not text:
            continue

        if text.startswith(speaker_tags):
            found = speaker_re.search(text)
            if found:
                speaker = found.group(1)
            continue

        for piece in boundary_re.split(text):
            piece = piece.strip()
            if piece:
                result.append((speaker, piece))

    return result

def save_to_jsonl(file_path, output_file):
    """
    Write one JSON object per sentence of a protocol file to ``output_file``.

    The metadata returned by ``parse_metadata`` is repeated on every record,
    followed by the speaker and sentence produced by ``process_sentences``.
    The output file is opened in 'w' mode, so existing content is replaced.
    """
    name, knesset, kind, number = parse_metadata(file_path)
    shared_fields = {
        "name_protocol": name,
        "number_knesset": knesset,
        "type_protocol": kind,
        "number_protocol": number,
    }
    speaker_sentence_pairs = process_sentences(file_path)

    with open(output_file, 'w', encoding='utf-8') as out:
        for who, text in speaker_sentence_pairs:
            record = {**shared_fields, "name_speaker": who, "text_sentence": text}
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            out.write(json.dumps(record, ensure_ascii=False))
            out.write('\n')

# Convert one extracted protocol text file into the JSONL corpus file.
output_file = 'corpus.jsonl'
file_path = 'output/25_ptv_3841247.txt'
save_to_jsonl(file_path=file_path, output_file=output_file)
print(f"Processed and saved to {output_file}")


In [9]:
class Protocol:
    """Plain record holding three attributes that describe a protocol.

    Attributes:
        knesset_num: Value passed as ``knesset_num``.
        type: Value passed as ``type``.
        protocol_num: Value passed as ``protocol_num``.
    """

    def __init__(self, knesset_num, type, protocol_num):
        # Values are stored as given; no validation or conversion is done.
        self.knesset_num, self.type, self.protocol_num = (
            knesset_num,
            type,
            protocol_num,
        )

In [24]:
file_path = 'output/25_ptv_3841247.txt'
# basename handles the platform's path separator, unlike split('/').
filename = os.path.basename(file_path)
data = []

# The file name looks like '<knesset>_<type code>_<id>.txt'; split it once.
name_parts = filename.split('_')

# knesset num
number_knesset = name_parts[0]
data.append(number_knesset)

# type: selected by the third character of the type code (e.g. 'ptv').
# .get() yields None for an unrecognised character, so type_protocol is
# always assigned here rather than raising NameError or silently reusing a
# value left in the kernel by an earlier run.
type_protocol = {'m': 'plenary', 'v': 'committee'}.get(name_parts[1][2])
data.append(type_protocol)

# protocol number: taken from a "פרוטוקול מס' <digits>" header within the
# first 10 lines (the last matching line wins).
number_protocol = None

with open(file_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()

for line in lines[:10]:
    match = re.search(r"פרוטוקול מס'\s*(\d+)", line)
    if match:
        number_protocol = match.group(1)
data.append(number_protocol)

data

['25', 'committee', '33']