# Variant detector in the Ben Sira manuscripts

In [None]:
class VariantDetector:
    """
    
    """

In [4]:
import re
import os
from bs4 import BeautifulSoup, Tag
from collatex import *

In [5]:
manuscript_files = [file for file in os.listdir("../data/") if file.startswith("ms")]

In [45]:
content = {}


for manuscript_file in manuscript_files:
    
    # Claqué au sol
    manuscript_name = manuscript_file.replace("ms_","").replace(".xml","")
    
    if manuscript_name in ["c", "2Q18"]: # Not for manuscripts c and 2q18
        continue
        
    print("====================== working on =====================")
    print(manuscript_name)
    
    # Load data
    with open(f"../data/{manuscript_file}") as f:
        manuscript = f.read()
        
    parsed = BeautifulSoup(manuscript, features="xml")
    
    if not content.get(manuscript_name):
        content[manuscript_name] = {}
    
    for link in parsed.find_all():
    
        if link.name == "chap":
            current_chapter = re.findall(r'\d+', link.text)[-1].strip()

            if not content.get(current_chapter):
                content[manuscript_name][current_chapter] = {}

        elif link.name == "text" and link.text:
            tags = [children.name for children in link]
            if "verse_nb" in tags:
                verse_nbr = link.verse_nb.text.strip()
                text_content = link.find_all(text=True, recursive=False)
                # Join text in reverse order
                text =' '.join(text_content).strip()
                content[manuscript_name][current_chapter][verse_nbr] = text
            elif verse_nbr:
                content[manuscript_name][current_chapter][verse_nbr] += ' '.join(link.find_all(text=True, recursive=False))

masada
11QPsa
a
f
d
e
b


In [46]:
TOTAL_CHAPTERS = 51

collated_content = {}
for chapter in range(TOTAL_CHAPTERS):
    
    collated_content[str(chapter)] = {}
    
    for manuscript_id in content.keys():
        
        if content[manuscript_id].get(str(chapter)):
            
            for verse_nb, verse_content in content[manuscript_id][str(chapter)].items():
                
                if not collated_content[str(chapter)].get(verse_nb):
                    collated_content[str(chapter)][verse_nb] = {manuscript_id: verse_content}
                else:
                    collated_content[str(chapter)][verse_nb][manuscript_id] = verse_content
            

In [117]:
nbr_overlap = 0
variants = {}


for chapter in collated_content.keys():
    for verses in collated_content[chapter].keys():
        
        if len(collated_content[chapter][verses].keys()) > 1 and verses:
            
            manuscripts_to_collate = collated_content[chapter][verses]
            
            nbr_overlap += 1
            collation = Collation()

            print(f"======= chapter {chapter} ==========")
            print(f"======= verse {verses} ==========")

            for witness_name, witness_content in manuscripts_to_collate.items():
                if witness_content:
                    collation.add_plain_witness(witness_name, witness_content.replace("\u030a", "").replace("\u0307", ""))
            alignment_table = collate(collation)#, output="html2",  layout="vertical")

            if not variants.get(chapter):
                variants[chapter] = {}
            variants[chapter][verses] = {}
            for column in alignment_table.columns:
                if column.variant:
                    # Add to variants the tokens as strings       
                    for manuscript, tokens in column.tokens_per_witness.items():
                        token_strings = [token.token_string for token in tokens]
                        variants[chapter][verses].update({manuscript: " ".join(token_strings)})





In [221]:
# Store the variants per witness to begin variant classification
# Remove variants with fragmentation
variants_no_fragmentation = {}
for chapter, verses in variants.items():
    if not variants_no_fragmentation.get(chapter):
        variants_no_fragmentation[chapter] = {}
    for verse_nbr, variant in verses.items():
        # Merge variants
        merged_variants = " ".join(variant.values())
        if not ("[" in merged_variants or "]" in merged_variants or "׃" in merged_variants or " ̶" in merged_variants):
            if variant:
                variants_no_fragmentation[chapter][verse_nbr] = variant
            
# Re-loop in case of empty dictionary
variants_no_fragmentation = {key: value for key, value in variants_no_fragmentation.items() if value}

In [223]:
print("==============================")
print("==============================")
print("These are variants not related to fragmentation ")
print("==================================")
print("==================================")

variants_no_fragmentation

These are variants not related to fragmentation 


{'7': {'21': {'a': 'ל', 'd': 'פש אל'},
  '25': {'a': 'חברﬣ', 'd': 'חברה'},
  '31': {'a': 'צוותﬣ', 'd': 'צויתה'},
  '34': {'a': 'אבלים', 'd': 'אבילים'}},
 '8': {'5': {'a': 'ש', 'd': 'אי'},
  '6': {'a': 'נמנﬣ', 'd': 'נמנה'},
  '11': {'a': 'להושיבו כאורב לפניך', 'd': 'פניך'},
  '13': {'a': 'א \u200d ל', 'd': 'אל'},
  '15': {'a': 'א \u200d ל', 'd': 'פן'}},
 '10': {'31': {'a': 'נכבד', 'b': 'בעשרו'}},
 '11': {'3': {'a': 'דברה', 'b': 'דבורה'}, '9': {'a': 'א \u200d ל', 'b': 'אל'}},
 '15': {'1': {'a': 'י י', 'b': 'ייי'},
  '2': {'a': 'וקדמתﬣו כאם', 'b': 'וקדמתהו כ ּ אם'},
  '12': {'a': 'צורך', 'b': 'לי ח ֟ פץ'},
  '16': {'a': 'שלח', 'b': 'תשלח'},
  '17': {'a': 'ומוות אשר יחפץ ינתן', 'b': 'ומות וכל שיחפץ ית ֟ ן'},
  '19': {'a': 'איש', 'b': 'אנוש'},
  '20': {'a': 'החלים אנשי', 'b': 'למד שקרים לאנשי'}},
 '16': {'1': {'a': 'בבני', 'b': 'על בני'},
  '2': {'a': 'י י', 'b': 'ייי'},
  '4': {'a': 'וממשפחת בגדים', 'b': 'וממשפחות בוגדים'},
  '5': {'a': 'ועצ ֻ מו ֹ ת ֿ', 'b': 'ועצומות'},
  '6': {'a': 'חמﬣ'