# Finding the most likely ancestor for a derivative (descendat) file and quantifying the degree of modification

In [4]:
from processors import DocProcessor
import os

In [11]:
template_dir = '/media/mm/DEXP C100/User/Earn/Translate/Thai/Translit/Marriage Starodub/templates'
output_dir = '/media/mm/DEXP C100/User/Earn/Translate/Thai/Translit/Marriage Starodub/out'

In [16]:
def find_best_match(template: os.DirEntry, derivatives: list[os.DirEntry]):
    best_match = None
    best_score = 0
    template_docx = DocProcessor(template.path)
    template_set = set(template_docx.paragraphs)
    for derivative in derivatives:
        docx = DocProcessor(derivative.path)
        docx_set = set(docx.paragraphs)
        score = len(template_set & docx_set) / len(template_set | docx_set)
        if score > best_score:
            best_match = derivative
            best_score = score
    return best_match, best_score

In [17]:
templates = os.scandir(template_dir)
derivatives = [entry for entry in os.scandir(output_dir) if entry.name.endswith('.docx')]

for template in templates:
    print(template.name)
    best_match, score = find_best_match(template, derivatives)
    print(best_match.name, score if best_match else 'No match')

page_1.docx
PHOTO-2025-11-17-20-01-57.jpg.RU.docx 0.22580645161290322
page_2.docx
PHOTO-2025-11-17-20-01-58-1.jpg.RU.docx 0.33783783783783783
page_3.docx
PHOTO-2025-11-17-20-01-58-2.jpg.RU.docx 0.1


In [None]:
# TODO: consider integrating all this into the Processors
class DescendantAnalyzer:
    def __init__(self, docx: os.DirEntry):
        self.docx = docx
        self.document = DocProcessor(docx.path)

    def find_ancestor(self, ancestors: list[os.DirEntry], threshold: float = 0):
        self.ancestor = None
        best_score = 0
        descendat_pars = set(self.document.paragraphs)
        for ancestor in ancestors:
            ancestor_document = DocProcessor(ancestor.path)
            ancestor_pars = set(ancestor_document.paragraphs)
            score = len(ancestor_pars & descendat_pars) / len(descendat_pars | descendat_pars)
            if score > best_score and score > threshold:
                self.ancestor = ancestor
                best_score = score
                self.jaccard_score = score
                self.ancestor_document = ancestor_document

    def get_new_pars(self):
        self.new_pars = set(self.document.paragraphs) - set(self.ancestor_document.paragraphs
            ) if self.ancestor is not None else set(self.document.paragraphs)

    def get_word_counts(self):
        self.added_word_count = sum([len(par.split()) for par in self.new_pars])
        self.total_word_count = sum([len(par.split()) for par in self.document.paragraphs])

    def get_cost(self, rate: float):
        self.get_new_pars()
        self.get_word_counts()
        self.cost = self.added_word_count * rate + (self.total_word_count - self.added_word_count) * rate * 0.1

In [41]:
ancestors_dir = '/media/mm/DEXP C100/User/Earn/Translate/Thai/Translit/Marriage Starodub/templates'
descendants_dir = '/media/mm/DEXP C100/User/Earn/Translate/Thai/Translit/Marriage Starodub/out'
ancestors = [entry for entry in os.scandir(ancestors_dir) if entry.name.endswith('.docx')]
descendants = [entry for entry in os.scandir(descendants_dir) if entry.name.endswith('.docx')]

In [42]:
for descendant in descendants:
    print(descendant.name)
    analyzer = DescendantAnalyzer(descendant)
    analyzer.find_ancestor(ancestors)
    if analyzer.ancestor:
        print(analyzer.ancestor.name, analyzer.jaccard_score)
        analyzer.get_new_pars()
        print(len(analyzer.new_pars))
        analyzer.get_word_counts()
        print(analyzer.total_word_count, analyzer.added_word_count)
        analyzer.get_cost(2.5)
        print(analyzer.cost)
        print()


PHOTO-2025-11-17-20-01-57.jpg.RU.docx
page_1.docx 0.35
13
67 46
120.25

PHOTO-2025-11-17-20-01-58-1.jpg.RU.docx
page_2.docx 0.5
25
182 119
313.25

PHOTO-2025-11-17-20-01-58-2.jpg.RU.docx
page_3.docx 0.2857142857142857
10
269 234
593.75

PHOTO-2025-11-17-20-01-58-3.jpg.RU.docx
page_2.docx 0.15151515151515152
28
129 122
306.75

PHOTO-2025-11-17-20-01-58.jpg.RU.docx
page_1.docx 0.0625
15
51 50
125.25



In [46]:
analyzers = [DescendantAnalyzer(descendant) for descendant in descendants]

In [47]:
for analyzer in analyzers:
    print(analyzer.docx.name)
    analyzer.find_ancestor(ancestors, threshold=0.2)
    if analyzer.ancestor:
        print(analyzer.ancestor.name, analyzer.jaccard_score)
        analyzer.get_new_pars()
        print(len(analyzer.new_pars))
        analyzer.get_word_counts()
        print(analyzer.total_word_count, analyzer.added_word_count)
        analyzer.get_cost(2.5)
        print(analyzer.cost)
    else:
        print('No apparent ancestor found')
    print()


PHOTO-2025-11-17-20-01-57.jpg.RU.docx
page_1.docx 0.35
13
67 46
120.25

PHOTO-2025-11-17-20-01-58-1.jpg.RU.docx
page_2.docx 0.5
25
182 119
313.25

PHOTO-2025-11-17-20-01-58-2.jpg.RU.docx
page_3.docx 0.2857142857142857
10
269 234
593.75

PHOTO-2025-11-17-20-01-58-3.jpg.RU.docx
No apparent ancestor found

PHOTO-2025-11-17-20-01-58.jpg.RU.docx
No apparent ancestor found



In [48]:
label = filter(lambda x: x.docx.name == 'PHOTO-2025-11-17-20-01-58.jpg.RU.docx', analyzers).__next__()
label.docx.name

'PHOTO-2025-11-17-20-01-58.jpg.RU.docx'

In [49]:
label.get_cost(rate=1.5)
label.cost

76.5

In [50]:
last_page = filter(lambda x: x.docx.name == 'PHOTO-2025-11-17-20-01-58-3.jpg.RU.docx', analyzers).__next__()
last_page.docx.name

'PHOTO-2025-11-17-20-01-58-3.jpg.RU.docx'

In [52]:
last_page.find_ancestor([descendant for descendant in descendants if descendant.name != last_page.docx.name])
last_page.ancestor.name, last_page.jaccard_score

('PHOTO-2025-11-17-20-01-58-1.jpg.RU.docx', 0.48484848484848486)

In [54]:
last_page.get_cost(rate=2.5)
last_page.cost

212.25

In [58]:
for analyzer in analyzers:
    print(analyzer.docx.name, analyzer.cost)

PHOTO-2025-11-17-20-01-57.jpg.RU.docx 120.25
PHOTO-2025-11-17-20-01-58-1.jpg.RU.docx 313.25
PHOTO-2025-11-17-20-01-58-2.jpg.RU.docx 593.75
PHOTO-2025-11-17-20-01-58-3.jpg.RU.docx 212.25
PHOTO-2025-11-17-20-01-58.jpg.RU.docx 76.5


In [59]:
sum(analyzer.cost for analyzer in analyzers)

1316.0