# Match validation data with parsed reports

In [None]:
import json
import os
from Levenshtein import distance

## Load expert selections and parsing results

In [None]:
VALIDATION_DIRECTORY = '00_data/validation_data'

# Load the validation paragraphs from the JSON file.
with open(os.path.join(VALIDATION_DIRECTORY, 'validation_paragraphs.json'), encoding="utf-8") as f:
    val = json.load(f)

PARSED_DIRECTORY = '00_data/parsing_results'

# All regular *.json files in the parsing results directory.
# Materialized as a sorted list (instead of a one-shot generator) so the name
# can be iterated more than once and the load order does not depend on the
# arbitrary order returned by os.listdir.
PARSING_RESULTS = sorted(
    file for file in os.listdir(PARSED_DIRECTORY)
    if file.endswith(".json") and os.path.isfile(os.path.join(PARSED_DIRECTORY, file))
)

# Load all parsed reports into one nested dictionary:
# pars[parsing_method][company] -> parsed content of the corresponding file.
pars = {}
for file in PARSING_RESULTS:
    # The first two '_'-separated parts of the file name are used as the
    # parsing method and the company key; a name without '_' raises IndexError.
    name_parts = file.split('_')
    parsing_method = name_parts[0]
    company = name_parts[1]

    # Read with an explicit encoding, consistent with the validation file above,
    # so decoding does not depend on the platform's default locale.
    with open(os.path.join(PARSED_DIRECTORY, file), encoding="utf-8") as f:
        parsed_text = json.load(f)

    pars.setdefault(parsing_method, {})[company] = parsed_text

In [None]:
# parsing_method specifies which parsing results are matched (pdfminer or easyocr)
parsing_method = 'pdfminer'

# Minimum length of a parsed sentence to be considered for matching
min_sentence_length = 15

# Maximum normalized Levenshtein distance between a parsed sentence and a
# substring of a validation paragraph for the two to count as a match.
# The distance is normalized as (Levenshtein distance) / (length of parsed sentence).
max_dist_threshold = 0.05


def _min_normalized_distance(sentence, paragraph, stop_at=0.0):
    """Return the smallest normalized Levenshtein distance between `sentence`
    and any window of `paragraph` that has the same length as `sentence`.

    The distance of each window is divided by len(sentence). The result is 1
    when no window exists (the sentence is longer than the paragraph) or when
    the sentence is empty. The scan stops early as soon as a window with a
    distance <= `stop_at` is found, because callers only test the result
    against that threshold.
    """
    sent_len = len(sentence)
    best = 1  # maximum possible normalized distance

    # An empty sentence cannot be normalized (division by zero).
    if sent_len == 0:
        return best

    # Slide a window of len(sentence) over the paragraph. The "+ 1" includes
    # the final window that ends exactly at the end of the paragraph (and the
    # case where sentence and paragraph have equal length).
    for start in range(len(paragraph) - sent_len + 1):
        window = paragraph[start:start + sent_len]
        dist = distance(sentence, window) / sent_len
        if dist < best:
            best = dist
            if best <= stop_at:
                break
    return best


# relevant_sentences will contain all sentences that were matched:
# relevant_sentences[company][topic] -> list of (paragraph index, sentence index)
relevant_sentences = {}

# Match data for all reports and all validation data
for company, topics in val.items():

    relevant_sentences[company] = {}
    # Raises KeyError if there is no parsed report for this company and method.
    report_text = pars[parsing_method][company]

    # Iterate over all validation data
    for topic, val_paragraphs in topics.items():
        matches = relevant_sentences[company][topic] = []

        # Iterate over each validation paragraph
        for val_paragraph in val_paragraphs:

            # Compare against each sentence of the parsed report
            for rep_par_idx, rep_par_sentences in report_text.items():
                for rep_par_sent_idx, rep_par_sent in rep_par_sentences.items():

                    # Skip sentences that are too short to be matched reliably
                    if len(rep_par_sent) < min_sentence_length:
                        continue

                    min_sim = _min_normalized_distance(rep_par_sent, val_paragraph, max_dist_threshold)

                    # If any window is within the threshold, record the sentence as matched
                    if min_sim <= max_dist_threshold:
                        matches.append((rep_par_idx, rep_par_sent_idx))

# Save results to json file
with open('{}/{}_{}.json'.format(VALIDATION_DIRECTORY, parsing_method, 'matched_sentences'), 'w', encoding="utf-8") as f:
    json.dump(relevant_sentences, f, indent=4)