In [None]:
import json
import os
import random

import scipy
import scipy.spatial.distance

# --- Inputs ---------------------------------------------------------------
# Directory holding the encoding JSON files read by the analysis.
ENCODINGS_DIRECTORY = '00_data/encoding_results'
# JSON file with the query-sentence encodings.
QUERY_ENCODINGS = '00_data/encoding_results/all-mpnet-base-v2_query_sentences.json'
# Lazy, single-use iterator over the names of all regular ``.json`` files
# directly inside ENCODINGS_DIRECTORY (the directory is listed right here,
# the per-entry filtering happens while iterating).
REPORT_ENCODINGS = (
    entry
    for entry in os.listdir(ENCODINGS_DIRECTORY)
    if entry.endswith(".json")
    and os.path.isfile(os.path.join(ENCODINGS_DIRECTORY, entry))
)

# --- Output ---------------------------------------------------------------
RESULTS_DIRECTORY = '00_data/encoding_based_similarity_analysis_results'

# --- Analysis parameters --------------------------------------------------
# Only files whose second '_'-separated name part equals this are analysed.
PARSING_METHOD = 'pdfminer'
# A sentence is reported when its similarity score is strictly above this.
THRESHOLD = 0.65
# Fraction of each topic's query sentences that is randomly kept.
SAMPLE = 1
         
def flatten_report_encodings(report_encodings):
    """Flatten a report's nested encodings into two parallel lists.

    Args:
        report_encodings: Mapping ``paragraph_key -> {sentence_key: vector}``.

    Returns:
        A ``(sentence_ids, sentence_vectors)`` pair where ``sentence_ids[i]``
        is the ``(paragraph_key, sentence_key)`` tuple belonging to
        ``sentence_vectors[i]``; the mapping's iteration order is preserved.
    """
    sentence_ids = []
    sentence_vectors = []
    for paragraph_key, paragraph_values in report_encodings.items():
        for sentence_key, sentence_value in paragraph_values.items():
            sentence_ids.append((paragraph_key, sentence_key))
            sentence_vectors.append(sentence_value)
    return sentence_ids, sentence_vectors


def find_similar_sentences(query_vectors, sentence_ids, sentence_vectors, threshold):
    """Return the report sentences that are similar enough to any query.

    The score is the cosine similarity (``1 - cosine distance``). All
    query/sentence pairs are scored with a single ``cdist`` call instead of
    one call per pair.

    Args:
        query_vectors: Sequence of query encodings (equal-length vectors).
        sentence_ids: ``(paragraph_key, sentence_key)`` tuple per sentence.
        sentence_vectors: Sentence encodings, parallel to ``sentence_ids``.
        threshold: A pair is reported only if its score is strictly greater.

    Returns:
        A list of ``(paragraph_key, sentence_key, score)`` tuples, ordered by
        query first and by sentence second. A sentence is listed once for
        every query it matches.
    """
    # cdist requires non-empty 2-D inputs; nothing to compare means no hits.
    if len(query_vectors) == 0 or len(sentence_vectors) == 0:
        return []

    # scores[i, j] = similarity between query i and sentence j.
    scores = 1.0 - scipy.spatial.distance.cdist(query_vectors, sentence_vectors, "cosine")

    # nonzero() yields the matching indices in row-major order, i.e. the same
    # order a "for query: for sentence:" double loop would visit them.
    query_indices, sentence_indices = (scores > threshold).nonzero()
    return [
        (*sentence_ids[col], float(scores[row, col]))
        for row, col in zip(query_indices, sentence_indices)
    ]


# Notebook cells and direct script runs both execute as "__main__", so the
# analysis still runs there and its variables stay module-level.
if __name__ == "__main__":
    result = {}

    print("Start similarity analysis...")

    with open(QUERY_ENCODINGS) as f:
        query_encodings = json.load(f)

    # Keep a random fraction (SAMPLE) of every topic's query sentences.
    # random.sample returns the picks in random order, even for SAMPLE = 1.
    query_encodings = {
        topic: random.sample(topic_values, int(len(topic_values) * SAMPLE))
        for topic, topic_values in query_encodings.items()
    }

    for file in REPORT_ENCODINGS:
        # The name is read as <prefix>_<parsing method>_<company>...; names
        # with fewer parts or another parsing method are skipped instead of
        # raising IndexError.
        name_parts = file.split('_')
        if len(name_parts) < 3 or name_parts[1] != PARSING_METHOD:
            continue
        company = name_parts[2]

        company_result = result.setdefault(company, {})

        print(f"... for {company} report")

        # Close the file as soon as it is parsed rather than holding it open
        # for the whole comparison.
        with open(os.path.join(ENCODINGS_DIRECTORY, file)) as f:
            company_report_encodings = json.load(f)

        # Flatten once per report; the result is reused for every topic.
        sentence_ids, sentence_vectors = flatten_report_encodings(company_report_encodings)

        for topic, topic_values in query_encodings.items():
            company_result.setdefault(topic, []).extend(
                find_similar_sentences(topic_values, sentence_ids, sentence_vectors, THRESHOLD)
            )

    # Create the output directory up front so the results are not lost to a
    # missing folder after the whole analysis has run.
    os.makedirs(RESULTS_DIRECTORY, exist_ok=True)
    output_path = f"{RESULTS_DIRECTORY}/{PARSING_METHOD}_found_sentences_th={THRESHOLD}_sample={SAMPLE}.json"
    with open(output_path, 'w') as f:
        json.dump(result, f, indent=4)

    print(f"Done. Results saved in {RESULTS_DIRECTORY}")
    