# Similarity Analysis using Sentence Transformer Encodings

In [1]:
import json
import os
import random

import scipy
import scipy.spatial.distance

# Directory that holds the JSON encoding files.
ENCODINGS_DIRECTORY = '00_data/encoding_results'

# Names of all JSON files in ENCODINGS_DIRECTORY.
# Materialised as a sorted list (instead of a one-shot generator) so that it can
# be iterated more than once and so that the processing order does not depend on
# the order in which os.listdir happens to return the entries.
# NOTE: QUERY_ENCODINGS lives in the same directory, so its file name is part of
# this list as well; consumers have to filter by file name.
REPORT_ENCODINGS = sorted(
    file
    for file in os.listdir(ENCODINGS_DIRECTORY)
    if file.endswith(".json") and os.path.isfile(os.path.join(ENCODINGS_DIRECTORY, file))
)

# JSON file with the encodings of the topical (query) sentences.
QUERY_ENCODINGS = '00_data/encoding_results/all-mpnet-base-v2_query_sentences.json'

# Directory the analysis results are written to.
RESULTS_DIRECTORY = '00_data/encoding_based_similarity_analysis_results'

# Only report files whose name carries this parsing method (pdfminer or easyocr) are analysed.
PARSING_METHOD = 'pdfminer'

# THRESHOLD defines how large the cosine similarity between a report sentence and a
# topical sentence must be for the report sentence to be deemed relevant.
THRESHOLD = 0.65

# SAMPLE is the fraction of topical sentences to sample per topic;
# 1 means that all topical sentences are used.
# TODO: This repository does not yet contain an analysis of sampled topical sentences
SAMPLE = 1

# result is a dictionary that will contain all sentences classified as relevant
result = {}

print("Start similarity analysis...")

# Load topical sentences: a mapping of topic -> list of encoding vectors.
with open(QUERY_ENCODINGS, encoding='utf-8') as f:
    query_encodings = json.load(f)

# Sample topical sentences. random.sample also permutes the vectors, so even
# with SAMPLE = 1 the order of the topical sentences is randomised.
query_encodings = {
    topic: random.sample(topic_values, int(len(topic_values) * SAMPLE))
    for topic, topic_values in query_encodings.items()
}

# Process all encoded reports
for file in REPORT_ENCODINGS:

    # Only use reports with the specified parsing method (pdfminer or easyocr).
    # The parsing method is the second and the company the third '_'-separated
    # part of the file name; files that do not have that many parts are skipped
    # instead of aborting the whole run with an IndexError.
    name_parts = file.split('_')
    if len(name_parts) < 3 or name_parts[1] != PARSING_METHOD:
        continue
    company = name_parts[2]

    # Add an empty dictionary to result for each company (kept if it already exists)
    company_result = result.setdefault(company, {})

    print(f"... for {company} report")

    # Load encodings of the report; the file is closed again before the
    # (potentially long-running) similarity computation starts.
    with open(os.path.join(ENCODINGS_DIRECTORY, file), encoding='utf-8') as f:
        company_report_encodings = json.load(f)

    # Flatten paragraph -> sentence -> vector into two parallel lists so that
    # all sentences of the report can be compared in a single cdist call.
    sentence_keys = []
    sentence_vectors = []
    for paragraph_key, paragraph_values in company_report_encodings.items():
        for sentence_key, sentence_value in paragraph_values.items():
            sentence_keys.append((paragraph_key, sentence_key))
            sentence_vectors.append(sentence_value)

    # For each topic, calculate cosine similarity
    for topic, topic_values in query_encodings.items():

        # Add list to result dictionary for each topic
        matches = company_result.setdefault(topic, [])

        # cdist needs non-empty 2-D inputs; with nothing to compare there is
        # nothing to add for this topic.
        if not topic_values or not sentence_vectors:
            continue

        # One row per topical sentence, one column per report sentence.
        distances = scipy.spatial.distance.cdist(topic_values, sentence_vectors, "cosine")

        # Walk the matrix row by row so matches are appended in the order
        # (topical sentence, paragraph, sentence).
        for row in distances:
            for (paragraph_key, sentence_key), distance in zip(sentence_keys, row):

                # Add sentence to results if cosine similarity is above threshold
                score = 1 - distance
                if score > THRESHOLD:
                    matches.append((paragraph_key, sentence_key, score))

# Save results to a JSON file; create the target directory first so the run
# does not fail at the very end when the directory is missing.
os.makedirs(RESULTS_DIRECTORY, exist_ok=True)
with open('{}/{}_{}_th={}_sample={}.json'.format(RESULTS_DIRECTORY, PARSING_METHOD, 'found_sentences', THRESHOLD, SAMPLE), 'w', encoding='utf-8') as f:
    json.dump(result, f, indent=4)

print(f"Done. Results saved in {RESULTS_DIRECTORY}")
    

Start similarity analysis...
... for Arbonia report
... for Firmenich report
... for Nestle report
Done. Results saved in 00_data/encoding_based_similarity_analysis_results
