# Encode with SentenceTransformers

In [1]:
import re
import os
import json

# Load the all-mpnet-base-v2 model using the sentence-transformers library
from sentence_transformers import SentenceTransformer
# Name of the pretrained model; it is also used as the prefix of the output file names
MODEL = 'all-mpnet-base-v2'
# Build the encoder from MODEL so the loaded model and the file-name prefix cannot drift apart
ENCODER = SentenceTransformer(MODEL)

def encode(input):
    """Return the embeddings computed by ``ENCODER`` for ``input``.

    The call is forwarded to ``ENCODER.encode`` with the progress bar
    switched on, and its result is handed back unchanged.
    """
    embeddings = ENCODER.encode(input, show_progress_bar=True)
    return embeddings

In [2]:
# Directory that is scanned for parsed reports
PARSED_DIRECTORY = '00_data/parsing_results'
# Directory the encodings are written to
NLP_RESULTS_DIRECTORY = '00_data/encoding_results'
# Match the complete '.json' extension (dot included): a bare 'json' suffix
# test would also accept names such as 'notesjson', which are not JSON
# reports and do not carry the 5-character suffix stripped when saving.
REPORTS = [f for f in os.listdir(PARSED_DIRECTORY) if f.endswith('.json')]

## Encoding reports

In [3]:
# Encode each sentence of the parsed reports

# Create the output directory up front so that saving cannot fail on a
# missing directory after the encoding work has already been done
os.makedirs(NLP_RESULTS_DIRECTORY, exist_ok=True)

for file in REPORTS:
    # Read parsed report from json file (explicit encoding instead of the
    # platform-dependent default)
    with open('{}/{}'.format(PARSED_DIRECTORY, file), encoding='utf-8') as f:
        parsed_text = json.load(f)

    # To speed up the process, all sentences of a report are collected in one
    # list and encoded with a single call

    # keys contains the paragraph and sentence id
    keys = []
    # values contains the sentences to be encoded
    values = []

    # Start with one entry per paragraph so that the result keeps the same
    # structure as the parsed report, even for paragraphs without sentences
    encodings = {paragraph_key: {} for paragraph_key in parsed_text}

    # Add sentences and keys to respective lists
    for paragraph_key, paragraph_value in parsed_text.items():
        for sentence_key, sentence_value in paragraph_value.items():
            keys.append((paragraph_key, sentence_key))
            values.append(sentence_value)

    # Encode sentences; a report without any sentence skips the encoder call
    if values:
        values = encode(values)

    # Put each encoding back under its paragraph id / sentence id
    # (.tolist() turns the encoding into a plain list that json can serialize)
    for (paragraph_key, sentence_key), value in zip(keys, values):
        encodings[paragraph_key][sentence_key] = value.tolist()

    # Save dictionary to json file, named <MODEL>_<report name without extension>.json
    report_name = os.path.splitext(file)[0]
    with open('{}/{}_{}.json'.format(NLP_RESULTS_DIRECTORY, MODEL, report_name), 'w') as f:
        json.dump(encodings, f, indent=4)


Batches:   0%|          | 0/50 [00:00<?, ?it/s]

Batches:   0%|          | 0/89 [00:00<?, ?it/s]

Batches:   0%|          | 0/213 [00:00<?, ?it/s]

Batches:   0%|          | 0/140 [00:00<?, ?it/s]

## Encode topical sentences (queries)

In [4]:
# Location of the json file holding the query sentences
QUERIES_PATH = '00_data/queries'
FILENAME = 'query_sentences.json'
# Explicit encoding instead of the platform-dependent default
with open('{}/{}'.format(QUERIES_PATH, FILENAME), encoding='utf-8') as f:
    queries = json.load(f)

# Maps each key of the queries file to the encodings of its sentences
encodings = {}

for key, value in queries.items():
    # Encode all sentences stored under this key with a single call
    sentences = encode(list(value))

    # .tolist() turns the encodings into plain lists that json can serialize
    encodings[key] = sentences.tolist()

# Make sure the output directory exists before writing
os.makedirs(NLP_RESULTS_DIRECTORY, exist_ok=True)
with open('{}/{}_{}.json'.format(NLP_RESULTS_DIRECTORY, MODEL, FILENAME[:-5]), 'w') as f:
    json.dump(encodings, f, indent=4)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]