In [None]:
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
import torch._utils
from transformers import pipeline

# Raise the "transformers.modeling_utils" logger to ERROR level so that its
# warning messages are not shown when the model is loaded.
import logging
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)


# Cache of (tokenizer, model) pairs keyed by model name, so repeated calls
# (e.g. one per paragraph) do not reload the pre-trained weights every time.
_EMBEDDING_MODEL_CACHE = {}


def _load_tokenizer_and_model(model_name):
    """Return the cached (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        _EMBEDDING_MODEL_CACHE[model_name] = (tokenizer, model)
    return _EMBEDDING_MODEL_CACHE[model_name]


def get_text_embedding(text, model_name='bert-base-uncased'):
    """Embed ``text`` by mean-pooling the model's last hidden state.

    Args:
        text: A string, or a list of strings, passed straight to the tokenizer.
        model_name: Name or path of the pre-trained model to load with
            ``AutoTokenizer`` / ``AutoModel``. The loaded pair is cached per
            name, so only the first call for a given name pays the load cost.

    Returns:
        A ``torch.Tensor`` holding the average of the last-layer token vectors
        (not the CLS vector), with size-1 dimensions squeezed away. Padding
        positions, as marked by the tokenizer's attention mask, are excluded
        from the average so that batched inputs of different lengths are not
        skewed by their padding.
    """
    tokenizer, model = _load_tokenizer_and_model(model_name)

    # Tokenize input text and convert to PyTorch tensors
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state

    # Weight every token vector by its attention-mask entry (1 = real token,
    # 0 = padding) before averaging over the sequence dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # Guard against division by zero should a row contain no real tokens.
    counts = mask.sum(dim=1).clamp(min=1)
    embedding = (summed / counts).squeeze()

    return embedding


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_paragraphs(query_embedding, paragraph_embeddings, paragraphs, k=5):
    """Rank paragraphs by cosine similarity to a query and return the best ``k``.

    Args:
        query_embedding: Embedding of the query; reshaped to a single row.
        paragraph_embeddings: Iterable of embeddings, one per paragraph, in
            the same order as ``paragraphs``.
        paragraphs: Paragraph texts indexed in parallel with the embeddings.
        k: Maximum number of results to return.

    Returns:
        A list of at most ``k`` dicts ``{"content": ..., "score": ...}``,
        ordered from most to least similar. ``score`` is the unmodified
        return value of ``cosine_similarity`` for the pair of single-row
        inputs (with scikit-learn this is a 1x1 array, not a plain float).
    """
    # One pairwise similarity per paragraph embedding
    scores = []
    for candidate in paragraph_embeddings:
        pair_score = cosine_similarity(
            query_embedding.reshape(1, -1),
            candidate.reshape(1, -1),
        )
        scores.append(pair_score)

    # Order paragraph indices from highest to lowest score and keep the first k
    ranking = list(range(len(scores)))
    ranking.sort(key=lambda idx: scores[idx], reverse=True)
    best = ranking[:k]

    results = []
    for idx in best:
        results.append({"content": paragraphs[idx], "score": scores[idx]})
    return results




In [None]:
# Read the whole source document into memory.
path = 'Documents/02450_w_form.txt'
with open(path, "r", encoding='utf-8') as file:
    input_text = file.read()

# Split on blank lines and keep only the chunks longer than 50 characters.
paragraphs = [chunk for chunk in input_text.split("\n\n") if len(chunk) > 50]

# Replace newlines, tabs and "- " sequences with a single space, in that order.
paragraphs = [
    chunk.replace("\n", " ").replace("\t", " ").replace("- ", " ")
    for chunk in paragraphs
]

def add_paragraphs(paragraphs, min_length=700, separator=" "):
    """Merge consecutive paragraphs until each one reaches ``min_length`` characters.

    Paragraphs are accumulated left to right: following paragraphs are appended
    to the current one while it is shorter than ``min_length``. The final
    paragraph may still be shorter, since nothing follows it to merge with.

    Args:
        paragraphs: List of paragraph strings. It is modified in place.
        min_length: Length (in characters) a paragraph must reach before the
            next paragraph starts a new entry. Defaults to 700.
        separator: Text inserted between merged paragraphs so that the last
            word of one and the first word of the next are not glued together.

    Returns:
        The same list object that was passed in, now holding the merged
        paragraphs.
    """
    merged = []
    parts = []   # pieces of the paragraph currently being built
    length = 0   # length of separator.join(parts)

    for paragraph in paragraphs:
        # The paragraph under construction is long enough: close it.
        if parts and length >= min_length:
            merged.append(separator.join(parts))
            parts = []
            length = 0
        if parts:
            length += len(separator)
        parts.append(paragraph)
        length += len(paragraph)

    if parts:
        merged.append(separator.join(parts))

    # Slice assignment keeps the in-place contract without popping from the
    # middle of the list on every merge.
    paragraphs[:] = merged
    return paragraphs


# Merge short paragraphs together before embedding them.
paragraphs = add_paragraphs(paragraphs)

# Compute one embedding per paragraph using get_text_embedding's default model.
embeddings = list(map(get_text_embedding, paragraphs))
