In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

# Suppress the warning message that transformers logs when loading the model
import logging
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)



# Loaded (tokenizer, model) pairs keyed by model name, so the pre-trained
# weights are loaded once per kernel instead of on every call.
_MODEL_CACHE = {}


def get_text_embedding(text, model_name='bert-base-uncased'):
    """Return a mean-pooled embedding of ``text`` from a pre-trained model.

    The embedding is the average of the last hidden layer over the real
    (non-padding) token positions. Despite what the original comment said,
    this is mean pooling, not the CLS token.

    Parameters
    ----------
    text : str or list of str
        Text to embed. Whatever is passed is forwarded to the tokenizer
        unchanged.
    model_name : str
        Name passed to ``AutoTokenizer.from_pretrained`` and
        ``AutoModel.from_pretrained``.

    Returns
    -------
    torch.Tensor
        The pooled hidden states with all size-1 dimensions squeezed out
        (so a single string yields a 1-D tensor).
    """
    # Load the tokenizer and model only the first time a model name is seen
    if model_name not in _MODEL_CACHE:
        _MODEL_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    tokenizer, model = _MODEL_CACHE[model_name]

    # Tokenize input text and convert to PyTorch tensors
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    # Get output from pre-trained model (no gradients needed for inference)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    attention_mask = inputs.get('attention_mask')

    if attention_mask is None:
        # No mask available: fall back to a plain mean over all positions
        pooled = hidden_states.mean(dim=1)
    else:
        # Average only over real tokens so padding does not skew the result
        # when several texts of different lengths are embedded together.
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        pooled = (hidden_states * mask).sum(dim=1) / token_counts

    embedding = pooled.squeeze()

    return embedding

In [2]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_paragraphs(query_embedding, paragraph_embeddings, paragraphs, k=5):
    """Rank paragraphs by cosine similarity to a query embedding.

    Parameters
    ----------
    query_embedding : array-like
        Embedding of the query; flattened to a single row.
    paragraph_embeddings : iterable of array-like
        One embedding per paragraph, in the same order as ``paragraphs``.
    paragraphs : sequence
        The paragraph contents, indexed in parallel with
        ``paragraph_embeddings``.
    k : int
        Maximum number of results to return.

    Returns
    -------
    list of dict
        Up to ``k`` dictionaries ``{"content": ..., "score": ...}`` ordered
        from most to least similar. ``score`` is a plain Python float.
        An empty list is returned when there are no paragraph embeddings.
    """
    # Flatten every embedding to one row so they can be stacked into a matrix
    rows = [np.asarray(embedding).reshape(1, -1) for embedding in paragraph_embeddings]
    if not rows:
        return []

    query_row = np.asarray(query_embedding).reshape(1, -1)

    # One vectorised call gives a (1, n_paragraphs) matrix; take its single row
    # and convert to Python floats so scores are scalars, not 1x1 arrays.
    similarities = cosine_similarity(query_row, np.vstack(rows))[0].tolist()

    # Get the indices of the top k paragraphs based on their similarity scores
    top_indices = sorted(range(len(similarities)), key=similarities.__getitem__, reverse=True)[:k]

    # Return the top k paragraphs and their similarity scores as a list of dictionaries
    return [{"content": paragraphs[i], "score": similarities[i]} for i in top_indices]




In [3]:
# Open the example text file
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this directory exists. Consider making it configurable.
path="C:/Users/farim/PycharmProjects/ChatGPT_Tutor_Project/Documents/"
file_name="text_example.txt"

with open(path+file_name, "r",encoding='utf-8') as file:
    input_text = file.read()

# Split into paragraphs on blank lines, dropping whitespace-only chunks
# (e.g. from trailing newlines or three or more consecutive newlines) so that
# empty strings are never embedded or returned as search results.
paragraphs = [chunk for chunk in input_text.split("\n\n") if chunk.strip()]

# Get embeddings of paragraphs and query
embeddings = [get_text_embedding(paragraph) for paragraph in paragraphs]
query_embedding = get_text_embedding("Did the Greeks and the Romans have universities?")

# Get the top k most similar paragraphs and their cosine similarity scores
similar_paragraphs = get_similar_paragraphs(query_embedding, embeddings,paragraphs, k=5)


106
5


In [7]:
# Show the highest-ranked paragraph together with its similarity score
best_match = similar_paragraphs[0]
print(best_match)

{'content': 'THE EARLIEST UNIVERSITIES\nUniversities, like cathedrals and parliaments, are a product of the Middle Ages. The Greeks and the Romans, strange as it may seem, had no universities in the sense in which the word has been used for the past seven or eight centuries. They had higher education, but the terms are not synonymous. Much of their instruction in law, rhetoric, and philosophy it would be hard to surpass, but it was not organized into the form of permanent institutions of learning. A great teacher like Socrates gave no diplomas; if a modern student sat at his feet for three months, he would demand a certificate,[4] something tangible and external to show for it—an excellent theme, by the way, for a Socratic dialogue. Only in the twelfth and thirteenth centuries do there emerge in the world those features of organized education with which we are most familiar, all that machinery of instruction represented by faculties and colleges and courses of study, examinations and c