In [1]:
from transformers import AutoTokenizer, AutoModel, pipeline
import torch

# Silence messages below ERROR level from the "transformers.modeling_utils"
# logger so that warnings from that logger do not clutter the cell output.
import logging
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)



# Cache of already-loaded (tokenizer, model) pairs, keyed by model name.
_ENCODER_CACHE = {}


def _load_encoder(model_name):
    """Return the (tokenizer, model) pair for `model_name`, loading it only once."""
    if model_name not in _ENCODER_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        _ENCODER_CACHE[model_name] = (tokenizer, model)
    return _ENCODER_CACHE[model_name]


def get_text_embedding(text, model_name='bert-base-uncased'):
    """Embed `text` by mean-pooling the encoder's last hidden layer.

    Args:
        text: A string, or a list of strings to embed as one padded batch.
        model_name: Name or path passed to `from_pretrained` for both the
            tokenizer and the model.

    Returns:
        A torch tensor with all size-1 dimensions squeezed out: a 1-D vector
        of hidden size for a single string, or a (batch, hidden) matrix for a
        list with more than one string.
    """
    # Reuse the tokenizer/model across calls instead of reloading the weights
    # from disk every time the function is invoked.
    tokenizer, model = _load_encoder(model_name)

    # Tokenize input text and convert to PyTorch tensors
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    attention_mask = inputs.get('attention_mask')
    if attention_mask is None:
        # No mask available: fall back to a plain mean over the token axis.
        embedding = hidden_states.mean(dim=1)
    else:
        # Mean over real tokens only, so padding added for batched input does
        # not dilute the embedding. For a single string nothing is padded and
        # this equals the plain mean.
        weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1)
        embedding = (hidden_states * weights).sum(dim=1) / token_counts

    return embedding.squeeze()

In [2]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_paragraphs(query_embedding, paragraph_embeddings, paragraphs, k=5):
    """Rank paragraphs by cosine similarity to the query embedding.

    Args:
        query_embedding: Embedding vector of the query (array-like; flattened
            to a single row).
        paragraph_embeddings: Sequence of embedding vectors, one per entry of
            `paragraphs`, each with the same number of elements as the query.
        paragraphs: Sequence of paragraph texts aligned with
            `paragraph_embeddings`.
        k: Maximum number of results to return.

    Returns:
        A list of dictionaries ``{"content": paragraph, "score": similarity}``
        ordered from most to least similar, where ``score`` is a plain float.
        Empty when there are no paragraph embeddings.
    """
    # Nothing to rank (np.stack would raise on an empty sequence).
    if len(paragraph_embeddings) == 0:
        return []

    # One vectorised call: a (1, dim) query row against an (n, dim) matrix.
    query_row = np.asarray(query_embedding).reshape(1, -1)
    paragraph_matrix = np.stack(
        [np.asarray(embedding).reshape(-1) for embedding in paragraph_embeddings]
    )
    similarities = cosine_similarity(query_row, paragraph_matrix)[0]

    # Stable descending order keeps the original paragraph order among ties.
    top_indices = np.argsort(-similarities, kind="stable")[:k]

    # Return the top k paragraphs and their similarity scores as a list of dictionaries
    return [
        {"content": paragraphs[int(i)], "score": float(similarities[i])}
        for i in top_indices
    ]




In [3]:
# Question used for paragraph retrieval, extractive QA and the final prompt.
query = "Did the Greeks and the Romans have universities?"

In [5]:
# Open the example text file
path="Documents/"
file_name="rise_of_universities.txt"

with open(path + file_name, "r", encoding='utf-8') as file:
    input_text = file.read()

# Split into paragraphs on blank lines. Chunks that are empty or contain only
# whitespace (e.g. from runs of blank lines or a trailing newline) are dropped
# so they are neither embedded nor ranked.
paragraphs = [paragraph for paragraph in input_text.split("\n\n") if paragraph.strip()]

# Get embeddings of the paragraphs and of the query
embeddings = [get_text_embedding(paragraph) for paragraph in paragraphs]
query_embedding = get_text_embedding(query)

# Get the top k most similar paragraphs together with their cosine similarity scores
similar_paragraphs = get_similar_paragraphs(query_embedding, embeddings, paragraphs, k=5)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [6]:
#print most similar paragraphs and score
top_answers=similar_paragraphs[0]["content"]
print(top_answers)

I
THE EARLIEST UNIVERSITIES
Universities, like cathedrals and parliaments, are a product of the Middle Ages. The Greeks and the Romans, strange as it may seem, had no universities in the sense in which the word has been used for the past seven or eight centuries. They had higher education, but the terms are not synonymous. Much of their instruction in law, rhetoric, and philosophy it would be hard to surpass, but it was not organized into the form of permanent institutions of learning. A great teacher like Socrates gave no diplomas; if a modern student sat at his feet for three months, he would demand a certificate,[4] something tangible and external to show for it—an excellent theme, by the way, for a Socratic dialogue. Only in the twelfth and thirteenth centuries do there emerge in the world those features of organized education with which we are most familiar, all that machinery of instruction represented by faculties and colleges and courses of study, examinations and commencements

In [14]:
from transformers import pipeline

In [15]:
# Extractive question answering over the retrieved paragraph.
# The task is named explicitly rather than inferred from the model, and the
# question/context are passed by keyword, the documented calling form.
oracle = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    tokenizer="deepset/roberta-base-squad2",
)
# top_k=5 asks for the five best-scoring answer spans.
Bert_topK = oracle(question=query, context=top_answers, top_k=5)

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [16]:
# Display the answer candidates returned by the question-answering call
# (requested with top_k=5).
Bert_topK

[{'score': 0.26762381196022034,
  'start': 109,
  'end': 179,
  'answer': 'The Greeks and the Romans, strange as it may seem, had no universities'},
 {'score': 0.2206006497144699,
  'start': 164,
  'end': 179,
  'answer': 'no universities'},
 {'score': 0.14598335325717926,
  'start': 160,
  'end': 179,
  'answer': 'had no universities'},
 {'score': 0.07087451964616776,
  'start': 109,
  'end': 166,
  'answer': 'The Greeks and the Romans, strange as it may seem, had no'},
 {'score': 0.05842142552137375, 'start': 164, 'end': 166, 'answer': 'no'}]

In [17]:
# Build the generation prompt: instructions, the retrieved paragraph as
# context, the question, and the best extracted answer span as the start of
# the answer. Adjacent literals are concatenated into one string.
prompt = (
    "You are a friendly and helpful chatbot. "
    "your job is to give a short and relevant answer to a given question, "
    "based on a specific background context. "
    f"The context is: '{top_answers}'. "
    "Please give a relevant answer to the following question. "
    f"Question: {query}. "
    f"Answer: {Bert_topK[0]['answer']}"
)

# Alternative prompt without background context, kept for comparison:
# prompt=f"You are a friendly and helpful chatbot. your job is to give a short and relevant answer to a given question. Please give a relevant answer to the following question. Question: {query}. Answer:"


In [19]:
# Feed the prompt (context, question and extracted answer) to a GPT-2 model
# to generate a response.
generator = pipeline("text-generation", model="gpt2-large")

# Sampling is stochastic; fix the seed so a re-run reproduces the same text.
torch.manual_seed(42)

generator(
    prompt,
    # Bound only the generated continuation. `max_length` also counts the
    # prompt tokens, so a long retrieved paragraph could leave little or no
    # room for the answer.
    max_new_tokens=100,
    do_sample=True,
    # Set the padding token explicitly instead of relying on the fallback
    # that prints "Setting `pad_token_id` to `eos_token_id`".
    pad_token_id=generator.tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "You are a friendly and helpful chatbot. your job is to give a short and relevant answer to a given question, based on a specific background context. The context is: 'I\nTHE EARLIEST UNIVERSITIES\nUniversities, like cathedrals and parliaments, are a product of the Middle Ages. The Greeks and the Romans, strange as it may seem, had no universities in the sense in which the word has been used for the past seven or eight centuries. They had higher education, but the terms are not synonymous. Much of their instruction in law, rhetoric, and philosophy it would be hard to surpass, but it was not organized into the form of permanent institutions of learning. A great teacher like Socrates gave no diplomas; if a modern student sat at his feet for three months, he would demand a certificate,[4] something tangible and external to show for it—an excellent theme, by the way, for a Socratic dialogue. Only in the twelfth and thirteenth centuries do there emerge in the world those 