In [10]:
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
import torch._utils

# Raise the log level of the "transformers.modeling_utils" logger to ERROR
# so that its warning messages are not shown in the notebook output.
import logging
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)


# Tokenizer/model pairs that have already been loaded, keyed by model name.
# Callers embed one paragraph per call, so without this cache the weights
# would be re-read from disk for every single paragraph.
_EMBEDDING_MODEL_CACHE = {}


def _load_embedding_model(model_name):
    """Return the (tokenizer, model) pair for `model_name`, loading it only once."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        _EMBEDDING_MODEL_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _EMBEDDING_MODEL_CACHE[model_name]


def get_text_embedding(text, model_name='bert-base-uncased'):
    """Embed text by mean-pooling the last hidden layer of a pre-trained model.

    Args:
        text: The text to embed. It is passed straight to the tokenizer with
            padding and truncation enabled.
        model_name: Name of the pre-trained model/tokenizer to load.

    Returns:
        A torch tensor: the average of the last-layer token vectors, with
        size-1 dimensions squeezed away (a 1-D vector for a single text).
    """
    tokenizer, model = _load_embedding_model(model_name)

    # Tokenize input text and convert to PyTorch tensors
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    attention_mask = inputs.get('attention_mask')
    if attention_mask is None:
        # No mask available: fall back to a plain mean over all token positions
        pooled = hidden.mean(dim=1)
    else:
        # Mean-pool over real tokens only, so padding positions (mask == 0)
        # do not distort the embedding when several texts are batched.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = (hidden * mask).sum(dim=1) / token_counts

    embedding = pooled.squeeze()

    return embedding

In [11]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_paragraphs(query_embedding, paragraph_embeddings, paragraphs, k=5):
    """Return the k paragraphs whose embeddings are most similar to the query.

    Args:
        query_embedding: Embedding vector of the query (anything
            `np.asarray` can convert; it is flattened to one row).
        paragraph_embeddings: Iterable of embedding vectors, one per paragraph,
            in the same order as `paragraphs`.
        paragraphs: The paragraph texts.
        k: Maximum number of results to return.

    Returns:
        A list of at most k dictionaries `{"content": <paragraph>, "score": <float>}`
        sorted from most to least similar. The score is the cosine similarity
        as a plain Python float. An empty list is returned when there are no
        paragraph embeddings.
    """
    paragraph_rows = [np.asarray(embedding).reshape(-1) for embedding in paragraph_embeddings]
    if not paragraph_rows:
        return []

    # One vectorized call: row 0 of the result holds the similarity between
    # the query and every paragraph.
    query_row = np.asarray(query_embedding).reshape(1, -1)
    scores = cosine_similarity(query_row, np.stack(paragraph_rows))[0]

    # Indices of the k highest-scoring paragraphs, best first
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]

    # Return the top k paragraphs and their similarity scores as a list of dictionaries
    return [{"content": paragraphs[i], "score": float(scores[i])} for i in top_indices]




In [12]:
# Question to answer from the document
query = "Did the Greeks and the Romans have universities?"

In [13]:
# Read the example document from disk
path = "Documents/"
file_name = "rise_of_universities.txt"

with open(f"{path}{file_name}", mode="r", encoding="utf-8") as file:
    input_text = file.read()

# Paragraphs are separated by a blank line
paragraphs = input_text.split("\n\n")

# Embed every paragraph, then the query
embeddings = list(map(get_text_embedding, paragraphs))
query_embedding = get_text_embedding(query)

# Rank the paragraphs by cosine similarity to the query and keep the 5 best
similar_paragraphs = get_similar_paragraphs(query_embedding, embeddings, paragraphs, k=5)


In [None]:
path = 'Documents/02450_w_form.txt'
with open(path, mode="r", encoding='utf-8') as file:
    input_text = file.read()

# Split on blank lines, keep only paragraphs longer than 100 characters, and
# replace newlines, tabs and "- " with a single space in the ones that remain.
paragraphs = [
    p.replace("\n", " ").replace("\t", " ").replace("- ", " ")
    for p in input_text.split("\n\n")
    if len(p) > 100
]
# Embed every remaining paragraph
embeddings = list(map(get_text_embedding, paragraphs))


In [None]:
import pickle

# Save the paragraph embeddings to disk so they do not have to be recomputed
with open("emb_02450", "wb") as fp:   #Pickling
    pickle.dump(embeddings, fp)

In [None]:
import pickle

# Load the embeddings saved with pickle.dump.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that were written by this notebook or come from a trusted source.
with open("emb_02450", "rb") as fp:
    embeddings_02450 = pickle.load(fp)

In [None]:
# Embed the query
query_embedding = get_text_embedding(query)

# Rank the paragraphs by cosine similarity to the query and keep the 5 best.
# NOTE(review): the pickled `embeddings_02450` is loaded but not used here;
# confirm that `embeddings` is the list intended for this ranking.
similar_paragraphs = get_similar_paragraphs(
    query_embedding=query_embedding,
    paragraph_embeddings=embeddings,
    paragraphs=paragraphs,
    k=5,
)

In [14]:
# Print the text of the first (highest-ranked) paragraph
best_match = similar_paragraphs[0]
top_answers = best_match["content"]
print(top_answers)

I
THE EARLIEST UNIVERSITIES
Universities, like cathedrals and parliaments, are a product of the Middle Ages. The Greeks and the Romans, strange as it may seem, had no universities in the sense in which the word has been used for the past seven or eight centuries. They had higher education, but the terms are not synonymous. Much of their instruction in law, rhetoric, and philosophy it would be hard to surpass, but it was not organized into the form of permanent institutions of learning. A great teacher like Socrates gave no diplomas; if a modern student sat at his feet for three months, he would demand a certificate,[4] something tangible and external to show for it—an excellent theme, by the way, for a Socratic dialogue. Only in the twelfth and thirteenth centuries do there emerge in the world those features of organized education with which we are most familiar, all that machinery of instruction represented by faculties and colleges and courses of study, examinations and commencements

In [15]:
from transformers import pipeline

In [16]:
# Build a pipeline from the question-answering checkpoint (model and tokenizer share one name)
qa_model_name = "deepset/roberta-base-squad2"
oracle = pipeline(model=qa_model_name, tokenizer=qa_model_name)

# Ask for the 5 best answer candidates for the query within the retrieved paragraph
Bert_topK = oracle(question=query, context=top_answers, top_k=5)

In [17]:
# Display the answer candidates returned by the question-answering pipeline
Bert_topK

[{'score': 0.26762357354164124,
  'start': 109,
  'end': 179,
  'answer': 'The Greeks and the Romans, strange as it may seem, had no universities'},
 {'score': 0.2206004559993744,
  'start': 164,
  'end': 179,
  'answer': 'no universities'},
 {'score': 0.14598329365253448,
  'start': 160,
  'end': 179,
  'answer': 'had no universities'},
 {'score': 0.07087451219558716,
  'start': 109,
  'end': 166,
  'answer': 'The Greeks and the Romans, strange as it may seem, had no'},
 {'score': 0.05842142552137375, 'start': 164, 'end': 166, 'answer': 'no'}]

In [18]:
# Prompt for the text generator: instructions, the retrieved paragraph as
# context, the question, and the best extracted answer as the start of the reply.
prompt = (
    "You are a friendly and helpful chatbot. "
    "your job is to give a short and relevant answer to a given question, "
    "based on a specific background context. "
    f"The context is: '{top_answers}'. "
    "Please give a relevant answer to the following question. "
    f"Question: {query}. "
    f"Answer: {Bert_topK[0]['answer']}"
)

# Alternative prompt without the retrieved context and extracted answer:
# prompt=f"You are a friendly and helpful chatbot. your job is to give a short and relevant answer to a given question. Please give a relevant answer to the following question. Question: {query}. Answer:"


In [19]:
# Let a GPT-2 text-generation pipeline continue the prompt to produce a response.
# do_sample=True samples tokens, so the generated text can differ between runs.
# NOTE(review): max_length presumably counts the prompt tokens as well — confirm
# that 400 leaves room for generation when the retrieved paragraph is long.
generator = pipeline(task="text-generation", model="gpt2-large")
generator(prompt, max_length=400, do_sample=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "You are a friendly and helpful chatbot. your job is to give a short and relevant answer to a given question, based on a specific background context. The context is: 'I\nTHE EARLIEST UNIVERSITIES\nUniversities, like cathedrals and parliaments, are a product of the Middle Ages. The Greeks and the Romans, strange as it may seem, had no universities in the sense in which the word has been used for the past seven or eight centuries. They had higher education, but the terms are not synonymous. Much of their instruction in law, rhetoric, and philosophy it would be hard to surpass, but it was not organized into the form of permanent institutions of learning. A great teacher like Socrates gave no diplomas; if a modern student sat at his feet for three months, he would demand a certificate,[4] something tangible and external to show for it—an excellent theme, by the way, for a Socratic dialogue. Only in the twelfth and thirteenth centuries do there emerge in the world those 

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
