In [141]:
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
import torch._utils
from transformers import pipeline

# Only let ERROR-level records through from the "transformers.modeling_utils" logger,
# so that its warning messages are not shown in the notebook output.
import logging
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)


# Already-loaded (tokenizer, model) pairs keyed by model name, so repeated calls
# (e.g. one call per paragraph) do not reload the weights every time.
_EMBEDDING_MODEL_CACHE = {}


def _load_embedding_model(model_name):
    """Return the ``(tokenizer, model)`` pair for ``model_name``, loading it at most once."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        model.eval()  # inference only: keep dropout disabled
        _EMBEDDING_MODEL_CACHE[model_name] = (tokenizer, model)
    return _EMBEDDING_MODEL_CACHE[model_name]


def get_text_embedding(text, model_name='bert-base-uncased'):
    """Embed ``text`` by mean-pooling the last hidden layer of a pre-trained model.

    Args:
        text: The text to embed, as accepted by the tokenizer (a string, or a
            list of strings for a batch).
        model_name: Name or path passed to ``AutoTokenizer`` / ``AutoModel``.

    Returns:
        A ``torch.Tensor`` holding the average of the last-layer token vectors,
        with size-1 dimensions squeezed away (so a single string yields a 1-D
        vector of the model's hidden size).
    """
    tokenizer, model = _load_embedding_model(model_name)

    # Tokenize input text and convert to PyTorch tensors
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    attention_mask = inputs.get('attention_mask')

    if attention_mask is None:
        # No mask available: fall back to a plain average over all positions
        embedding = hidden_states.mean(dim=1)
    else:
        # Average over real tokens only, so padding added for batched input does
        # not dilute the embedding. For a single text the mask is all ones and
        # this equals the plain mean.
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        embedding = (hidden_states * mask).sum(dim=1) / token_counts

    return embedding.squeeze()

In [142]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_paragraphs(query_embedding, paragraph_embeddings, paragraphs, k=5):
    """Rank paragraphs by cosine similarity to a query embedding.

    Args:
        query_embedding: Embedding vector of the query (anything ``np.asarray``
            can convert, e.g. a CPU tensor or an array).
        paragraph_embeddings: Sequence of embedding vectors, one per paragraph,
            in the same order as ``paragraphs``.
        paragraphs: Sequence of paragraph texts.
        k: Maximum number of results to return.

    Returns:
        A list of at most ``k`` dictionaries ``{"content": <paragraph>,
        "score": <cosine similarity as float>}``, most similar first.

    Raises:
        ValueError: If ``paragraph_embeddings`` and ``paragraphs`` differ in
            length, since results would then be paired with the wrong text.
    """
    if len(paragraph_embeddings) != len(paragraphs):
        raise ValueError(
            f"Got {len(paragraph_embeddings)} embeddings for {len(paragraphs)} paragraphs; "
            "they must be aligned one-to-one."
        )
    if len(paragraphs) == 0:
        return []

    # One (1, dim) query row against an (n, dim) matrix: a single vectorised call
    # instead of one cosine_similarity call per paragraph.
    query_row = np.asarray(query_embedding).reshape(1, -1)
    embedding_matrix = np.stack([np.asarray(embedding).reshape(-1) for embedding in paragraph_embeddings])
    similarities = cosine_similarity(query_row, embedding_matrix)[0]

    # Get the indices of the top k paragraphs based on their similarity scores
    top_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:k]

    # Scores are converted to plain floats rather than nested 1x1 arrays
    return [{"content": paragraphs[i], "score": float(similarities[i])} for i in top_indices]




In [143]:
# Question used to search the document for relevant paragraphs
query = "Did the Greeks and the Romans have universities?"

In [144]:
# Location of the example text file
path = "Documents/"
file_name = "rise_of_universities.txt"

# Read the whole document as UTF-8 text
with open(f"{path}{file_name}", mode="r", encoding="utf-8") as file:
    input_text = file.read()

# A blank line separates two paragraphs
paragraphs = input_text.split("\n\n")

# Embed every paragraph, then the query
embeddings = list(map(get_text_embedding, paragraphs))
query_embedding = get_text_embedding(query)

# Keep the 5 paragraphs most similar to the query, with their cosine similarity scores
similar_paragraphs = get_similar_paragraphs(
    query_embedding,
    embeddings,
    paragraphs,
    k=5,
)


KeyboardInterrupt: 

In [146]:
# Read the 02450 text file as UTF-8
path = 'Documents/02450_w_form.txt'
with open(path, mode="r", encoding='utf-8') as file:
    input_text = file.read()

# Split on blank lines, keep only chunks longer than 50 characters, and replace
# newlines, tabs and "- " sequences (in that order) with a single space.
paragraphs = [
    chunk.replace("\n", " ").replace("\t", " ").replace("- ", " ")
    for chunk in input_text.split("\n\n")
    if len(chunk) > 50
]

def add_paragraphs(paragraphs, min_length=700, separator=" "):
    """Merge consecutive paragraphs until each chunk has at least ``min_length`` characters.

    Paragraphs are consumed in order: a chunk keeps absorbing the following
    paragraph while it is shorter than ``min_length``. The final chunk may stay
    shorter when there is nothing left to merge into it.

    Args:
        paragraphs: List of paragraph strings. It is updated in place, as
            before, and also returned.
        min_length: Minimum number of characters a chunk should reach before
            a new chunk is started.
        separator: Text inserted between two merged paragraphs. The default
            single space keeps the last word of one paragraph from fusing with
            the first word of the next; pass ``""`` for plain concatenation.

    Returns:
        The same list object, now holding the merged chunks.
    """
    merged = []
    current = None
    for paragraph in paragraphs:
        if current is None:
            current = paragraph
        elif len(current) < min_length:
            # Still too short: absorb the next paragraph
            current = current + separator + paragraph
        else:
            merged.append(current)
            current = paragraph
    if current is not None:
        merged.append(current)

    # Single pass plus one slice assignment replaces repeated list.pop() calls,
    # while keeping the in-place contract for callers that ignore the return value.
    paragraphs[:] = merged
    return paragraphs


# Merge short paragraphs into longer chunks
paragraphs = add_paragraphs(paragraphs)

# Embed every resulting paragraph
embeddings = list(map(get_text_embedding, paragraphs))


In [147]:
import pickle

In [148]:

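# Uncomment to save the computed `embeddings` to the file "emb_02450",
# so they can be reloaded later instead of being recomputed:
# with open("emb_02450", "wb") as fp:   #Pickling
#     pickle.dump(embeddings, fp)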

In [149]:
# Load previously pickled embeddings from the file "emb_02450".
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
# NOTE(review): these embeddings are later paired by index with `paragraphs`, so the file
# presumably has to come from the same paragraph list; confirm it is up to date.
with open("emb_02450", "rb") as fp:
    embeddings_02450 = pickle.load(fp)

In [151]:
# Question to answer from the 02450 text
query = "How does the Expectation maximization algorithm works?"

In [152]:
# Embed the question
query_embedding = get_text_embedding(query)

# Retrieve the 5 paragraphs most similar to the query, with their cosine similarity scores
similar_paragraphs = get_similar_paragraphs(
    query_embedding,
    embeddings_02450,
    paragraphs,
    k=5,
)

In [154]:
# Text of the single best-ranked paragraph (index 0 of the results), printed for inspection
top_answers = similar_paragraphs[0]["content"]
print(top_answers)

where  λ > 0  is  the  regularization  term.  This  difficulty  increases  with  poor   initialization  and  it  is therefore  recommended  to  initialize  the  EM  algorithm  to  the   output  of  the  K-means  clustering algorithm. Third, the EM algorithm in its present form  requires parameters;  for  high-dimensional  datasets  the  number  K(M  + 1)M/2  can  be  brought  down  by  considering  a  diagonal  covariance  matrix  to  KM .  There  is  however  also  goods  news  with   regards to  the  EM  algorithm  for  GMMs.  Asides  accomplishing  the  primary  objective,  a   general  density estimator which can be fitted efficiently, an advantage of the EM algorithm over   K-means is that one can select K  using cross-validation.


In [156]:
# Extractive question-answering pipeline. The task is named explicitly instead of
# being inferred from the model, and the same checkpoint supplies the tokenizer.
oracle = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    tokenizer="deepset/roberta-base-squad2",
)

# Ask for the 5 best answer spans found in the retrieved paragraph. Keyword
# arguments make it unambiguous which string is the question and which the context.
Bert_topK = oracle(question=query, context=top_answers, top_k=5)

In [158]:
# Display the answer candidates returned by the `oracle` pipeline call
Bert_topK

[{'score': 0.017856039106845856,
  'start': 728,
  'end': 744,
  'answer': 'cross-validation'},
 {'score': 0.010887210257351398,
  'start': 704,
  'end': 744,
  'answer': 'one can select K  using cross-validation'},
 {'score': 0.010439837351441383,
  'start': 722,
  'end': 744,
  'answer': 'using cross-validation'},
 {'score': 0.0030739835929125547,
  'start': 712,
  'end': 744,
  'answer': 'select K  using cross-validation'},
 {'score': 0.002891442272812128,
  'start': 708,
  'end': 744,
  'answer': 'can select K  using cross-validation'}]

In [159]:
# Prompt for the text generator: instructions, the retrieved context, the question,
# and the best extracted answer span as the start of the answer.
prompt = (
    "You are a friendly and helpful chatbot. "
    "your job is to give a short and relevant answer to a given question, "
    "based on a specific background context. "
    f"The context is: '{top_answers}'. "
    "Please give a relevant answer to the following question. "
    f"Question: {query}. "
    f"Answer: {Bert_topK[0]['answer']}"
)

# prompt=f"You are a friendly and helpful chatbot. your job is to give a short and relevant answer to a given question. Please give a relevant answer to the following question. Question: {query}. Answer:"


In [160]:
# Feed the prompt (context, question and extracted answer) to GPT-2 to generate a response
generator = pipeline("text-generation", model="gpt2-large")
generator(
    prompt,
    # NOTE: max_length counts the prompt tokens as well as the generated ones,
    # so a long context leaves little or no room for new text.
    max_length=400,
    do_sample=True,
    # Pass the pad token explicitly (GPT-2 defines none, so EOS is used); this
    # avoids the "Setting `pad_token_id` to `eos_token_id`" message on every call.
    pad_token_id=generator.tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'You are a friendly and helpful chatbot. your job is to give a short and relevant answer to a given question, based on a specific background context. The context is: \'where  λ > 0  is  the  regularization  term.  This  difficulty  increases  with  poor   initialization  and  it  is therefore  recommended  to  initialize  the  EM  algorithm  to  the   output  of  the  K-means  clustering algorithm. Third, the EM algorithm in its present form  requires parameters;  for  high-dimensional  datasets  the  number  K(M  + 1)M/2  can  be  brought  down  by  considering  a  diagonal  covariance  matrix  to  KM .  There  is  however  also  goods  news  with   regards to  the  EM  algorithm  for  GMMs.  Asides  accomplishing  the  primary  objective,  a   general  density estimator which can be fitted efficiently, an advantage of the EM algorithm over   K-means is that one can select K  using cross-validation.\'. Please give a relevant answer to the following question. Questi