In [1]:
from langchain.document_loaders import PyPDFLoader
from sentence_transformers import SentenceTransformer
import faiss
import torch
import re
import nest_asyncio

  from tqdm.autonotebook import tqdm, trange


In [2]:
def split_text_with_overlap(text, chunk_size=450, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks share ``overlap`` words, so each new chunk starts
    ``chunk_size - overlap`` words after the previous one.

    Args:
        text: Input string; it is tokenised with ``str.split()``, so any run of
            whitespace (including newlines) acts as a single separator.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between neighbouring chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, each with its words re-joined by single
        spaces. An empty or whitespace-only ``text`` yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is outside
            ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive stride would either crash range() or silently
        # produce no chunks at all, so reject it up front.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would be
        # fully contained in this one, so stop instead of emitting a
        # redundant tail chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

In [3]:
# Apply the nest_asyncio patch before any loader code runs.
# NOTE(review): presumably needed so nested event loops work inside the
# notebook kernel - confirm it is still required.
nest_asyncio.apply()

# Read the synopsis PDF, stitch every page's text into one string, then cut it
# into 90-word windows that share 20 words with their neighbours.
pdf_path = "../data/GOT_synopsis.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()
page_texts = [page.page_content for page in docs]
full_text = " ".join(page_texts)
doc_texts = split_text_with_overlap(
    full_text,
    chunk_size=90,
    overlap=20,
)

In [4]:
# Persist the chunks, one per line, so they can be inspected or reloaded later.
# The encoding is pinned to UTF-8: without it open() uses the platform's locale
# encoding, so the bytes written (and whether non-ASCII characters from the PDF
# can be written at all) would vary from machine to machine.
with open("../data/doc_texts.txt", "w", encoding="utf-8") as f:
    for chunk in doc_texts:
        f.write(chunk + "\n")

In [4]:

# Load the 'all-mpnet-base-v2' sentence-embedding model and embed every chunk;
# `embeddings` holds one vector per entry of `doc_texts`, in the same order.
# NOTE(review): the name `model` is rebound to the BERT QA model in a later
# cell, so re-running this section out of order will pick up the wrong object.
model = SentenceTransformer('all-mpnet-base-v2')
embeddings = model.encode(doc_texts)

In [5]:
# Save the sentence-embedding model locally so later runs can load it from
# '../models/sentence_embedding_model' instead of by checkpoint name.
model.save('../models/sentence_embedding_model')

In [6]:

# Flat L2 index whose dimensionality is taken from the embedding matrix.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)

# Row i of the index corresponds to doc_texts[i].
index.add(embeddings)

In [7]:
# Serialise the populated FAISS index to disk for reuse without re-embedding.
faiss.write_index(index, '../models/faiss_index_file.index')

In [8]:
# Reload the persisted FAISS index from disk; this rebinds the in-memory `index`.
index = faiss.read_index('../models/faiss_index_file.index')

In [8]:
# Reload the sentence-embedding model from the local directory it was saved to,
# under a dedicated name that is not overwritten elsewhere in the notebook.
sentence_embedding_model = SentenceTransformer('../models/sentence_embedding_model')

In [9]:
# Retrieval step: embed the question and fetch its nearest chunks from FAISS.
query = '''To whom did the Petyr Baelish gave the Valeryian steel dagger ?'''
query_vector = sentence_embedding_model.encode([query])

# Ask for the 4 closest chunks. When the index holds fewer than k vectors,
# FAISS pads the result with -1; those slots must be dropped, otherwise
# doc_texts[-1] would silently inject the last chunk into the context.
_, I = index.search(query_vector, k=4)
merged_text = " ".join([doc_texts[i] for i in I[0] if i >= 0])

In [10]:
# Show the retrieved context that will be passed to the QA model.
print(merged_text)

a cash reward but wants more. Jaime orders Bronn to help the Tarly's round up food from the local farmers. The Iron Bank rep thanks Cersei for the gold repayment, on its way. He suggests future deals and their support. She plans on deals with Essos. Petyr Baelish sits with Bran and gives him the Valeryian steel dagger that kill ed his mother, but doesn't know who owned it. Baelish promises support but is stopped by a cryptic comment, "chaos is a ladder", Littlefinger had used that phrase in private up with a knife to his throat -- it's Petyr. "I did warn you not to trust me," Petyr says. S1, Ep8 The Pointy End • Arya practices her swordplay while real swordplay spills out of the throne room after Cersei and Joffrey tried to claim the throne. The fighting heads into the courtyard, where Cersei's men kill anyone who gets in their way. Sansa goes looking for her sist er, but her nursemaid hears the fighting and sends Sansa to her room with strict instructions not to open the a room full o

In [11]:

from transformers import BertForQuestionAnswering, BertTokenizer

# Model and tokenizer are both loaded from this one checkpoint name.
qa_checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'

# Question-answering model.
# NOTE(review): this rebinds `model`, which previously held the
# sentence-embedding model.
model = BertForQuestionAnswering.from_pretrained(qa_checkpoint)

# Matching tokenizer.
tokenizer = BertTokenizer.from_pretrained(qa_checkpoint)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Save the QA model and its tokenizer to local directories so they can be
# reloaded from disk. The tokenizer call is the cell's last expression, so its
# return value (the paths of the files it wrote) is displayed as the output.
model.save_pretrained('../models/Q&A_model')
tokenizer.save_pretrained('../models/Q&A_tokenizer')

('../models/Q&A_tokenizer\\tokenizer_config.json',
 '../models/Q&A_tokenizer\\special_tokens_map.json',
 '../models/Q&A_tokenizer\\vocab.txt',
 '../models/Q&A_tokenizer\\added_tokens.json')

In [13]:
# Reload the QA model and tokenizer from the local directories they were saved
# to, under dedicated names (QA_model / QA_tokenizer) used by the cells below.
QA_model = BertForQuestionAnswering.from_pretrained('../models/Q&A_model')
QA_tokenizer = BertTokenizer.from_pretrained('../models/Q&A_tokenizer')

In [14]:
question = query
paragraph = merged_text

# Encode the (question, paragraph) pair for BERT. The model only has 512
# position embeddings, so an over-long context would crash the forward pass;
# truncate the paragraph (never the question) to stay within that limit.
encoding = QA_tokenizer.encode_plus(
    text=question,
    text_pair=paragraph,
    truncation="only_second",
    max_length=512,
)

inputs = encoding['input_ids']  # token ids (vocabulary indices), not embeddings
sentence_embedding = encoding['token_type_ids']  # segment ids: 0 = question, 1 = paragraph
tokens = QA_tokenizer.convert_ids_to_tokens(inputs)  # WordPiece strings aligned with `inputs`

In [15]:
# Echo the question and its retrieved context, each on its own line.
print(question, paragraph, sep="\n")

To whom did the Petyr Baelish gave the Valeryian steel dagger ?
a cash reward but wants more. Jaime orders Bronn to help the Tarly's round up food from the local farmers. The Iron Bank rep thanks Cersei for the gold repayment, on its way. He suggests future deals and their support. She plans on deals with Essos. Petyr Baelish sits with Bran and gives him the Valeryian steel dagger that kill ed his mother, but doesn't know who owned it. Baelish promises support but is stopped by a cryptic comment, "chaos is a ladder", Littlefinger had used that phrase in private up with a knife to his throat -- it's Petyr. "I did warn you not to trust me," Petyr says. S1, Ep8 The Pointy End • Arya practices her swordplay while real swordplay spills out of the throne room after Cersei and Joffrey tried to claim the throne. The fighting heads into the courtyard, where Cersei's men kill anyone who gets in their way. Sansa goes looking for her sist er, but her nursemaid hears the fighting and sends Sansa to

In [16]:
# Run the QA model once and read both heads from the same output; the start
# and end logits come from a single forward pass, so calling the model twice
# only doubled the inference cost. Gradients are not needed for inference, so
# the call is wrapped in no_grad() to avoid building the autograd graph.
with torch.no_grad():
    qa_outputs = QA_model(
        input_ids=torch.tensor([inputs]),  # wrapped in a list -> batch of one
        token_type_ids=torch.tensor([sentence_embedding]),
    )

start_scores = qa_outputs['start_logits']
end_scores = qa_outputs['end_logits']

In [17]:
# Pick the most likely start and end token positions. The batch holds a single
# sequence, so the flattened argmax is directly the token index.
start_index = torch.argmax(start_scores)
end_index = torch.argmax(end_scores)

# Join the selected tokens back into text. WordPiece marks word continuations
# with a leading "##", so those pieces are glued onto the preceding token
# instead of being left as separate space-delimited fragments.
# NOTE(review): if end_index < start_index the slice is empty and `answer` is
# '' - treat that as "no answer found".
answer = ' '.join(tokens[start_index:end_index+1]).replace(' ##', '')

In [18]:
# Show the extracted answer span.
print(answer)

bran
