#### **Load the document to which you want to train the model:**

In [33]:
import pymupdf  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Function to extract text from the PDF using pymupdf
def extract_text_from_pdf(pdf_path):
    doc = pymupdf.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# Function to create a Document object for langchain
def create_document(text):
    return Document(page_content=text)

# Define a function to split the text into chunks using langchain's RecursiveCharacterTextSplitter
def chunk_text(text, chunk_size=1000, chunk_overlap=150):
    # Create an instance of RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    
    # Create a Document object as expected by the text splitter
    doc = create_document(text)
    
    # Split the document into chunks
    return text_splitter.split_documents([doc])

# Specify the path to your PDF file
pdf_path = r'C:\Users\nilay\OneDrive - Cal State Fullerton (1)\Desktop\NILAY-TO-JOB-DATA\SPRING 2024\Projects\Multi-Purpose_AI_ChatBot\documents\harrypotter.pdf'

# Extract text from the PDF
text = extract_text_from_pdf(pdf_path)

# Split the extracted text into chunks
chunks = chunk_text(text)

# Iterate through the chunks and print or process them
for chunk in chunks:
    print(f"{chunk.page_content}\n\n")

# Extract the text content from each chunk in 'chunks'
chunk_texts = [chunk.page_content for chunk in chunks]

CONTENTS
Harry Potter and the Sorcerer’s Stone
Harry Potter and the Chamber of Secrets
Harry Potter and the Prisoner of Azkaban
Harry Potter and the Goblet of Fire
Harry Potter and the Order of the Phoenix
Harry Potter and the Half-Blood Prince
Harry Potter and the Deathly Hallows
 
FOR JESSICA, WHO LOVES STORIES,
FOR ANNE, WHO LOVED THEM TOO;
AND FOR DI, WHO HEARD THIS ONE FIRST.
 
CONTENTS
ONE
The Boy Who Lived
TWO
The Vanishing Glass
THREE
The Letters from No One
FOUR
The Keeper of the Keys
FIVE
Diagon Alley
SIX
The Journey from Platform Nine and Three-quarters
SEVEN
The Sorting Hat
EIGHT
The Potions Master
NINE
The Midnight Duel
TEN
Halloween
ELEVEN
Quidditch
TWELVE
The Mirror of Erised
THIRTEEN
Nicolas Flamel
FOURTEEN
Norbert the Norwegian Ridgeback
FIFTEEN
The Forbidden Forest
SIXTEEN
Through the Trapdoor
SEVENTEEN
The Man with Two Faces
M
 
CHAPTER  ONE
THE BOY WHO LIVED
r. and Mrs. Dursley, of number four, Privet Drive, were proud to say


SEVENTEEN
The Man with Two Faces
M
 
C

#### **Using 'sentence-transformers/all-MiniLM-L6-v2' for generating embedding for the text:**

In [34]:
from sentence_transformers import SentenceTransformer

# Load the SentenceTransformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



In [35]:
# Encode the text chunks into embeddings
embeddings_text = model.encode(chunk_texts)

# Print the resulting embeddings
print(embeddings_text)

[[-0.03331053  0.01512659  0.02184572 ...  0.07222768 -0.03463246
  -0.02149419]
 [-0.04536013  0.02431055  0.00590644 ...  0.0378813  -0.00503044
  -0.05739693]
 [-0.08069363  0.02635693  0.05996142 ...  0.08673899 -0.02785908
  -0.04810682]
 ...
 [-0.03441347  0.00865177  0.05329141 ...  0.01534658 -0.04612155
   0.00062893]
 [-0.06121116  0.02986909  0.02904147 ... -0.0006145  -0.0118852
   0.10015128]
 [ 0.04530032  0.02232942 -0.00101617 ... -0.02057693 -0.04554052
   0.01780267]]


In [None]:
type(embeddings_text)

#### **Connecting to Pgvector DB:**

In [36]:
import psycopg2

# Connection parameters
db_name = "postgres"
user = "postgres"
password = "admin"
host = "localhost"  # or the IP address of the PostgreSQL server
port = "5432"       # default PostgreSQL port

# Connect to the PostgreSQL database
conn = psycopg2.connect(
    dbname=db_name,
    user=user,
    password=password,
    host=host,
    port=port
)

print('Status = ', conn)

# Create a cursor object
cur = conn.cursor()

Status =  <connection object at 0x000001F419DBFBC0; dsn: 'user=postgres password=xxx dbname=postgres host=localhost port=5432', closed: 0>


##### **Creating the table in Pgvector DB:**

In [37]:
# SQL query to create the table
create_table_query = """
CREATE TABLE HP_part_1 (
    id SERIAL PRIMARY KEY,
    document TEXT,
    embedding VECTOR(384)
)
"""

# Execute the SQL query
cur.execute(create_table_query)

# Commit the transaction to make the changes persistent
conn.commit()

print("Table created successfully.")

Table created successfully.


##### **Dropping the table in Pgvector DB:**

In [32]:
# SQL query to drop the table
drop_table_query = """
DROP TABLE IF EXISTS HP_part_1;
"""

# Execute the SQL query
cur.execute(drop_table_query)

# Commit the transaction to make the changes persistent
conn.commit()

print("Table dropped successfully.")

Table dropped successfully.


##### **Store the embeddings in the Table:**

In [38]:
# SQL query to insert data into the table
insert_query = """
INSERT INTO HP_part_1 (document, embedding)
VALUES (%s, %s::vector)
"""

# Insert each document and its corresponding embedding
for doc_text, embedding in zip(chunk_texts, embeddings_text):
    cur.execute(insert_query, (doc_text, embedding.tolist()))

# Commit the transaction
conn.commit()

print("Data inserted successfully.")

Data inserted successfully.


##### **Taking input from user & converting it to embedding and querying the database:**

In [48]:
# User Query Input
# user_query = input("Enter your query: ")
user_query = 'Who killed Professor Albus Dumbledore?'

# Generate embedding for the user's query
query_embedding = model.encode([user_query])[0]  # Convert to vector

# SQL query to perform similarity search and fetch top 2 results
similarity_query = """
SELECT id, document, embedding
FROM HP_part_1
ORDER BY embedding <-> %s::vector
LIMIT 1;
"""

# Execute the similarity search query
cur.execute(similarity_query, (query_embedding.tolist(),))

# Fetch and display the 2 most relevant documents
results = cur.fetchall()
context = results[0][1]
print(context)

exactly did Aberforth Dumbledore blame Albus for his sister’s death?
Was it, as “Batty” pretends, a mere effusion of grief? Or could there have
been some more concrete reason for his fury? Grindelwald, expelled
from Durmstrang for near-fatal attacks upon fellow students, fled the
country hours after the girl’s death, and Albus (out of shame or fear?)
never saw him again, not until forced to do so by the pleas of the
Wizarding world.
Neither Dumbledore nor Grindelwald ever seems to have referred to
this brief boyhood friendship in later life. However, there can be no
doubt that Dumbledore delayed, for some five years of turmoil, fatalities,
and disappearances, his attack upon Gellert Grindelwald. Was it
lingering affection for the man or fear of exposure as his once best friend
that caused Dumbledore to hesitate? Was it only reluctantly that
Dumbledore set out to capture the man he was once so delighted he had
met?
And how did the mysterious Ariana die? Was she the inadvertent


##### **Using text generative model to generate response based on context and query:**

In [18]:
type(context)

str

In [42]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the distilgpt2 tokenizer and model
tokenizer_dgpt = AutoTokenizer.from_pretrained("distilgpt2")
model_dgpt = AutoModelForCausalLM.from_pretrained("distilgpt2")



In [49]:
# Combine context and question
prompt = f"You are a harry potter book query responder. These Human will ask you a questions about anything related to harry potter novels. Use following piece of context to answer the question. If you don't know the answer, just say you don't know. Keep the answer within 2 sentences and concise. \n\nContext: {context} \n\nQuestion: {user_query} \n\nAnswer: "
print(prompt)

You are a harry potter book query responder. These Human will ask you a questions about anything related to harry potter novels. Use following piece of context to answer the question. If you don't know the answer, just say you don't know. Keep the answer within 2 sentences and concise. 

Context: exactly did Aberforth Dumbledore blame Albus for his sister’s death?
Was it, as “Batty” pretends, a mere effusion of grief? Or could there have
been some more concrete reason for his fury? Grindelwald, expelled
from Durmstrang for near-fatal attacks upon fellow students, fled the
country hours after the girl’s death, and Albus (out of shame or fear?)
never saw him again, not until forced to do so by the pleas of the
Wizarding world.
Neither Dumbledore nor Grindelwald ever seems to have referred to
this brief boyhood friendship in later life. However, there can be no
doubt that Dumbledore delayed, for some five years of turmoil, fatalities,
and disappearances, his attack upon Gellert Grindelwal

In [46]:
len(prompt)

1351

In [50]:
# Tokenize the prompt
inputs = tokenizer_dgpt(prompt, return_tensors="pt")

# Generate a response
outputs = model_dgpt.generate(
    inputs.input_ids,
    max_length=inputs.input_ids.shape[1] + 60,  # Limit the response length
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    early_stopping=True
)

# Decode the generated response
response = tokenizer_dgpt.decode(outputs[0], skip_special_tokens=True)

# Extract the answer after "A:"
print('\n\n')
response = response.split("Answer: ")[1].strip()

print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.





The answer is a simple question:
Albus, who was the first to die in the wizarding
world? Did he have any knowledge of Albino‬s fate?  Was he the only one who knew him?  
What was Dumbledore‭s motive for
