# FINAL YEAR PROJECT

### IMPORTS

In [1]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts.chat import ChatPromptTemplate
from langchain.schema import BaseOutputParser
from langchain.chains import LLMChain, SequentialChain
from langchain_pinecone import PineconeVectorStore 
from langchain_community.document_loaders import DirectoryLoader 
from langchain_text_splitters import RecursiveCharacterTextSplitter 
from langchain_experimental.sql import SQLDatabaseChain
from langchain.sql_database import SQLDatabase
import os 
import glob
from pathlib import Path
from dotenv import load_dotenv
import os

load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
# deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
# print(deepseek_api_key)


  from .autonotebook import tqdm as notebook_tqdm


### SELECT YOUR MODEL

In [2]:
# Initialize the OpenAI model via LangChain
# llm = ChatOpenAI(
#     model="gpt-4o-mini-2024-07-18",
#     api_key=openai_api_key
# )
llm = ChatOpenAI(
    model="o1",
    api_key=openai_api_key
)

#select embeddings model
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)

#connect to sqlite database
database = SQLDatabase.from_uri("sqlite:///database/tirth.db")

index_name = "mark-3"

# llm = ChatOpenAI(
#     model="deepseek-chat",
#     api_key=deepseek_api_key
# )




### Loading Documents

In [None]:
#load the documents from the directory
loader = DirectoryLoader('source',glob="**/*.pdf")
docs = loader.load()
docs[0]

### Chunking Documents

In [None]:
#chunking the documents into smaller pieces
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1400,chunk_overlap=200)
split_docs = text_splitter.split_documents(docs)
len(split_docs[1].page_content)
len(split_docs)
#upload the documents to pinecone
vectorstore = PineconeVectorStore.from_documents(split_docs, embeddings, index_name=index_name)

48

### Initialize retriver

In [None]:
#initialize the Pinecone vector store
vectorstore = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

## HELPER FUNCTIONS
#### RETRIVER 

In [21]:
#retrieve the documents from the vector store
def get_context(query):
    similar_docs = vectorstore.similarity_search(query, k=3)
    context = [' '.join(doc.page_content.split()) for doc in similar_docs]
    return context


### Query for Retriving Context

In [22]:
user_input = """
How many customers have spent more than 500 ?
"""
# user_input = """
# How many power modes do i have in my laptop?
# """

retrival_template = f"""Rewrite the following query so that it can be used as an input prompt in a retriver of a Vector DB. The new query should:
- DO NOT ANSWER the original query; only rewrite it.

Return ONLY the rewritten query text without additional formatting or explanations.

Original query: 

"""

# Create ChatPromptTemplate for the rewriting chain
retrival_prompt = ChatPromptTemplate.from_messages([
    ("system", retrival_template),
    ("human", "{text}")
])

# Now wrap each prompt into an LLMChain
retrival_chain = LLMChain(
    llm=llm,
    prompt=retrival_prompt,
    output_key="retrival_query"
)

result = retrival_chain.invoke({"text":user_input})
print( "retrival_prompt_Question : ", result["retrival_query"])

retrived_context = get_context(result["retrival_query"])
print("retrived Context : ")
[print(x) for x in retrived_context]


retrival_prompt_Question :  Number of customers who spent more than 500
retrived Context : 
© Copyright Lenovo 2023 29 Any performance data contained herein was determined in a controlled environment. Therefore, the result obtained in other operating environments may vary significantly. Some measurements may have been made on development-level systems and there is no guarantee that these measurements will be the same on generally available systems. Furthermore, some measurements may have been estimated through extrapolation. Actual results may vary. Users of this document should verify the applicable data for their specific environment. This document is copyrighted by Lenovo and is not covered by any open source license, including any Linux agreement(s) which may accompany software included with this product. Lenovo may update this document at any time without notice. For the latest information or any questions or comments, contact or visit the Lenovo Web site: https://support.lenovo.c

[None, None, None]

### Rewrite Query for Final Question

In [23]:

rewrite_template = f"""Rewrite the following query so that it can be used as an input prompt in a RAG system. The new query should:
- Preserve the core intent and meaning of the original query.
- Try to Question the same Questions in different manner 
- Break down the query into steps that lead to an answer.
- Expand and clarify the query to be more specific and informative for retrieving relevant context.
- Avoid introducing new topics or deviating from the original query.
- DO NOT ANSWER the original query; only rewrite it.

Return ONLY the rewritten query text without additional formatting or explanations.

Original query: 

"""

# Create ChatPromptTemplate for the rewriting chain
rewrite_prompt = ChatPromptTemplate.from_messages([
    ("system", rewrite_template),
    ("human", "{text}")
])

# Now wrap each prompt into an LLMChain
rewrite_chain = LLMChain(
    llm=llm,
    prompt=rewrite_prompt,
    output_key="rewritten_query"
)

result = rewrite_chain.invoke({"text":user_input})
final_prompt_question = result["rewritten_query"]
print("final_prompt_question : ",final_prompt_question)

final_prompt_question :  Which dataset contains customer purchase information, and how can we:
1. Calculate each customer’s total spending,  
2. Filter to show only those whose spending exceeds 500,  
3. Count the total number of such customers,  
so we can identify how many customers have spent over 500?


In [24]:
system_prompt = f"""You are a helpful assistant that is an expert at extracting the most useful information from a given text.
Also bring in extra relevant information to the user query from outside the given context only if the given information is not enough.
And if you solve the question by using the extra relevant information that you brought, please do wrap it in <suggested> </suggested> tag.

Context: {retrived_context}
"""


# Create ChatPromptTemplate for the main answering chain
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{text}")
])


answer_chain = LLMChain(
    llm=llm,
    prompt=chat_prompt,
    output_key="answer"
)



### Sqlite3


In [25]:
from langchain_experimental.sql import SQLDatabaseChain

# user_query = user_query.strip() + " In tabular format"
from langchain.agents import create_sql_agent
from langchain.agents.agent_toolkits import SQLDatabaseToolkit

db_chain = SQLDatabaseChain.from_llm(llm, database, verbose=True)


toolkit = SQLDatabaseToolkit(db=database,llm=llm)

sqlagent = create_sql_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=True,
)


In [26]:
vector_result = answer_chain.invoke({"text":final_prompt_question})
vector_result=vector_result["answer"]
try:
    sql_result = sqlagent.run(user_input)
except Exception as e:
    sql_result = "No result found in SQL database"

final_input=f"""
Information from Vector data base : {vector_result}
Information from SQL data base : {sql_result}
"""

final_process_template = f"""
You are a helpful assistant that is an expert at extracting the most useful information from a given text.
You shall be given information from two databases , structured databases and unstructured databases.
You have to combine the information from both databases and give the final answer to the user query.
The final answer should be concise and relevant to the user query.
- DO not make up the answers or add any information that is not present in the given context.
- Answer in brief and clear manner.
"""

print(final_input)

# Create ChatPromptTemplate for the rewriting chain
final_process_prompt = ChatPromptTemplate.from_messages([
    ("system", final_process_template),
    ("human", "{text}")
])

# Now wrap each prompt into an LLMChain
final_process_chain = LLMChain(
    llm=llm,
    prompt=final_process_prompt,
    output_key="final_answer"
)

result = final_process_chain.invoke({"text":final_input})
final_prompt_question = result["final_answer"]
print("final_prompt_question : ",final_prompt_question)




[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3mQuestion: How many customers have spent more than 500 ?

Thought: I should find out which tables exist in the database first.

Action: sql_db_list_tables
Action Input:  (empty string)[0m[38;5;200m[1;3mcategories, customers, employees, order_items, orders, payments, product_categories, products, shipping[0m[32;1m[1;3mQuestion: How many customers have spent more than 500 ?

Thought: Let's check the schema of the "payments" table to confirm how we can sum their spending.

Action: sql_db_schema
Action Input: payments[0m[33;1m[1;3m
CREATE TABLE payments (
	payment_id INTEGER, 
	order_id INTEGER NOT NULL, 
	amount REAL NOT NULL, 
	payment_date TEXT NOT NULL, 
	payment_method TEXT, 
	PRIMARY KEY (payment_id), 
	FOREIGN KEY(order_id) REFERENCES orders (order_id) ON DELETE CASCADE
)

/*
3 rows from payments table:
payment_id	order_id	amount	payment_date	payment_method
1	1	1240.0	2025-04-01	Credit Card
2	2	845.0	2025-04-0

# quick fix

### history of chat 
### get reading from the reviews.
### suggests improvements from the reviews 



### turn it into a AGENT

2.3
3.2
5.1
5.1
5.1
