In [1]:
import pandas as pd
import numpy as np  # was `import numpy as pd`, which rebound `pd` and hid pandas
import os

from pathlib import Path

In [2]:
# List the PDF files located directly in the working directory (non-recursive glob).
print(list(Path(".").glob("*.pdf")))

[PosixPath('Empirical Model for ratio of Conductivity.pdf'), PosixPath('nature12952.pdf'), PosixPath('p7.pdf'), PosixPath('nature05180.pdf'), PosixPath('p6.pdf'), PosixPath('p4.pdf'), PosixPath('nature09211.pdf'), PosixPath('effective-medium-network.pdf'), PosixPath('p2.pdf'), PosixPath('p1.pdf'), PosixPath('p5.pdf'), PosixPath('Thesis.pdf'), PosixPath('nmat3687.pdf'), PosixPath('p8.pdf'), PosixPath('nature17151.pdf'), PosixPath('Gabbett et al sub to nat mat.pdf'), PosixPath('1806-1117-rbef-39-01-e1303.pdf'), PosixPath('nature13831.pdf'), PosixPath('shardul_mukim_thesis.pdf'), PosixPath('p3.pdf')]


In [3]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [5]:
from transformers import AutoTokenizer
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import ChatOllama

In [6]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings

In [7]:
import json 
import pickle
from langchain_chroma import Chroma
from langchain_community.vectorstores import FAISS

In [8]:
import sqlite3

In [9]:
def ingest_pdf():
    """Chunk every PDF below the working directory and index the chunks in Chroma.

    Steps:
      1. Recursively collect ``*.pdf`` files under the current directory.
      2. Load each file with ``PyPDFLoader`` and split it with a
         ``SemanticChunker`` (percentile breakpoint, threshold 95) driven by the
         ``all-MiniLM-L6-v2`` embedding model.
      3. Pickle the ``{file path: chunks}`` mapping to ``chunks_data.pkl``.
      4. Write all chunks to the ``rag_collection`` collection persisted in
         ``chroma_db``.

    Every chunk gets a deterministic id (``"<file path>::<index in file>"``).
    Without explicit ids each run stored the same chunks again under fresh
    random ids, so retrieval returned the same passage several times.
    NOTE(review): a ``chroma_db`` directory filled by earlier runs still holds
    those randomly-keyed duplicates; delete it once before re-ingesting.

    Returns:
        list: all chunks, in the order the files were processed. Empty when no
        PDF produced any chunk, in which case the vector store is left untouched.
    """
    chroma_path = "chroma_db"
    collection_name = "rag_collection"

    pdf_list = list(Path(".").rglob("*.pdf"))
    print(pdf_list)

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    splitter = SemanticChunker(
        embeddings,
        breakpoint_threshold_type="percentile",
        breakpoint_threshold_amount=95,
    )

    chunks = []       # flat list of every chunk, returned to the caller
    chunk_ids = []    # ids aligned one-to-one with `chunks`
    chunks_dir = {}   # file path -> chunks of that file, pickled below

    for file in pdf_list:
        print(f"{file} processing!!")
        pages = PyPDFLoader(file).load()
        bits_an_p = splitter.split_documents(pages)
        chunks_dir[str(file)] = bits_an_p
        chunks.extend(bits_an_p)
        # The file path keeps ids unique across files, the index within a file.
        chunk_ids.extend(f"{file}::{idx}" for idx in range(len(bits_an_p)))

    with open("chunks_data.pkl", "wb") as f:
        pickle.dump(chunks_dir, f)

    print("Pickle file saved!!")

    if not chunks:
        # Nothing to embed; do not hand an empty batch to the vector store.
        print("No chunks produced; vector database not updated.")
        return chunks

    Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        ids=chunk_ids,
        persist_directory=chroma_path,
        collection_name=collection_name,
    )

    print("Vector database created!!")

    return chunks


# Run the ingestion and keep the flat list of chunks that ingest_pdf() returns.
chunks_2 = ingest_pdf()

[PosixPath('Empirical Model for ratio of Conductivity.pdf'), PosixPath('nature12952.pdf'), PosixPath('p7.pdf'), PosixPath('nature05180.pdf'), PosixPath('p6.pdf'), PosixPath('p4.pdf'), PosixPath('nature09211.pdf'), PosixPath('effective-medium-network.pdf'), PosixPath('p2.pdf'), PosixPath('p1.pdf'), PosixPath('p5.pdf'), PosixPath('Thesis.pdf'), PosixPath('nmat3687.pdf'), PosixPath('p8.pdf'), PosixPath('nature17151.pdf'), PosixPath('Gabbett et al sub to nat mat.pdf'), PosixPath('1806-1117-rbef-39-01-e1303.pdf'), PosixPath('nature13831.pdf'), PosixPath('shardul_mukim_thesis.pdf'), PosixPath('p3.pdf')]
Empirical Model for ratio of Conductivity.pdf processing!!
nature12952.pdf processing!!
p7.pdf processing!!
nature05180.pdf processing!!
p6.pdf processing!!
p4.pdf processing!!
nature09211.pdf processing!!
effective-medium-network.pdf processing!!
p2.pdf processing!!
p1.pdf processing!!
p5.pdf processing!!
Thesis.pdf processing!!
nmat3687.pdf processing!!
p8.pdf processing!!
nature17151.pdf p

In [10]:
def load_vector_database():
    """Open the persisted Chroma collection.

    Builds a ``HuggingFaceEmbeddings`` instance for ``all-MiniLM-L6-v2`` and
    passes it, together with the ``chroma_db`` directory and the
    ``rag_collection`` collection name, to ``Chroma``.

    Returns:
        The ``Chroma`` object constructed from those settings.
    """
    return Chroma(
        collection_name="rag_collection",
        persist_directory="chroma_db",
        embedding_function=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
    )

In [11]:
# Request offline mode BEFORE the embedding model is loaded. Previously these
# were set after load_vector_database(), too late to affect that load.
# NOTE(review): huggingface_hub / transformers may read these variables at
# import time; if so they must be set before the first import of those
# packages to be fully effective - confirm.
os.environ['TRANSFORMERS_OFFLINE'] = '1'
os.environ['HF_HUB_OFFLINE'] = '1'

db = load_vector_database()

query = "what is the inverse problem?"
result = db.similarity_search(query, k = 10)

# Show the top hit; guard against an empty collection instead of raising IndexError.
print(result[0].page_content if result else "No matching chunks found.")

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 899c2777-d65b-4def-8e20-4c23ca9a12d4)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


8 introduction
1.4 layout of thesis
The work presented in this thesis is focused on defining the inverse problem which
revolves around the questions posed in the earlier section. This work comprises 3
major projects constituting 5 chapters.


In [12]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [13]:
# Prompt text with two placeholders, {context} and {question}, filled in when the prompt is formatted.
template = """

You are a technical assistant. Use provided context to answer questions correctly. 

If you do not know, say I dont know. 

context : {context},

question : {question}


answer: 
"""

In [14]:
# Wrap the raw template string in a ChatPromptTemplate so it can be used as a chain step.
prompt_template = ChatPromptTemplate.from_template(template)

In [15]:
# Retriever over the Chroma store, configured with k=5.
retriver = db.as_retriever(search_kwargs = {"k" :5})
# Ollama chat model; temperature is set to 0.
llm = ChatOllama(model="qwen2.5:7b", temperature=0)

In [16]:
def _format_docs(docs):
    """Join the text of the retrieved documents into a single context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever returns a list of Document objects. Feeding that list straight
# into the prompt inserts its repr (metadata included) as the context, so the
# documents are reduced to their text before the prompt is filled.
rag_chain = (
    {"context" : retriver | _format_docs, "question" : RunnablePassthrough()}
    | prompt_template
    | llm
    | StrOutputParser()
)

In [17]:
# Smoke-test the chain with one question (variable name was misspelled `responmse`).
response = rag_chain.invoke("what is the misfit function?")

In [18]:
query = "what are Graphene-based nanostructures?"

# Fetch the five nearest chunks as (document, score) pairs.
docs = db.similarity_search_with_score(query, k= 5)

# Rebuild a prompt by hand from those hits for inspection. This comes from a
# separate search call; rag_chain.invoke() below performs its own retrieval.
formatted_prompt = prompt_template.format(
    question=query,
    context="\n\n".join([hit[0].page_content for hit in docs]),
)

print("===== FINAL PROMPT SENT TO LLM =====")
print(formatted_prompt)

# Run the full chain on the same question and show the answer.
response = rag_chain.invoke(query)
print("\n\n\n\n\n\n\n\n\nAnswer:\n", response)


===== FINAL PROMPT SENT TO LLM =====
Human: 

You are a technical assistant. Use provided context to answer questions correctly. 

If you do not know, say I dont know. 

context : and manipulate quantum information [17, 18]. The valley degree of freedom can
also be used to create new types of electronic and optoelectronic devices [16], such
as valley transistors and valley-based sensors [19]. To exploit these exotic flavours
of spin and valley in devices, it is necessary to actively control their electronic
and magnetic properties at the nanoscale. 1.2.1 Graphene and 2D materials
Recent years have seen significant progress in the study of graphene-related
nanoscale materials, which are anticipated to lead to the development of various
future technologies [20, 21]. Graphene is a single layer of carbon atoms arranged
in a hexagonal lattice (cf. Fig.

and manipulate quantum information [17, 18]. The valley degree of freedom can
also be used to create new types of electronic and optoelectr

In [18]:
# Reload the pickled mapping from chunks_data.pkl.
# pickle.load can execute arbitrary code: only open files this notebook wrote itself.
chunks_directory = {}
with Path("chunks_data.pkl").open("rb") as f:
    chunks_directory = pickle.load(f)


In [19]:
# Display the keys of the reloaded mapping.
chunks_directory.keys()

dict_keys(['Empirical Model for ratio of Conductivity.pdf', 'nature12952.pdf', 'p7.pdf', 'nature05180.pdf', 'p6.pdf', 'p4.pdf', 'nature09211.pdf', 'effective-medium-network.pdf', 'p2.pdf', 'p1.pdf', 'p5.pdf', 'Thesis.pdf', 'nmat3687.pdf', 'p8.pdf', 'nature17151.pdf', 'Gabbett et al sub to nat mat.pdf', '1806-1117-rbef-39-01-e1303.pdf', 'nature13831.pdf', 'shardul_mukim_thesis.pdf', 'p3.pdf'])