In [11]:
pip show langchain

Name: langchain
Version: 0.3.26
Summary: Building applications with LLMs through composability
Home-page: 
Author: 
Author-email: 
License: MIT
Location: C:\Users\thota\gen_ai\genai\Lib\site-packages
Requires: langchain-core, langchain-text-splitters, langsmith, pydantic, PyYAML, requests, SQLAlchemy
Required-by: langchain-community
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv(),override=True)

def load_doc(file):
    """Load a PDF or DOCX file through the matching LangChain loader.

    Args:
        file: Path to the document. The loader is selected from the file
            extension, compared case-insensitively.

    Returns:
        The result of the selected loader's ``load()`` call, or ``None``
        (after printing a message) when the extension is neither ``.pdf``
        nor ``.docx``.
    """
    import os

    # Normalise the case so that 'REPORT.PDF' is handled like 'report.pdf'.
    extension=os.path.splitext(file)[1].lower()

    # Each loader is imported inside the branch that needs it, so an
    # unsupported file is rejected without importing any loader package.
    # The loaders are taken from langchain_community because the
    # langchain.document_loaders path is a deprecated alias for it.
    if extension=='.pdf':
        from langchain_community.document_loaders import PyPDFLoader
        loader=PyPDFLoader(file)
    elif extension=='.docx':
        from langchain_community.document_loaders import Docx2txtLoader
        loader=Docx2txtLoader(file)
    else:
        print('document format not supported')
        return None

    return loader.load()

def load_from_wiki(query,lang='en',load_max_docs=2):
    """Load Wikipedia content for ``query`` through LangChain's WikipediaLoader.

    Args:
        query: Search text handed to the loader.
        lang: Language code handed to the loader (default ``'en'``).
        load_max_docs: Value handed to the loader's ``load_max_docs`` option
            (default ``2``).

    Returns:
        The result of the loader's ``load()`` call.
    """
    # langchain.document_loaders is a deprecated alias; import the loader
    # from langchain_community directly.
    from langchain_community.document_loaders import WikipediaLoader

    loader=WikipediaLoader(query=query,lang=lang,load_max_docs=load_max_docs)
    return loader.load()

In [2]:
data=load_doc(r"C:\Users\thota\Downloads\WINSEMFRE2024-25_CHY1009_ETH_AP2024255000612_2025-04-19_Reference-Material-I.pdf")
def chunk_data(data,chunk_size=256):
    """Split loaded documents into chunks.

    Splitting is delegated to a RecursiveCharacterTextSplitter configured with
    the given ``chunk_size`` and zero overlap between consecutive chunks.

    Args:
        data: Documents to split, as accepted by ``split_documents``.
        chunk_size: Size limit handed to the splitter (default ``256``).

    Returns:
        The result of ``split_documents(data)``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter=RecursiveCharacterTextSplitter(chunk_overlap=0,chunk_size=chunk_size)
    return splitter.split_documents(data)
chunks=chunk_data(data)

In [6]:
import os
from dotenv import find_dotenv,load_dotenv
load_dotenv(find_dotenv(),override=True)
def insert_or_fetch_embeddings(index_name,chunks):
    """Return a Pinecone-backed vector store for ``index_name``, creating it if missing.

    When an index named ``index_name`` already exists it is opened as-is and
    ``chunks`` is not used at all. Otherwise a serverless index (cosine
    metric, AWS ``us-east-1``) is created and ``chunks`` are embedded and
    uploaded into it.

    Args:
        index_name: Name of the Pinecone index to open or create.
        chunks: Documents to embed; only consulted when the index is created.

    Returns:
        The LangChain ``Pinecone`` vector store bound to ``index_name``.
    """
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import ServerlessSpec

    # One constant feeds both the embedding model and the index definition so
    # the two sizes cannot drift apart.
    embedding_dim=1536

    # NOTE(review): presumably the client picks its API key up from the
    # environment populated by load_dotenv -- confirm.
    pc=pinecone.Pinecone()
    embeddings=OpenAIEmbeddings(model='text-embedding-3-small',dimensions=embedding_dim)

    if index_name in pc.list_indexes().names():
        print(f'index {index_name} already exists!')
        vector_store=Pinecone.from_existing_index(index_name,embeddings)
        print('ok')
    else:
        print(f'creating {index_name} index...')
        pc.create_index(
            name=index_name,
            dimension=embedding_dim,
            metric='cosine',
            spec=ServerlessSpec(
                cloud="aws",
                region='us-east-1'
            )
        )
        vector_store=Pinecone.from_documents(chunks,embeddings,index_name=index_name)
        print('index created')
    return vector_store

def del_pinecone_index(index_name='all'):
    """Delete Pinecone indexes.

    Args:
        index_name: Name of the index to delete. The default ``'all'`` deletes
            every index returned by ``list_indexes().names()``.
    """
    import pinecone

    client=pinecone.Pinecone()

    # Single-index case first; everything below handles the 'all' sentinel.
    if index_name!='all':
        client.delete_index(index_name)
        print(f'deleted {index_name}!')
        return

    for existing_name in client.list_indexes().names():
        client.delete_index(existing_name)
    print('deleted!')
# del_pinecone_index('all')

In [2]:
!pip install -q pinecone==5.4.0

In [7]:
index_name='askadocument'
vector_store=insert_or_fetch_embeddings(index_name,chunks)

index askadocument alredy exists!
ok


In [27]:
print(len(chunks))

158


## Asking questions about the document

In [None]:
def ask_and_get_ans(vector_store,q):
    """Answer question ``q`` with a RetrievalQA chain built over ``vector_store``.

    A ``gpt-4o`` chat model (temperature 1) is combined with a similarity
    retriever that fetches the 5 closest chunks, using the ``'stuff'`` chain
    type.

    Args:
        vector_store: Object exposing ``as_retriever``.
        q: The question to ask.

    Returns:
        The result of ``chain.invoke(q)``; callers in this notebook read its
        ``'result'`` key.
    """
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    chat_model=ChatOpenAI(model='gpt-4o',temperature=1)
    top_k_retriever=vector_store.as_retriever(
        search_type='similarity',
        search_kwargs={'k':5},
    )
    qa_chain=RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type='stuff',
        retriever=top_k_retriever,
    )
    return qa_chain.invoke(q)
answer=ask_and_get_ans(vector_store,'what is whole pdf about?')
print(answer['result'])

In [7]:
import time

# Interactive Q&A loop: keeps asking until the user types 'quit' or 'exit'.
i=1
while True:
    # Strip surrounding whitespace so an answer such as ' exit ' still ends the loop.
    q=input(f'Question {i}: ').strip()
    i+=1
    if q.lower() in ['quit','exit']:
        time.sleep(2)
        break
    ans=ask_and_get_ans(vector_store,q)
    # Double quotes inside the single-quoted f-string: reusing the enclosing
    # quote character is a syntax error on Python versions before 3.12.
    print(f'Answer: {ans["result"]}')
    print('*'*100)

Question 1:  hi


Answer: Hello! How can I assist you today?
****************************************************************************************************


Question 2:  what is in the 10 th page of this doc in short


Answer: I'm sorry, but I don't have access to the content of the 10th page of the document you're referring to.
****************************************************************************************************


Question 3:  summary of this whole content


Answer: The content refers to several related points: 

1. It mentions two problems labeled "Problem 1" and "Problem 2" without further details.
2. It provides an explanation of polymers, stating that the term "polymer" comes from Greek words meaning "many parts" and describes a polymer as a long-chain molecule composed of repeating units.
3. It describes a specific type of polymer, Acrylonitrile-Butadiene-Styrene (ABS), which is a large molecule consisting mainly of polybutadiene chains with styrene-acrylonitrile copolymer branches.
****************************************************************************************************


Question 4:  exit


In [31]:
data1=load_from_wiki('ChatGPT')
chunks=chunk_data(data1)
index_name='chatgpt'
vector_store=insert_or_fetch_embeddings(index_name,chunks)
# del_pinecone_index()

creating chatgpt index...
index created


In [6]:
# Ask the currently active vector store and print only the answer text.
answer=ask_and_get_ans(vector_store,'all important words according to you from this content')
print(answer['result'])
# print(data1)

NameError: name 'ask_and_get_ans' is not defined

## Using Chroma

In [2]:
pip install -q chromadb

Note: you may need to restart the kernel to use updated packages.


In [17]:
def create_embdng_chroma(chunks,persist_directory='./chroma_db'):
    """Embed ``chunks`` with OpenAI and store them in a Chroma vector store.

    Args:
        chunks: Documents to embed, as accepted by ``Chroma.from_documents``.
        persist_directory: Directory handed to Chroma as its
            ``persist_directory`` (default ``'./chroma_db'``).

    Returns:
        The Chroma vector store built from ``chunks``.
    """
    # langchain.vectorstores is a deprecated alias; import Chroma from
    # langchain_community directly.
    from langchain_community.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    embeddings=OpenAIEmbeddings(model='text-embedding-3-small',dimensions=1536)
    vector_store=Chroma.from_documents(chunks,embeddings,persist_directory=persist_directory)
    return vector_store

def load_embeddings_chroma(persist_directory='./chroma_db'):
    """Open a Chroma vector store from ``persist_directory``.

    The same embedding configuration (``text-embedding-3-small``, 1536
    dimensions) is supplied as the store's ``embedding_function``.

    Args:
        persist_directory: Directory handed to Chroma as its
            ``persist_directory`` (default ``'./chroma_db'``).

    Returns:
        The Chroma vector store bound to ``persist_directory``.
    """
    # langchain.vectorstores is a deprecated alias; import Chroma from
    # langchain_community directly.
    from langchain_community.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    embeddings=OpenAIEmbeddings(model='text-embedding-3-small',dimensions=1536)
    vector_store=Chroma(persist_directory=persist_directory,embedding_function=embeddings)
    return vector_store
    

data1=load_doc(r"C:\Users\thota\Downloads\OnePlus_5_User_Manual.pdf")
chunks=chunk_data(data1)
vector_store=create_embdng_chroma(chunks)

In [38]:
q='safety precautions?'
ans=ask_and_get_ans(vector_store,q)
print(ans['result'])

Here are some safety precautions you should follow:

1. Do not switch on the device when prohibited or when it may cause interference or danger.
2. When operating a vehicle, prioritize safety and obey related regulations in the area or country you are driving in.
3. Do not store or carry your mobile phone or accessories near liquid or flammable gas and avoid placing them near safety air bags.
4. Avoid using the phone if the antenna is damaged, as it may cause injury.

Following these guidelines will help prevent injury to your device, property, yourself, or others.


In [5]:
# Reload the persisted Chroma store from disk and send the question to that
# reloaded store, so the cell actually exercises what it just loaded.
db=load_embeddings_chroma()
q='main points?'
ans=ask_and_get_ans(db,q)
print(ans['result'])

  vector_store=Chroma(persist_directory=persist_directory,embedding_function=embeddings)


NameError: name 'ask_and_get_ans' is not defined

## Adding Memory

In [8]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
import os
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv(),override=True)
# Chat model used by the conversational chain; temperature 0 is requested.
llm=ChatOpenAI(model_name='gpt-4-turbo-preview',temperature=0)
# Similarity retriever over whatever `vector_store` currently refers to in the
# kernel; it returns the 5 closest chunks per query.
retriever=vector_store.as_retriever(search_type='similarity',search_kwargs={'k':5})
# Conversation memory stored under the 'chat_history' key as message objects.
memory=ConversationBufferMemory(memory_key='chat_history',return_messages=True)
# Conversational retrieval chain wiring the model, retriever and memory
# together. verbose=True makes the chain print the prompts it formats.
crc=ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type='stuff',
    verbose=True
)


  memory=ConversationBufferMemory(memory_key='chat_history',return_messages=True)


In [9]:
def ask_question(q,chain):
    """Send question ``q`` to ``chain`` and return the chain's response.

    Args:
        q: The question text.
        chain: Object exposing ``invoke``; it receives ``{'question': q}``.

    Returns:
        Whatever ``chain.invoke`` returns.
    """
    payload={'question':q}
    return chain.invoke(payload)

In [10]:
q='How many subtopics are in it?'
result1=ask_question(q,crc)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
How many pendant are in this polymer?
Pendant means side chain

Module 4: Polymer
• The word polymer is 
derived from the classical 
Greek words poly meaning 
“many” and meres 
meaning “parts.” 
• Simply stated, a polymer is 
a long-chain molecule that 
is composed of a large 
number of repeating units

PolydispersityIndex (PDI)
• Which one of the following has high polydispersity
index?
16

General Trends of Tg
Tg
 Strength of intermolecular force (or appropriately interchain force)
Tg
 Degree of crystallinity 
Tg
 Degree of cross-linking 
Tg
Flexibility of a polymer main chain and branches 
1
Tg

It is  a giant molecule predominantly made up of chains of polybutadiene b

In [17]:
q='what is main content in that?'
result=ask_question(q,crc)
print(result['answer'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: How many subtopics are in it?
Assistant: Based on the provided text, there are 34 subtopics listed under "What’s in The Box?" section.
Human: How many subtopics are in it?
Assistant: There is 1 subtopic listed under the "What’s in The Box?" section.
Human: How many subtopics are in it?
Assistant: There is 1 subtopic listed under the "What’s in The Box?" section.
Human: How many subtopics are in it?
Assistant: There is only one subtopic listed under the "What’s in The Box?" section.
Follow Up Input: what is main content in that?
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pie

In [11]:
print(result1['answer'])

Based on the provided context, there are four subtopics within the module on polymers. These are:

1. Definition of a polymer.
2. Polydispersity Index (PDI).
3. General Trends of Tg (Glass Transition Temperature).
4. Description of Acrylonitrile-Butadiene-Styrene (ABS) as a specific example of a polymer.


In [13]:
for item in result1['chat_history']:
    print(item)

content='How many subtopics are in it?' additional_kwargs={} response_metadata={}
content='Based on the provided context, there are four subtopics within the module on polymers. These are:\n\n1. Definition of a polymer.\n2. Polydispersity Index (PDI).\n3. General Trends of Tg (Glass Transition Temperature).\n4. Description of Acrylonitrile-Butadiene-Styrene (ABS) as a specific example of a polymer.' additional_kwargs={} response_metadata={}


In [18]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
import os
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv(),override=True)
# Rebuild the conversational chain, this time with a custom answering prompt.
llm=ChatOpenAI(model_name='gpt-4-turbo-preview',temperature=0)
# Similarity retriever (5 closest chunks) over whatever `vector_store`
# currently refers to in the kernel.
retriever=vector_store.as_retriever(search_type='similarity',search_kwargs={'k':5})
# Fresh memory object, so this chain starts with an empty chat history.
memory=ConversationBufferMemory(memory_key='chat_history',return_messages=True)
# System message template: carries the {context} placeholder.
system_template=r'''
Use the following pieces of context to answer user's question.
---------------------
Context:```{context}```
'''

# Human message template: carries the {question} and {chat_history} placeholders.
user_template='''
Question:```{question}```
Chat History: ```{chat_history}```
'''

# System message first, then the human message.
messages=[
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template(user_template)
]

qa_prompt=ChatPromptTemplate.from_messages(messages)


# Same chain wiring as before, plus the custom prompt passed through
# combine_docs_chain_kwargs.
crc=ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type='stuff',
    combine_docs_chain_kwargs={'prompt':qa_prompt},
    verbose=True
)

# NOTE(review): `db` is assigned here but not referenced again in this cell;
# the chain above uses the retriever built from `vector_store`. Confirm
# whether this call is still needed.
db=load_embeddings_chroma()
q='Main examples of the monomers'
result=ask_question(q,crc)
# Prints the whole result object; the answer text alone is under result['answer'].
print(result)

  vector_store=Chroma(persist_directory=persist_directory,embedding_function=embeddings)




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: 
Use the following pieces of context to answer user's question.
---------------------
Context:```Monomers
Monomers: These are simple molecules, which combine with 
each other to form polymers. Monomers are also called 
“building blocks” of polymers. 
For example, ethylene, methylmethacrylate,  vinyl chloride, 
etc.

• The monomer is the repeat unit of the polymer, and a 
typical polymer may have from hundreds to hundreds 
of thousands of repeat units. 
• Synthetic polymers are created by chemical reactions 
in the laboratory; natural polymers (or biopolymers)

2. In simple addition polymerization, n-monomers are added together to 
form a polymer chain with n-repeat units, n is very large 100s or 1000s
Monomer Polymer
(PMMA) 
Salient Features of Polymerization Reaction:

Conducting polymer structures
Polymers with alternating double bonds can con

In [19]:
print(result['answer'])

The main examples of monomers mentioned are:

- Ethylene
- Methyl methacrylate
- Vinyl chloride
