In [2]:
# Set up OpenAI API key
# NOTE(review): everything between the triple quotes below is one string
# literal, so executing this cell performs no environment setup at all.
# The text inside the string is not valid Python as written: it reads
# `required_vars` although the list is named `requiredVars`, and it spells
# the `raise` keyword as `Raise`.
# The same check exists as real code in env_setup(), defined in a later cell.
'''
# Load environment variables from .env file
load_dotenv()

# Verify API key is available
requiredVars = ["OPENAI_API_KEY", "PINECONE_API_KEY"]
missingVars = [var for var in required_vars if not os.getenv(var)]

if missingVars:
    
    os.environ["OPENAI_API_KEY"] = "***your-api-key***"
    os.environ["PINECONE_API_KEY"] = "***your-api-key***"
    os.environ["PINECONE_ENVIRONMENT"] = "***your-api-key***"
    

    Raise ValueError(
        f"Missing required env variables: {', '.join(missingVars)}\n"
    )'''

SyntaxError: invalid syntax (2527142179.py, line 17)

In [3]:
# Imports

import os
import time
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_openai import OpenAI, OpenAIEmbeddings, ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone as pn, ServerlessSpec
from uuid import uuid4



In [4]:
def env_setup():
    """
    Load variables via ``load_dotenv()`` and verify the required API keys.

    The keys checked are ``OPENAI_API_KEY`` and ``PINECONE_API_KEY``; a key
    counts as missing when ``os.getenv`` returns nothing or an empty string.

    Raises:
        ValueError: listing every required key that is missing.
    """
    load_dotenv()

    absent = []
    for key in ("OPENAI_API_KEY", "PINECONE_API_KEY"):
        if not os.getenv(key):
            absent.append(key)

    # Everything is present: nothing more to do.
    if not absent:
        return

    raise ValueError(
        "Missing required env variables: " + ", ".join(absent) + "\n"
    )

In [5]:
# 1. Create sample knowledge base
def create_sample_data():
    """Write a five-sentence AI primer to ``ai_knowledge.txt`` in the working directory.

    Any existing file of that name is overwritten. Each sentence sits on its
    own line with a four-space indent, preceded by one empty line.
    """
    facts = (
        "Artificial Intelligence (AI) is the simulation of human intelligence by machines.",
        "Machine Learning is a subset of AI that enables systems to learn from data.",
        "Deep Learning is a type of machine learning based on artificial neural networks.",
        "Natural Language Processing (NLP) is a branch of AI that helps computers understand human language.",
        "Computer Vision is the field of AI that enables computers to understand and process visual information.",
    )
    indent = "    "
    # Leading newline and trailing indent keep the file contents unchanged.
    body = "\n" + "".join(f"{indent}{fact}\n" for fact in facts) + indent

    with open("ai_knowledge.txt", "w") as out_file:
        out_file.write(body)

In [4]:
# 2. Load and process documents
def load_and_process_documents(file_path, chunk_size=500, chunk_overlap=50):
    """Load a text file and split it into chunks.

    Args:
        file_path: Path of the file handed to ``TextLoader``.
        chunk_size: ``chunk_size`` given to the splitter; lengths are
            measured with ``len``. Defaults to 500.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 50.

    Returns:
        The chunks returned by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Load document
    documents = TextLoader(file_path).load()

    # Split text into chunks; sizes are parameters so callers can tune them
    # without editing this function.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(documents)

In [5]:
# 3. Create vector store
def create_vector_store(chunks):
    """Build a FAISS vector store from ``chunks`` using ``OpenAIEmbeddings``.

    Returns:
        Whatever ``FAISS.from_documents`` returns for the given chunks.
    """
    return FAISS.from_documents(chunks, OpenAIEmbeddings())

In [6]:
# 4. Set up RAG chain
def setup_rag_chain(vector_store):
    """Build a ``RetrievalQA`` chain that answers from ``vector_store``.

    The chain is created with the ``gpt-4o-mini`` chat model, the "stuff"
    chain type, the retriever returned by ``vector_store.as_retriever()``,
    and a prompt that tells the model to say it doesn't know rather than
    invent an answer.
    """
    chat_model = ChatOpenAI(model="gpt-4o-mini")

    # The template's whitespace is part of the prompt string, so it is kept
    # exactly as written; {context} and {question} are its two placeholders.
    qa_template = """
    Use the following pieces of context to answer the question at the end. 
    If you don't know the answer based on the context, just say you don't know.
    Don't try to make up an answer.
    
    Context: {context}
    
    Question: {question}
    
    Answer:"""

    qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=qa_template,
    )

    retriever = vector_store.as_retriever()
    return RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": qa_prompt},
    )

In [7]:
# 5. Query function
def query_rag(chain, question):
    """Pass ``question`` to ``chain.invoke`` and return its response unchanged."""
    return chain.invoke(question)

In [8]:
def run_rag():
    """Run the FAISS-backed RAG demo end to end and print each answer.

    Steps: check the environment, write the sample file, chunk it, build the
    vector store, build the chain, then ask three fixed questions and print
    the ``result`` field of every response.
    """
    env_setup()
    create_sample_data()

    knowledge_file = "ai_knowledge.txt"
    store = create_vector_store(load_and_process_documents(knowledge_file))
    qa_chain = setup_rag_chain(store)

    demo_questions = (
        "What is Artificial Intelligence?",
        "How is Machine Learning related to AI?",
        "What is the purpose of NLP?",
    )
    for prompt in demo_questions:
        print(f"\nQuestion: {prompt}")
        answer = query_rag(qa_chain, prompt)
        print(f"Answer: {answer['result']}")
    

In [9]:
# Run the FAISS-backed demo; run_rag() prints each question and its answer.
run_rag()


Question: What is Artificial Intelligence?
Answer: Artificial Intelligence (AI) is the simulation of human intelligence by machines.

Question: How is Machine Learning related to AI?
Answer: Machine Learning is a subset of AI that enables systems to learn from data.

Question: What is the purpose of NLP?
Answer: The purpose of NLP is to help computers understand human language.


In [46]:
from pinecone import Pinecone as PineconeClient
from langchain.vectorstores import Pinecone

def get_create_pinecone_index(index_name="aygo-langchain-index2"):
    """Return a handle to the Pinecone index ``index_name``, creating it if absent.

    The client is built from the ``PINECONE_API_KEY`` environment variable.
    A missing index is created as a serverless index (AWS, us-east-1) with
    cosine metric and dimension 1536, and the call then polls once per second
    until the index reports ready.

    Args:
        index_name: Name of the index to fetch or create.

    Returns:
        The object returned by ``PineconeClient.Index(index_name)``.
    """
    pc = PineconeClient(api_key=os.environ.get("PINECONE_API_KEY"))

    # Honour the caller's index_name (it used to be overwritten by a
    # hard-coded value, so the argument had no effect).
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            # NOTE(review): 1536 presumably matches the output width of the
            # default OpenAIEmbeddings model — confirm if the model changes.
            dimension=1536,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
        # The new index is polled until its status reports ready.
        while not pc.describe_index(index_name).status["ready"]:
            time.sleep(1)

    return pc.Index(index_name)

In [47]:
def create_vector_store_pinecone(chunks, index_name="aygo-langchain-index"):
    """Embed ``chunks``, upsert them into Pinecone and wrap the index as a vector store.

    Each chunk is stored under the id ``doc_<position>`` with its embedding
    and a copy of its metadata extended by a ``"text"`` entry holding the
    chunk text; ``"text"`` is also the ``text_key`` given to the vector store.

    NOTE(review): a later cell defines a function with this same name, so
    whichever definition ran last is the one callers get.

    Args:
        chunks: Documents exposing ``page_content`` and ``metadata``.
        index_name: Pinecone index to create or reuse.

    Returns:
        A ``Pinecone`` vector store bound to the index.
    """
    # Initialize embeddings
    embeddings = OpenAIEmbeddings()

    # Create or get Pinecone index (the name is no longer hard-coded here)
    index = get_create_pinecone_index(index_name)
    print(index.describe_index_stats())

    # Generate embeddings; skip the request entirely when there is no text
    texts = [doc.page_content for doc in chunks]
    embeddings_list = embeddings.embed_documents(texts) if texts else []

    vectors = [
        {
            "id": f"doc_{i}",
            "values": values,
            # Copy the metadata so the caller's documents are left unmodified.
            "metadata": {**doc.metadata, "text": doc.page_content},
        }
        for i, (doc, values) in enumerate(zip(chunks, embeddings_list))
    ]

    # Send vectors in batches rather than one request per vector.
    batch_size = 100
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])

    # Wrap the existing index with the constructor; from_documents needs a
    # documents argument and does not accept an index handle.
    return Pinecone(
        index=index,
        embedding=embeddings,
        text_key="text",  # The key for the text field in metadata
    )

In [48]:
from pinecone import Pinecone as PineconeClient
from langchain.vectorstores import Pinecone

def create_vector_store_pinecone(chunks, index_name="aygo-langchain-index2"):
    """Create a Pinecone vector store from document chunks.

    Makes sure the index ``index_name`` exists, prints its current stats, then
    lets ``Pinecone.from_documents`` embed and store ``chunks`` in it.

    Args:
        chunks: Documents to embed and store.
        index_name: Pinecone index to create or reuse; the same name is used
            for the existence check and for the write.

    Returns:
        The vector store returned by ``Pinecone.from_documents``.
    """
    # Initialize embeddings
    embeddings = OpenAIEmbeddings()

    # Create or get the index under the requested name, so the index that is
    # checked is the one from_documents writes to.
    index = get_create_pinecone_index(index_name)
    print(index.describe_index_stats())

    # No manual embed_documents call here: its result was never used, and
    # from_documents is given the embeddings object to do the embedding.
    return Pinecone.from_documents(
        documents=chunks,
        embedding=embeddings,
        index_name=index_name,
    )

In [49]:
from pinecone import Pinecone, ServerlessSpec
import pinecone
from langchain.vectorstores import Pinecone

# Main execution
def run_rag_pinecone():
    """Run the RAG demo with a Pinecone-backed vector store and print the answers.

    Checks the environment, writes and chunks the sample file, builds the
    store through ``create_vector_store_pinecone``, builds the chain, and
    prints the ``result`` of each of three fixed questions.
    """
    env_setup()
    create_sample_data()

    doc_chunks = load_and_process_documents("ai_knowledge.txt")
    pinecone_store = create_vector_store_pinecone(doc_chunks)
    rag_chain = setup_rag_chain(pinecone_store)

    questions = [
        "What is Artificial Intelligence?",
        "How is Machine Learning related to AI?",
        "What is the purpose of NLP?",
    ]
    idx = 0
    while idx < len(questions):
        current = questions[idx]
        print(f"\nQuestion: {current}")
        reply = query_rag(rag_chain, current)
        print(f"Answer: {reply['result']}")
        idx += 1

In [50]:
# Run the Pinecone-backed demo; it prints the index stats, then each question and answer.
run_rag_pinecone()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1}},
 'total_vector_count': 1}

Question: What is Artificial Intelligence?
Answer: Artificial Intelligence (AI) is the simulation of human intelligence by machines.

Question: How is Machine Learning related to AI?
Answer: Machine Learning is a subset of AI that enables systems to learn from data.

Question: What is the purpose of NLP?
Answer: The purpose of NLP (Natural Language Processing) is to help computers understand human language.


In [21]:
# Main execution
def main():
    """Entry point: run the FAISS-backed RAG demo.

    The pipeline (environment check, sample data, chunking, FAISS store,
    chain, three test questions) is the one implemented by ``run_rag()``.
    Delegating to it avoids keeping a second copy of the same steps in sync.
    """
    run_rag()


if __name__ == "__main__":
    main()

AttributeError: from_documents is not a top-level attribute of the Pinecone class provided by pinecone's official python package developed at https://github.com/pinecone-io/pinecone-python-client. You may have a name collision with an export from another dependency in your project that wraps Pinecone functionality and exports a similarly named class. Please refer to the following knowledge base article for more information: https://docs.pinecone.io/troubleshooting/pinecone-attribute-errors-with-langchain


In [12]:
# Run main() only when this code executes as the top-level module.
if __name__ == "__main__":
    main()


Question: What is Artificial Intelligence?
Answer: Artificial Intelligence (AI) is the simulation of human intelligence by machines.

Question: How is Machine Learning related to AI?
Answer: Machine Learning is a subset of AI that enables systems to learn from data.

Question: What is the purpose of NLP?
Answer: The purpose of Natural Language Processing (NLP) is to help computers understand human language.
