In [43]:
# pip install langchain_community
# conda create -n faiss_env python=3.12

# Import necessary libraries
from langchain_core.output_parsers import StrOutputParser
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv


In [None]:

# Load variables from a local .env file into the process environment so the
# Google API key never has to be written into the notebook itself.
load_dotenv()

# Previously this was an empty string literal, which made load_dotenv()
# pointless. Fall back to "" so behaviour is unchanged when the variable
# is not set.
api_key = os.getenv("GOOGLE_API_KEY", "")

# Chat model used as the answer-generation step of the RAG chain further down.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.3, api_key=api_key)

# Smoke test: a single round-trip to confirm that model and credentials work.
llm.invoke("what is langchain")

In [46]:
# Load a single PDF file
def load_documents(file_path):
    """Read the PDF at ``file_path`` and return the result of ``PyPDFLoader(...).load()``."""
    return PyPDFLoader(file_path).load()

# Resolve the PDF relative to the current user's home directory instead of
# hard-coding one machine's absolute path.
pdf_path = os.path.expanduser('~/Downloads/Netzneutralitaet.pdf')

documents = load_documents(pdf_path)

In [None]:
# Von einem Ordner
import os
from langchain.document_loaders import PyPDFLoader  # Falls PyPDFLoader nur über LangChain verfügbar ist

def load_documents_from_folder(folder_path):
    """Load every PDF located directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A flat list with the documents of all PDFs that could be loaded,
        in sorted file-name order. Files that fail to load are reported on
        stdout and skipped.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
    """
    # Match the extension case-insensitively (".PDF" is common) and sort so the
    # document order does not depend on the arbitrary order of os.listdir().
    pdf_paths = [
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.lower().endswith('.pdf')
    ]

    documents = []
    for file_path in pdf_paths:
        # Best effort: one unreadable PDF must not abort the whole batch.
        try:
            documents.extend(PyPDFLoader(file_path).load())
        except Exception as e:
            print(f"Fehler beim Laden von {file_path}: {e}")

    # Finally list every PDF path that was discovered (including failed ones).
    print("Gefundene PDF-Dateien:")
    for path in pdf_paths:
        print(path)

    return documents

# Example folder path, resolved against the current user's home directory
# rather than a machine-specific absolute path.
folder_path = os.path.expanduser('~/Dropbox/Text/')

documents = load_documents_from_folder(folder_path)


In [None]:
# Split documents into smaller chunks
def split_documents(documents, chunk_size=200, chunk_overlap=100):
    """Split ``documents`` into overlapping chunks and report how many were produced.

    Args:
        documents: Documents to split, passed to the splitter's
            ``split_documents`` method.
        chunk_size: Value forwarded as ``chunk_size`` to
            ``RecursiveCharacterTextSplitter`` (defaults to the previously
            hard-coded 200).
        chunk_overlap: Value forwarded as ``chunk_overlap`` (defaults to the
            previously hard-coded 100).

    Returns:
        The list of chunks returned by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_docs = text_splitter.split_documents(documents)
    # Quick sanity output: number of chunks produced.
    print(len(split_docs))
    return split_docs

# Chunk the loaded documents; the resulting list is what gets embedded below.
split_docs = split_documents(documents)

In [50]:
#pip install faiss-cpu / faiss-gpu
# SECURITY: a real API key used to be hard-coded on this line. It has been
# exposed in the notebook source and should be revoked. Read the key from the
# environment (populated from .env by the earlier load_dotenv() call) instead.
api_key = os.getenv("GOOGLE_API_KEY", "")
from langchain.vectorstores import FAISS

def get_vector_store(text_chunks, index_path="faiss_index"):
    """Embed ``text_chunks``, build a FAISS index from them and save it locally.

    Args:
        text_chunks: Document chunks to index (objects exposing
            ``page_content`` and ``metadata``, as produced by the text
            splitter).
        index_path: Target passed to ``save_local``; defaults to the
            previously hard-coded ``"faiss_index"``.

    Returns:
        The in-memory vector store that was just built (earlier versions
        returned ``None``; callers that ignore the result are unaffected).

    Raises:
        ValueError: If ``text_chunks`` is empty, since there is nothing to index.
    """
    chunks = list(text_chunks)
    if not chunks:
        raise ValueError("text_chunks is empty; nothing to index")

    # Uses the module-level ``api_key``.
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)

    # from_documents keeps each chunk's metadata alongside its text, which
    # from_texts(page_content only) silently dropped.
    vector_store = FAISS.from_documents(chunks, embedding=embeddings)
    vector_store.save_local(index_path)
    return vector_store

# Embed the chunks and save the resulting FAISS index locally as "faiss_index".
get_vector_store(split_docs)

In [None]:
def retreive_context(user_question, k=4, index_path="faiss_index"):
    """Return the stored chunks most similar to ``user_question``.

    Args:
        user_question: Query text to search the index with.
        k: Number of documents to retrieve, forwarded to
            ``similarity_search`` (4 is that method's documented default, so
            existing calls behave as before).
        index_path: Location of the saved FAISS index; defaults to the
            previously hard-coded ``"faiss_index"``.

    Returns:
        The list of documents returned by ``similarity_search``.
    """
    # Uses the module-level ``api_key``; the embedding model must match the
    # one the index was built with.
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)

    # NOTE(security): allow_dangerous_deserialization=True permits unpickling
    # the saved index. Only load index files this notebook created itself;
    # never point index_path at an untrusted file.
    new_db = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)

    docs = new_db.similarity_search(user_question, k=k)
    print(docs)
    return docs

# Quick manual check of retrieval against the saved index.
retreive_context("Which topics are covered in the book")

In [53]:
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate

# Chat prompt for the RAG chain: a fixed system role plus a human turn with
# {question} and {context} placeholders that are filled in at invoke time.
# NOTE: the trailing backslashes are line continuations inside the string
# literal, so the leading spaces of the continued lines are part of the
# prompt text that is sent to the model.
messages = [
    ("system", "You are an assistant for question-answering tasks."),
    ("human", "Use the following pieces of retrieved context to answer the question. \n\
    If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise. \n\
    Question: {question} \n\
    Context: {context} \n \
    Answer:")
]

# Turn the (role, template) tuples into a reusable chat prompt template.
prompt = ChatPromptTemplate.from_messages(messages)



def format_docs(docs):
    """Merge the text of all retrieved documents into one context string.

    The previous implementation returned only ``docs[0].page_content``, which
    threw away every other retrieved document and raised ``IndexError`` when
    retrieval returned nothing.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The page contents joined by blank lines; ``""`` for an empty input.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Input stage: the incoming question is (a) sent through retrieval and
# formatting to become "context" and (b) passed through unchanged as "question".
retrieval_inputs = {
    "context": RunnableLambda(lambda query: retreive_context(query)) | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build the inputs, fill the prompt, call the model, and parse
# the model output with StrOutputParser.
rag_chain = retrieval_inputs | prompt | llm | StrOutputParser()



In [None]:
# Run the full chain on a sample question ("Who is Bill Mollison").
rag_chain.invoke("Wer ist Bill Mollison")