In [94]:
import os
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter 
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_docling import DoclingLoader
from langchain_chroma import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain, create_history_aware_retriever, ConversationalRetrievalChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [90]:
# Notebook-wide configuration.
# db_name: directory used as the Chroma persistence location.
# MODEL:   Hugging Face model id loaded for both the tokenizer and the causal LM.
db_name = "vector_db"
MODEL = "meta-llama/Llama-3.2-1B"

In [None]:
# Source documents to ingest; paths are relative to the notebook's working directory.
FILE_PATH = [
    "docs/cover_letter.docx",
    "docs/MelihOzguvenc_Resume.docx",
]

# Hand every listed file to Docling and materialise the resulting documents.
loader = DoclingLoader(file_path=FILE_PATH)
docs = loader.load()

In [69]:
# Token-based splitting: chunks of up to 500 tokens with a 50-token overlap
# between consecutive chunks.
splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=50)

# Split the loaded documents, then pass the chunks through
# filter_complex_metadata before they are stored in the vector store.
chunks = filter_complex_metadata(splitter.split_documents(docs))

In [None]:
# No model name is passed, so the library's default embedding model is used.
embeddings = HuggingFaceEmbeddings()

# Start from a clean slate on re-runs: when a persisted store already exists
# on disk, open it and delete its collection before rebuilding.
if os.path.exists(db_name):
    stale_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    stale_store.delete_collection()

# Embed the chunks and persist them under db_name.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

In [None]:
# Load the tokenizer and causal LM identified by MODEL, wrap them in a
# transformers text-generation pipeline, and expose that pipeline to LangChain
# as `llm`.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL)
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # Cap on the number of tokens generated per call.
    max_new_tokens=100,
    # Return only the newly generated continuation. With the pipeline default
    # (return_full_text=True) the input prompt is echoed back in front of the
    # generation, so a retrieval chain's answer would start with the entire
    # stuffed context and question instead of just the model's reply.
    return_full_text=False,
)

llm = HuggingFacePipeline(pipeline=pipe)

In [95]:
# Chat history buffer; stored under the "chat_history" key and returned as
# message objects rather than a single string.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)

# Retriever over the vector store, using the store's default search settings.
retriever = vectorstore.as_retriever()

# Combine the LLM, the retriever and the history buffer into one chain.
# NOTE(review): ConversationBufferMemory and ConversationalRetrievalChain appear
# to be deprecated in recent LangChain releases — confirm against the installed
# version before relying on them long-term.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [None]:
# Question to ask; the chain is invoked with it under the "question" key.
user_prompt = "who is melih"

# Left as the cell's last expression so the chain's result is displayed.
conversation_chain.invoke(
    {"question": user_prompt}
)