In [177]:
from langchain_ollama.llms import OllamaLLM
from langchain_ollama import OllamaEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
import os

In [178]:
# One Ollama model tag drives both text generation and embedding.
OLLAMA_MODEL_NAME = "deepseek-r1:8b"

model = OllamaLLM(model=OLLAMA_MODEL_NAME)
embeddings = OllamaEmbeddings(model=OLLAMA_MODEL_NAME)

In [179]:
def generate_page_doc_pair(loaded_document):
    """Map every loaded page to its text, keyed by ``<source>_<page>``.

    Parameters
    ----------
    loaded_document : list
        Page objects exposing ``metadata`` (a mapping with ``'source'`` and
        ``'page'`` entries) and ``page_content``.

    Returns
    -------
    dict
        ``{"<source>_<page>": page_content}`` in page order. When two pages
        produce the same key, the later page's text is kept.
    """
    page_text_by_id = {}
    for page in loaded_document:
        meta = page.metadata
        page_text_by_id[f"{meta['source']}_{meta['page']}"] = page.page_content
    return page_text_by_id

def generate_pageidnum_tokenstr_pair(pageid_doc_d, chunksize = 100, chunk_overlap = 10):
    """Split each page string into chunks keyed by ``<page id>_<chunk index>``.

    Parameters
    ----------
    pageid_doc_d : dict
        Mapping of page id to the full text of that page.
    chunksize : int, default 100
        Passed to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
    chunk_overlap : int, default 10
        Passed to ``RecursiveCharacterTextSplitter`` as ``chunk_overlap``.

    Returns
    -------
    dict
        ``{"<page id>_<chunk index>": chunk_text}``; the chunk index restarts
        at 0 for every page.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunksize,
        chunk_overlap=chunk_overlap,
    )
    chunk_by_id = {}
    for page_id, page_text in pageid_doc_d.items():
        for chunk_idx, chunk_text in enumerate(splitter.split_text(page_text)):
            chunk_by_id[f'{page_id}_{chunk_idx}'] = chunk_text
    return chunk_by_id

def generate_pageidnum_doc_lst(pageid_doctokenstr_d):
    """Wrap every text chunk in a ``Document`` carrying its uid and metadata.

    Each uid has the form ``<filepath>_<page_num>_<token_num>``. The uid is
    used as the document ``id`` and is also broken out into the ``filepath``,
    ``page_num`` and ``token_num`` metadata fields for ease of retrieval.

    Parameters
    ----------
    pageid_doctokenstr_d : dict
        Mapping of uid to chunk text.

    Returns
    -------
    list
        One ``Document`` per entry, in the mapping's iteration order. The
        metadata values are the string pieces of the uid.
    """
    metadata_fields = ['filepath', 'page_num', 'token_num']
    return [
        Document(
            page_content = txt,
            # Split from the right, at most twice: the file path itself may
            # contain underscores (e.g. "tesla_esg_2023.pdf"), and only the
            # last two fields are guaranteed to be the page and chunk numbers.
            metadata = dict(zip(metadata_fields, uid.rsplit('_', 2))),
            id = uid,
        )
        for uid, txt in pageid_doctokenstr_d.items()
    ]

def obtain_tokenid_from_doc(token_lst):
    """Return the ``id`` attribute of every document in ``token_lst``, in order."""
    doc_ids = []
    for doc in token_lst:
        doc_ids.append(doc.id)
    return doc_ids

def add_doc_to_vector_db(vector_store, token_id, token_lst):
    """Add ``token_lst`` to ``vector_store`` under the ids in ``token_id``.

    Parameters
    ----------
    vector_store
        Object exposing ``add_documents(documents=..., ids=...)``.
    token_id : list
        Ids to store the documents under, parallel to ``token_lst``.
    token_lst : list
        Documents to add.

    Returns
    -------
    None
        The return value of ``add_documents`` is discarded.
    """
    add_kwargs = {"documents": token_lst, "ids": token_id}
    vector_store.add_documents(**add_kwargs)

def generate_llm_response(question, model, template, retriever):
    """Answer ``question`` with a retrieve -> prompt -> model -> string chain.

    Parameters
    ----------
    question : str
        The question; it is the single input the chain is invoked with.
    model
        Runnable language model placed after the prompt.
    template : str
        Prompt template text; it should reference ``{context}`` and
        ``{question}``, the two keys supplied to it.
    retriever
        Runnable that produces the ``context`` value from the question.

    Returns
    -------
    The result of invoking the chain, after ``StrOutputParser``.
    """
    # Both entries receive the chain input: the retriever turns it into
    # context, the passthrough forwards it as the question.
    chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
    prompt = ChatPromptTemplate.from_template(template)
    output_parser = StrOutputParser()

    chain = chain_inputs | prompt | model | output_parser
    return chain.invoke(question)

In [180]:
# Persistent Chroma collection that holds the embedded report chunks.
chroma_settings = dict(
    collection_name = "esg_report_collection",
    embedding_function = embeddings,
    persist_directory = "./chroma")
vector_store = Chroma(**chroma_settings)

# If the persistent directory already holds documents from a previous run,
# start over with a clean collection. The check used to be inverted: it
# dropped the collection only when it was already empty.
if vector_store.get()['ids']:
    vector_store.delete_collection()
    # delete_collection() removes the underlying collection, so build a fresh
    # store before adding documents again.
    vector_store = Chroma(**chroma_settings)

pdf_loc = 'data/'
# sorted() keeps the ingestion order stable across runs and platforms.
for file in sorted(os.listdir(pdf_loc)):
    # The folder may contain non-PDF entries (hidden files, sub-folders);
    # only hand real PDF files to the PDF loader.
    if not file.lower().endswith('.pdf'):
        continue
    loader = PyPDFLoader(file_path = os.path.join(pdf_loc, file))
    docs = loader.load()
    pageid_docstring_dict = generate_page_doc_pair(docs)
    pageid_doctokenstr_dict = generate_pageidnum_tokenstr_pair(pageid_docstring_dict, 1000, 100)
    token_lst = generate_pageidnum_doc_lst(pageid_doctokenstr_dict)
    if not token_lst:
        # Nothing extractable (e.g. an image-only PDF): skip rather than send
        # an empty batch to the vector store.
        continue
    token_id = obtain_tokenid_from_doc(token_lst)
    add_doc_to_vector_db(vector_store, token_id, token_lst)

In [None]:
# MMR retrieval: consider the 10 closest chunks, keep 5 of them.
retriever = vector_store.as_retriever(search_type = "mmr",
                                      search_kwargs = {"fetch_k": 10, "k": 5})

# Prompt template; {question} and {context} are filled in by the chain, and
# the doubled braces render as literal braces in the JSON skeleton.
# The skeleton separates its fields with commas so that the format shown to
# the model is valid JSON.
template = '''
You are an AI assistant that specialises in extracting ESG information from ESG reports. 
You have to use information strictly from the vector store that I provided you with
in the form of a retriever.
Question:
{question}

Context provided by company's ESG report:
{context}

Please respond in the following JSON format:
{{
    "Name of ESG metric": <name of the ESG metrics that you found here>,
    "Description of ESG metric or exact ESG metric value": <your metric here>,
    "metadata": <metadata of the document retrieved from the vector store (filename, page num etc.)>
}}
'''

response = generate_llm_response('what is the reduction in water consumption for tesla in 2023?',
                      model,
                      template,
                      retriever)
# Last expression of the cell, so the notebook displays the answer.
response