In [1]:
# !pip install langchain-chroma

In [2]:
import os

os.environ["HF_HOME"] = "/space/hotel/phit/personal/experiments/weights"
os.environ["TORCH_HOME"] = "/space/hotel/phit/personal/experiments/weights"

from typing import List, Optional, Union

from langchain.callbacks import FileCallbackHandler
from langchain.retrievers import ContextualCompressionRetriever, ParentDocumentRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader, JSONLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS , Chroma
# from langchain_chroma import Chroma
from langchain_core.documents import Document
from loguru import logger
from rich import print
from sentence_transformers import CrossEncoder

from unstructured.cleaners.core import clean_extra_whitespace, group_broken_paragraphs

logfile = "log/output.log"
logger.add(logfile, colorize=True, enqueue=True)
handler = FileCallbackHandler(logfile)


persist_directory = None




class RAGException(Exception):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)


def rerank_docs(reranker_model, query, retrieved_docs):
    query_and_docs = [(query, r.page_content) for r in retrieved_docs]
    scores = reranker_model.predict(query_and_docs)
    return sorted(list(zip(retrieved_docs, scores)), key=lambda x: x[1], reverse=True)


def load_pdf(
    files: Union[str, List[str]] = "../data/cv/Bui Tien Phat resume (1).pdf"
) -> List[Document]:
    if isinstance(files, str):
        loader = UnstructuredFileLoader(
            files,
            post_processors=[clean_extra_whitespace, group_broken_paragraphs],
        )
        return loader.load()

    loaders = [
        UnstructuredFileLoader(
            file,
            post_processors=[clean_extra_whitespace, group_broken_paragraphs],
        )
        for file in files
    ]
    docs = []
    for loader in loaders:
        docs.extend(
            loader.load(),
        )
    return docs

VIETNAMWORKS = ['_id', 'url', 'job_name', 'company_name', 'salary', 'end_date',
       'address', 'posted_date', 'job_function', 'job_industry', 'job_level',
       'skill', 'preferred_language', 'job_description'] # job_requirements

TOPCV = ['_id', 'urls', 'job_name', 'company_name', 'address', 'salary',
       'remaining', 'job_description', 'benefits',
       'application_method', 'level', 'experience', 'number_of_recruitment',
       'work_form', 'gender', 'working_time'] # job_requirements

# Define the metadata extraction function.
def metadata_func(record: dict, metadata: dict) -> dict:
    
    # metadata["urls"] = record.get("urls")
    # metadata["job_name"] = record.get("job_name")
    # metadata["company_name"] = record.get("company_name")
    # metadata["address"] = record.get("address")
    # metadata["salary"] = record.get("salary")
    # metadata["remaining"] = record.get("remaining")
    # # metadata[""] = record.get("Mô tả công việc")
    # metadata["Yêu cầu ứng viên"] = record.get("Yêu cầu ứng viên")
    # metadata["Quyền lợi"] = record.get("Quyền lợi")
    # metadata["Địa điểm làm việc"] = record.get("Địa điểm làm việc")
    # metadata["Cách thức ứng tuyển"] = record.get("Cách thức ứng tuyển")
    # metadata["Cấp bậc"] = record.get("Cấp bậc")
    # metadata["Kinh nghiệm"] = record.get("Kinh nghiệm")
    # metadata["Số lượng tuyển"] = record.get("Số lượng tuyển")
    # metadata["Hình thức làm việc"] = record.get("Hình thức làm việc")
    # metadata["Giới tính"] = record.get("Giới tính")
    
    for key in VIETNAMWORKS:
        metadata[key] = record.get(key)
    
    metadata = {key: f'{" ".join(val[0]) if isinstance(val, list) else val}' for key, val in metadata.items()}

    

    return metadata


def load_jsonl(
    files: Union[str, List[str]] = "../data/crawl/train_test.jsonl"
) -> List[Document]:
    if isinstance(files, str):
        loader = JSONLoader(
            files,
            json_lines=True,
            jq_schema='.',
            content_key="job_requirements", 
            text_content=False,
            metadata_func=metadata_func
        )
        return loader.load()

    loaders = [
        JSONLoader(
            file,
            json_lines=True,
            jq_schema='.messages[]',
            content_key="content",
            metadata_func=metadata_func
        )
        for file in files
    ]
    docs = []
    for loader in loaders:
        docs.extend(
            loader.load(),
        )
    return docs

def create_parent_retriever(
    docs: List[Document], embeddings_model: HuggingFaceBgeEmbeddings()
):
    parent_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n\n", "\n\n"],
        chunk_size=2000,
        length_function=len,
        is_separator_regex=False,
    )

    # This text splitter is used to create the child documents
    child_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n\n", "\n\n"],
        chunk_size=1000,
        chunk_overlap=300,
        length_function=len,
        is_separator_regex=False,
    )
    # The vectorstore to use to index the child chunks
    vectorstore = Chroma(
        collection_name="split_documents",
        embedding_function=embeddings_model,
        persist_directory=persist_directory,
    )#.as_retriever()
    
    # vectorstore = Chroma.from_documents(docs, embeddings_model)#.as_retriever()
    print("vectorstore: ", vectorstore)
    # The storage layer for the parent documents
    store = InMemoryStore()
    retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
        k=10,
    )
    retriever.add_documents(docs)
    return retriever


def retrieve_context(query, retriever, reranker_model):
    retrieved_docs = retriever.get_relevant_documents(query)

    if len(retrieved_docs) == 0:
        raise RAGException(
            f"Couldn't retrieve any relevant document with the query `{query}`. Try modifying your question!"
        )
    reranked_docs = rerank_docs(
        query=query, retrieved_docs=retrieved_docs, reranker_model=reranker_model
    )
    return reranked_docs

# https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/BGE_M3
def load_embedding_model(
    model_name: str = "BAAI/bge-large-en-v1.5", device: str = "cuda"
) -> HuggingFaceBgeEmbeddings:
    model_kwargs = {"device": device}
    encode_kwargs = {
        "normalize_embeddings": True
    }  # set True to compute cosine similarity
    embedding_model = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
    return embedding_model

# https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/llm_reranker
def load_reranker_model(
    reranker_model_name: str = "BAAI/bge-reranker-large", device: str = "cuda"
) -> CrossEncoder:
    reranker_model = CrossEncoder(
        model_name=reranker_model_name, max_length=512, device=device
    )
    return reranker_model


def main(
    file: str = "../data/crawl/train_test.jsonl",
    query: Optional[str] = None,
    llm_name="mistral",
):
    # docs = load_pdf(files=file)
    docs = load_jsonl("../data/crawl/vnw.jsonl")
    # print(docs)

    embedding_model = load_embedding_model()
    retriever = create_parent_retriever(docs, embedding_model)
    reranker_model = load_reranker_model()

    context = retrieve_context(
        query, retriever=retriever, reranker_model=reranker_model
    )[0]
    print("context:\n", context, "\n", "=" * 50, "\n")


# if __name__ == "__main__":
#     # from jsonargparse import CLI

#     # CLI(main)
#     # main(query="What is the job description for Network Engineer?")
#     main(query="What is job description in Da Nang ?")
    

In [3]:
docs = load_jsonl('../data/preprocessed/preprocessed_vnw.jsonl')
# print(docs)

embedding_model = load_embedding_model(model_name="BAAI/bge-m3")
retriever = create_parent_retriever(docs, embedding_model)
reranker_model = load_reranker_model(reranker_model_name="BAAI/bge-reranker-v2-m3")

In [4]:
import os
files = ["../data/cv/" + file for file in os.listdir("../data/cv")]
files

['../data/cv/CV Cap Tan Dat - Intern Developer.pdf',
 '../data/cv/CV_SonBao_DS.pdf',
 '../data/cv/Bui Tien Phat resume (1).pdf',
 '../data/cv/CV Nguyễn việt hoàng - CV lập trình-TopCV.vn.pdf',
 '../data/cv/CV _ Đinh Tuấn Nam_BE Developer(.NET)_HCM.pdf',
 '../data/cv/CV_MaiHuyKhang_DA.pdf',
 '../data/cv/CV_NGUYEN_MINH_SON.pdf']

In [5]:
cv = load_pdf(files)

This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSe

In [10]:
print(cv[3].page_content)

In [11]:
# query = "What are some job descriptions for Data?"
query = cv[3].page_content
context = retrieve_context(
    query, retriever=retriever, reranker_model=reranker_model
)[:]
print("context:\n", context, "\n", "=" * 50, "\n")