In [15]:

from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
import json
import os

# Read every Markdown file in the source directory and split it into chunks.
# NOTE(review): machine-specific absolute path; this cell only runs where it exists.
directory_path = '/Users/feng/Documents/Lab/Optimzer/OPT-imize_v2/info_sourse'
chunk_docs = []  # one list of chunk Documents per Markdown file

# The splitter does not depend on the file being processed, so build it once
# instead of creating a new SemanticChunker/OpenAIEmbeddings pair per file.
text_splitter = SemanticChunker(OpenAIEmbeddings())

# sorted() makes the processing order deterministic; os.listdir() returns
# directory entries in arbitrary order.
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith('.md'):
        continue
    file_path = os.path.join(directory_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        markdown_content = file.read()

    # print(markdown_content)
    chunk_docs.append(text_splitter.create_documents([markdown_content]))

# Flatten to one to_json() entry per chunk, across all files.
docs_json = [doc.to_json() for file_chunks in chunk_docs for doc in file_chunks]

# Output the documents to a file
output_file_path = "./rag_doc.json"
with open(output_file_path, "w", encoding="utf-8") as file:
    json.dump(docs_json, file, indent=4)

print(chunk_docs)
# Show the chunks of the last processed file. Fall back to an empty list so this
# line cannot raise NameError when the directory contains no .md files.
chunks = chunk_docs[-1] if chunk_docs else []
print(chunks)
print("\n")




[[Document(metadata={}, page_content='# Navigating the Three-Part Interview Process\n\nThe interview process has remained relatively unchanged for generations, and while it continues to be a crucial part of hiring, understanding the structure can significantly help job seekers know what to expect. More importantly, it helps you know when to emphasize specific aspects of your background. There’s often a misconception that if you impress the HR team, you’re well on your way to landing the job. But in reality, that’s only one part of the process. Let’s break the interview process down into three major components: **HR, the Hiring Team, and Senior Leadership**. ### **Part 1: The HR Interview**\n\nOnce your resume passes the [**Applicant Tracking System**](https://opt-imize.com/blog/creating-an-ats-friendly-resume/) **(ATS)**, the first person to contact you is often someone from the HR team—typically a recruiter or HR coordinator. At this stage, HR’s role is less about evaluating your tech

In [None]:
# NOTE: everything in this cell is commented out and never executed. It is a
# disabled alternative that would split ./rag_doc.md on "##" headers with
# MarkdownHeaderTextSplitter.
# from langchain_text_splitters import MarkdownHeaderTextSplitter

# # Chunk the document based on h2 headers.
# markdown_file_path = "./rag_doc.md"
# with open(markdown_file_path, "r") as file:
#     markdown_document = file.read()
    
# headers_to_split_on = [
#     ("##", "Header 2")
# ]

# markdown_splitter = MarkdownHeaderTextSplitter(
#     headers_to_split_on=headers_to_split_on, strip_headers=False
# )
# md_header_splits = markdown_splitter.split_text(markdown_document)

# print(md_header_splits)
# print("\n")

In [16]:
from langchain_pinecone import PineconeEmbeddings
import os

# Name of the embedding model handed to PineconeEmbeddings.
model_name = 'multilingual-e5-large'

# The API key is taken from the environment (None when the variable is unset).
embeddings = PineconeEmbeddings(model=model_name, pinecone_api_key=os.getenv('PINECONE_API_KEY'))


In [17]:
from pinecone import Pinecone, ServerlessSpec
import time

# Pinecone client; the API key is read from the environment.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# Serverless deployment target, overridable through environment variables.
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)

index_name = "rag-opt-sementic"

# Upper bound, in seconds, on how long to wait for a newly created index.
index_ready_timeout = 300

# Create the index only when it does not exist yet; an existing index is reused.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=embeddings.dimension,
        metric="cosine",
        spec=spec
    )
    # Wait for the index to be ready, but give up after the timeout instead of
    # polling forever if it never reports ready.
    deadline = time.monotonic() + index_ready_timeout
    while not pc.describe_index(index_name).status['ready']:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Pinecone index {index_name!r} was not ready after "
                f"{index_ready_timeout} seconds"
            )
        time.sleep(1)

# Show the current contents. The index is only guaranteed to be empty when it
# was created just above; a pre-existing index keeps its earlier vectors.
print("Index before upsert:")
print(pc.Index(index_name).describe_index_stats())
print("\n")



Index before upsert:
{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'opt': {'vector_count': 5}},
 'total_vector_count': 5}




In [18]:
from langchain_pinecone import PineconeVectorStore

namespace = "opt"

# Open the vector store once, up front. Binding `docsearch` outside the loop
# keeps it defined even when `chunk_docs` is empty, and avoids building a new
# store object for every file.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
    namespace=namespace,
)

# Upsert the chunks of each file into the namespace configured on the store.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# adds duplicate vectors rather than overwriting earlier ones - confirm.
for chunks in chunk_docs:
    if not chunks:
        # Nothing to embed for this file.
        continue
    docsearch.add_documents(chunks)

# Pause before reading the stats (presumably so the upserts are reflected - confirm).
time.sleep(5)

# See how many vectors have been upserted
print("Index after upsert:")
print(pc.Index(index_name).describe_index_stats())
print("\n")
time.sleep(2)


Index after upsert:
{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'opt': {'vector_count': 41}},
 'total_vector_count': 41}




In [22]:
index = pc.Index(index_name)
namespace = "opt"

# Position, within each listed page of ids, of the vector to inspect.
sample_position = 20

# index.list() yields the namespace's ids one page (a list) at a time; query
# one stored vector per page and print the match.
for ids in index.list(namespace=namespace):
    if not ids:
        continue
    # Clamp the position: a page with fewer than sample_position + 1 ids would
    # otherwise raise IndexError.
    sample_id = ids[min(sample_position, len(ids) - 1)]
    query = index.query(
        id=sample_id,
        namespace=namespace,
        top_k=1,
        include_values=True,
        include_metadata=True
    )
    print(query)
    print("\n")


{'matches': [{'id': 'a1781e0d-cfd7-4451-8936-5621b2aaf04f',
              'metadata': {'text': '# Navigating the Three-Part Interview '
                                   'Process\n'
                                   '\n'
                                   'The interview process has remained '
                                   'relatively unchanged for generations, and '
                                   'while it continues to be a crucial part of '
                                   'hiring, understanding the structure can '
                                   'significantly help job seekers know what '
                                   'to expect. More importantly, it helps you '
                                   'know when to emphasize specific aspects of '
                                   'your background. There’s often a '
                                   'misconception that if you impress the HR '
                                   'team, you’re well on your way to landin

In [28]:
from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain import hub

# Prompt pulled from the LangChain hub, and a retriever over the vector store
# that an earlier cell bound to `docsearch`.
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
retriever = docsearch.as_retriever()

# Chat model; the API key is read from the environment.
llm = ChatOpenAI(openai_api_key=os.getenv('OPENAI_API_KEY'), model_name='gpt-3.5-turbo', temperature=0.5)

# The documents chain combines the model with the prompt; the retrieval chain
# puts the retriever in front of it.
combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

# The interactive question is prefixed with a fixed instruction before use.
user_question = input("Ask a question: ")
query1 = "Do not say you do not know, " + user_question

answer1_with_knowledge = retrieval_chain.invoke({"input": query1})

# Print the generated answer and then the context returned with it.
print("Answer with knowledge:\n\n", answer1_with_knowledge['answer'])
print("\nContext used:\n\n", answer1_with_knowledge['context'])
print("\n")

# markdown_content = f"""
# # Query: {answer1_with_knowledge['input']}

# ## Answer with Knowledge:
# {answer1_with_knowledge['answer']}

# ## Context:
# """
# markdown_content += answer1_with_knowledge['context']
# with open("answer1_with_knowledge.md", "w") as file:
#     file.write(markdown_content)

# time.sleep(2)

# Baseline: send the same prompt straight to the chat model, bypassing the
# retrieval chain, so the two answers can be compared.
answer1_without_knowledge = llm.invoke(query1)
baseline_answer = answer1_without_knowledge.content

print("Query 1:", query1)
print("\nAnswer without knowledge:\n\n", baseline_answer)
print("\n")
time.sleep(2)









Answer with knowledge:

 Based on the information provided, it's highlighted that a survey conducted by Clarify Capital in October 2023 found that 50% of hiring managers admitted to creating job postings not to fill immediate vacancies, but rather to build a “warm talent pool.” This suggests that a significant portion of job postings on platforms like LinkedIn may not be for immediate or real job openings, but rather placeholders for future needs.

Context used:

 [Document(id='0f9a3bc9-2042-48b6-84da-a0d9a42b04c0', metadata={}, page_content='# LinkedIn Shows Plenty of Job Openings, But Are They Truly Available? Working in HR for the past six years, I’ve seen firsthand how LinkedIn has become a go-to platform for job seekers and employers alike. It’s a powerful tool for networking, sharing, and discovering job opportunities—arguably one of the top places to find jobs today. However, while LinkedIn is great for visibility and initial contact, it’s not always the best platform for comple