In [17]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import openai

In [9]:
# Connect to an Index 
import time
from dotenv import load_dotenv, find_dotenv
import os
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings


_ = load_dotenv(find_dotenv()) # read local .env file

pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
spec=ServerlessSpec(cloud='aws', region='us-east-1')

# Initialize the embedding model
embeddings=OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])


INDEX_NAME = "opt-imize"

if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        metric="cosine",
        dimension=1536,
        spec=spec,
    )
    # Wait for index to be ready
    while not pc.describe_index(INDEX_NAME).status['ready']:
        time.sleep(1)

    

index = pc.Index(INDEX_NAME)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

In [10]:

# Define the directory containing your Markdown files
directory_path = '/Users/feng/Documents/Lab/Optimzer/OPT-imize_v2/info_sourse'

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)

# Iterate over each Markdown file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith('.md'):
        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()


        

        # Split content into chunks
        chunks = text_splitter.split_text(content)

        # Generate embeddings for each chunk
        vectors = []
        for i, chunk in enumerate(chunks):
            vector = embeddings.embed_documents([chunk])[0]
            metadata = {'filename': filename, 'chunk': i}
            vectors.append((f'{filename}-{i}', vector, metadata))

        # Upsert vectors into Pinecone
        index.upsert(vectors)


In [11]:
print(index.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 102}},
 'total_vector_count': 102}


In [19]:
# Define your query
query = "how many fake jobs on LinkedIn."

# Convert the query into a numerical vector that Pinecone can search with
# query_embedding = pc.openai.ebemd(
#     model="multilingual-e5-large",
#     inputs=[query],
#     parameters={
#         "input_type": "query"
#     }
# )

def embed(docs: list[str]) -> list[list[float]]:
    res = openai.embeddings.create(
        input=docs,
        model="text-embedding-3-small"
    )
    doc_embeds = [r.embedding for r in res.data] 
    return doc_embeds 

query_embedding = embed([query])

# Search the index for the three most similar vectors
results = index.query(    
    vector=query_embedding[0],
    top_k=3,
    include_values=False,
    include_metadata=True
)


print(results)


{'matches': [{'id': 'Creating an ATS-Friendly Resume.md-1',
              'metadata': {'chunk': 1.0,
                           'filename': 'Creating an ATS-Friendly Resume.md'},
              'score': 0.043776445,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': 'Creating an ATS-Friendly Resume.md-7',
              'metadata': {'chunk': 7.0,
                           'filename': 'Creating an ATS-Friendly Resume.md'},
              'score': 0.04137156,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': 'Creating an ATS-Friendly Resume.md-10',
              'metadata': {'chunk': 10.0,
                           'filename': 'Creating an ATS-Friendly Resume.md'},
              'score': 0.0411885,
              'sparse_values': {'indices': [], 'values': []},
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}
