In [None]:
# Make the project's `src` directory importable before the project imports below run.
# NOTE(review): this is a machine-specific absolute path; the cell only works on a
# checkout located at exactly this location.
import sys
if '/Users/nehiljain/code/find-your-mate-ai/src' not in sys.path:
    sys.path.append('/Users/nehiljain/code/find-your-mate-ai/src')


# nest_asyncio patches asyncio so an event loop can be re-entered; presumably needed
# because the notebook kernel already runs its own loop — confirm.
import nest_asyncio
nest_asyncio.apply()
from find_your_mate_ai.config import settings
# NOTE(review): `openai`, `logging` and `load_nodes_from_mongodb` are not imported
# explicitly anywhere in this cell; presumably the star import below brings them into
# scope — confirm, and prefer explicit imports.
from find_your_mate_ai.data_ingestion import *
import pandas as pd
# Hand the OpenAI key from the project settings to the `openai` module.
openai.api_key = settings.OPENAI_API_KEY
logging.info("OpenAI API key configured")


# Load the nodes from the MongoDB deployment at `settings.MONGO_URI`; later cells build
# their indexes from `nodes`.
nodes = load_nodes_from_mongodb(settings.MONGO_URI)


In [None]:
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from textwrap import dedent
# Build a vector index over the nodes loaded in the setup cell.
index = VectorStoreIndex(nodes)

# `dedent` only removes indentation shared by *every* non-blank line. Starting the
# literal with a line continuation and indenting all lines equally lets it actually
# strip the margin; with text on the opening-quote line nothing was removed and the
# prompt carried 11 stray leading spaces on its second line.
profile_query = dedent("""\
    Create a markdown style list of all the profiles that match the following query best:
    Find me all the founders in SF Bay Area location
    """)

# Ask the index's default query engine and print its answer.
query_engine = index.as_query_engine()
response = query_engine.query(profile_query)
print(response)



In [None]:
from llama_index.core.response.notebook_utils import display_source_node
# For every source node behind `response`: print the file it came from, render the
# node with `display_source_node`, then print a double separator line.
separator = "-" * 100
for source_node in response.source_nodes:
    file_name = source_node.metadata['file_name']
    print(file_name)
    display_source_node(source_node, source_length=1000)
    print(separator)
    print(separator)


In [None]:

from llama_index.core.retrievers import KeywordTableGPTRetriever

from textwrap import dedent
# build index
index = VectorStoreIndex(nodes)

# `dedent` only removes indentation shared by *every* non-blank line, so the literal
# starts with a line continuation and all lines are indented equally; otherwise the
# zero-indent first line makes `dedent` a no-op.
profile_query = dedent("""\
    Create a markdown style list of all the profiles that match the following query best:
    Find me all the founders in SF Bay Area location
    """)

# `as_retriever()` returns a retriever, not a query engine: it exposes `retrieve()`
# and has no `query()` method, so the previous `.query(...)` call raised
# AttributeError. `retrieve()` returns the matching nodes with their scores.
retriever = index.as_retriever()
response = retriever.retrieve(profile_query)

# Print one retrieved node per entry instead of the raw list repr.
for retrieved_node in response:
    print(retrieved_node)



In [None]:
from pymongo import MongoClient
from pymongo.server_api import ServerApi

# Open a MongoDB client for the URI in the project settings, requesting server API
# version '1', then take handles on the docstore database and its data collection.
stable_api = ServerApi('1')
client = MongoClient(
    settings.MONGO_URI,
    server_api=stable_api,
)
db = client['db_docstore']
collection = db['docstore/data']

def update_document_ids_with_hashes(metadata_df=None):
    """Re-key docstore documents so each `_id` is a stable hash of its `file_name`.

    Args:
        metadata_df: DataFrame with at least `_id` and `file_name` columns describing
            the documents to re-key. When omitted (None) the frame is fetched with
            `fetch_all_documents_from_mongodb` from "db_docstore" / "docstore/data".

    Returns:
        None. The outcome of every document is reported through `logging`.

    Notes:
        * MongoDB treats `_id` as immutable, so a `$set` on `_id` is rejected by the
          server. Each document is therefore copied under its new `_id` and the
          original is deleted afterwards.
        * The built-in `hash()` of a `str` is salted per interpreter process, so it
          yields different IDs on every run. A SHA-256 digest truncated to a signed
          64-bit integer is used instead: deterministic, and still an integer.
        * A document whose target `_id` already exists is skipped (and logged) rather
          than overwritten, so two documents sharing a `file_name` cannot silently
          clobber each other.
    """
    import hashlib  # local import: keeps this cell runnable without touching the import cell

    def _stable_id(file_name):
        # First 8 bytes of the digest, read as a signed 64-bit integer.
        digest = hashlib.sha256(str(file_name).encode('utf-8')).digest()
        return int.from_bytes(digest[:8], 'big', signed=True)

    # Honour the caller's frame; only hit the database when none was supplied.
    if metadata_df is None:
        metadata_df = fetch_all_documents_from_mongodb(settings.MONGO_URI, "db_docstore", "docstore/data")

    # Nothing to re-key (also avoids a KeyError on a column-less empty frame).
    if metadata_df.empty:
        return

    for original_id, file_name in zip(metadata_df['_id'], metadata_df['file_name']):
        new_id = _stable_id(file_name)

        # Already re-keyed (e.g. the cell was run before): leave the document alone.
        if original_id == new_id:
            continue

        document = collection.find_one({'_id': original_id})
        if document is None:
            logging.error(f"Failed to update document with original ID {original_id}")
            continue

        if collection.find_one({'_id': new_id}) is not None:
            logging.error(
                f"Document with ID {new_id} already exists; "
                f"document with original ID {original_id} left unchanged"
            )
            continue

        # Copy first, delete second: a failure in between leaves a duplicate, never a loss.
        document['_id'] = new_id
        collection.insert_one(document)
        delete_result = collection.delete_one({'_id': original_id})

        # Log the result of the update
        if delete_result.deleted_count == 1:
            logging.info(f"Document with original ID {original_id} updated to new ID {new_id}")
        else:
            logging.error(f"Failed to update document with original ID {original_id}")

# `metadata_df` is not defined in any earlier cell, so load it explicitly from the
# same database / collection the function operates on before passing it in.
metadata_df = fetch_all_documents_from_mongodb(settings.MONGO_URI, "db_docstore", "docstore/data")

try:
    # Call the function with the metadata DataFrame
    update_document_ids_with_hashes(metadata_df)
finally:
    # Close the connection even when the update raises.
    client.close()


In [None]:
%pip install llama-index-llms-openai
%pip install llama-index-program-openai
%pip install llama-index-readers-web


In [None]:
# load in blog

from llama_index.readers.web import SimpleWebPageReader
from llama_index.core.node_parser import SentenceSplitter

# Read the blog post through the web page reader (with `html_to_text` enabled) and
# keep the resulting documents in `docs`.
blog_urls = ["https://eugeneyan.com/writing/llm-patterns/"]
reader = SimpleWebPageReader(html_to_text=True)
docs = reader.load_data(urls=blog_urls)

In [None]:
from llama_index.core.ingestion import IngestionPipeline

# Sentence-based splitter configured with chunk_size=1024.
node_parser = SentenceSplitter(chunk_size=1024)

# NOTE(review): `program_extractor` is not defined in any cell visible in this
# notebook; unless the star import in the setup cell provides it, this line raises
# NameError on a fresh kernel — confirm where it is supposed to come from.
pipeline = IngestionPipeline(transformations=[node_parser, program_extractor])

# Run both transformations over the documents loaded into `docs`.
orig_nodes = pipeline.run(documents=docs)

In [None]:
# Bare last expression: the notebook displays the full `orig_nodes` result of the pipeline run.
orig_nodes

In [None]:
# setup Phoenix
import phoenix as px
import llama_index.core

# Start the Phoenix app, then register "arize_phoenix" as LlamaIndex's global handler.
# The handler is set after the app launch, so keep this order.
px.launch_app()
llama_index.core.set_global_handler("arize_phoenix")

In [None]:
import os
from find_your_mate_ai.config import settings
from pinecone import Pinecone
from pinecone import ServerlessSpec
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore

# Name of the Pinecone index used by this notebook.
PINECONE_INDEX_NAME = "test-index"
# Dimensions are for text-embedding-ada-002
EMBEDDING_DIMENSION = 1536

api_key = settings.PINECONE_API_KEY
pc = Pinecone(api_key=api_key)

# delete if needed
# pc.delete_index(PINECONE_INDEX_NAME)

# Create the index only when it is missing. Checking explicitly (instead of catching
# every Exception and assuming "already exists") lets real failures such as a bad API
# key, an exhausted quota, or an unsupported region surface immediately.
if PINECONE_INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        PINECONE_INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-west-2"),
    )

# Wrap the Pinecone index as a LlamaIndex vector store (namespace "test") and build
# the vector index from `nodes` on top of it.
pinecone_index = pc.Index(PINECONE_INDEX_NAME)
vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
    namespace="test",
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(nodes, storage_context=storage_context)

In [None]:
from pathlib import Path
# Source and output paths both point at the same tests/data directory.
# NOTE(review): machine-specific absolute path.
tests_data_dir = Path("/Users/nehiljain/code/find-your-mate-ai/tests/data")
nodes = ingest_profiles_data(
    source_data_path=tests_data_dir,
    output_data_path=tests_data_dir,
)



In [None]:
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.vector_stores import MetadataInfo, VectorStoreInfo
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.vector_stores import MetadataInfo, VectorStoreInfo


# NOTE(review): this schema ("famous books and movies" with director / theme / year)
# looks carried over from an unrelated example. The only metadata key read elsewhere
# in this notebook is `file_name` — confirm these fields exist on the indexed nodes.
# Each entry is (name, type, description).
metadata_fields = [
    ("director", "str", "Name of the director"),
    ("theme", "str", "Theme of the book/movie"),
    ("year", "int", "Year of the book/movie"),
]
vector_store_info = VectorStoreInfo(
    content_info="famous books and movies",
    metadata_info=[
        MetadataInfo(name=field_name, type=field_type, description=field_description)
        for field_name, field_type, field_description in metadata_fields
    ],
)

# this is a hack to allow for blank queries in pinecone: a zero vector with the same
# length as the index dimension (1536) is used when the query is empty.
zero_query_vector = [0] * 1536
retriever = VectorIndexAutoRetriever(
    index,
    vector_store_info=vector_store_info,
    empty_query_top_k=10,
    default_empty_query_vector=zero_query_vector,
    verbose=True,
)