In [None]:
# Environment bootstrap: make the project importable, allow nested event
# loops, and configure the OpenAI API key from project settings.
import logging
import sys

# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
PROJECT_SRC = '/Users/nehiljain/code/find-your-mate-ai/src'
if PROJECT_SRC not in sys.path:
    sys.path.append(PROJECT_SRC)


import nest_asyncio
nest_asyncio.apply()

# `openai` and `logging` are used below. They are imported explicitly so this
# cell does not silently depend on what the wildcard import happens to export.
# The explicit imports come BEFORE the wildcard so that, if the wildcard does
# export these names, the resulting bindings are the same as before.
import openai
from find_your_mate_ai.config import settings
# Wildcard kept on purpose: later cells use names that are not imported
# anywhere else in this notebook (e.g. `ingest_profiles_data`, `OpenAI`) and
# presumably come from here.
from find_your_mate_ai.data_ingestion import *
import pandas as pd

openai.api_key = settings.OPENAI_API_KEY
logging.info("OpenAI API key configured")


# nodes = load_nodes_from_mongodb(settings.MONGO_URI)


In [None]:
# Phoenix setup: launch the Phoenix app, then register "arize_phoenix" as
# LlamaIndex's global handler.
import llama_index.core
import phoenix as px

px.launch_app()
llama_index.core.set_global_handler("arize_phoenix")

In [None]:
# Pinecone setup: ensure the index exists, then wrap it in a LlamaIndex
# vector store / storage context for the cells below.
import os
from find_your_mate_ai.config import settings
from pinecone import Pinecone
from pinecone import ServerlessSpec
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore

INDEX_NAME = "test-index"
# Dimensions are for text-embedding-ada-002
EMBEDDING_DIMENSION = 1536

api_key = settings.PINECONE_API_KEY
pc = Pinecone(api_key=api_key)

# delete if needed
# pc.delete_index(INDEX_NAME)

# Create the index only when it is missing. Checking first (instead of
# catching every exception from `create_index`) means real failures such as
# bad credentials or an unsupported region are raised here rather than being
# printed and ignored.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-west-2"),
    )

pinecone_index = pc.Index(INDEX_NAME)
vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
    namespace="test",
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)


In [None]:
from pathlib import Path

# The same directory is used as both the source and the output location.
# `ingest_profiles_data` is not imported explicitly in this notebook;
# presumably it comes from the `find_your_mate_ai.data_ingestion` wildcard import.
tests_data_dir = Path("/Users/nehiljain/code/find-your-mate-ai/tests/data")
nodes = ingest_profiles_data(
    source_data_path=tests_data_dir,
    output_data_path=tests_data_dir,
)
# import pickle
# import openai
# openai.api_key = settings.OPENAI_API_KEY
# logging.info("OpenAI API key configured")
# nodes = pickle.load(open("/Users/nehiljain/code/find-your-mate-ai/src/experiments/test-nodes.pkl", "rb"))



In [None]:
# Sanity check: show the metadata of the first ingested node. As the last
# expression in the cell, its value is rendered as the cell output.
# Requires `nodes` from the ingestion cell above.
nodes[0].metadata

In [None]:
# Build the vector index over the ingested nodes and an auto-retriever that
# is given a description of the node metadata fields.
# Requires `nodes`, `storage_context` and `VectorStoreIndex` from earlier cells.
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.vector_stores import MetadataInfo, VectorStoreInfo

# Length of the placeholder vector sent for blank queries; 1536 is the same
# dimension the Pinecone index is created with.
EMPTY_QUERY_VECTOR_DIM = 1536

index = VectorStoreIndex(nodes, storage_context=storage_context)
vector_store_info = VectorStoreInfo(
    content_info="founders profiles from startup school YC platform",
    metadata_info=[
        MetadataInfo(
            name="name",
            type="str",
            description="Name of the founder profile",
        ),
        MetadataInfo(
            name="profile_url",
            type="str",
            description="Startup school URL to the founder's profile",
        ),
        MetadataInfo(
            name="linkedin_url",
            type="str",
            description="URL to the founder's LinkedIn profile",
        ),
        MetadataInfo(
            name="hobbies",
            type="list[str]",
            description="List of hobbies/interests of the founder. This is not directly related to startup but the individual founder personalities",
        ),
        # NOTE(review): "employement" looks misspelled, but the name presumably
        # has to match the metadata key written at ingestion time; confirm
        # against the ingestion code before renaming.
        MetadataInfo(
            name="employement_industries",
            type="list[str]",
            description="List of industries the founder has worked in."
        ),
        MetadataInfo(
            name="location",
            type="str",
            description="Location of the founder. Format is 'city, state, country'",
        ),
        MetadataInfo(
            name="age",
            type="int",
            description="Age of the founder",
        ),
    ],
)

# NOTE(review): `OpenAI` is not imported explicitly in this notebook;
# presumably it comes from the `find_your_mate_ai.data_ingestion` wildcard import.
gpt4 = OpenAI("gpt-4")
retriever = VectorIndexAutoRetriever(
    index,
    vector_store_info=vector_store_info,
    llm=gpt4,
    empty_query_top_k=10,
    top_k=5,
    # this is a hack to allow for blank queries in pinecone
    default_empty_query_vector=[0] * EMPTY_QUERY_VECTOR_DIM,
    verbose=True,
)

In [None]:

# Run an age-based query through the auto-retriever; results are kept in
# `filtered_nodes` without being displayed.
filtered_nodes = retriever.retrieve("Who are some founders that are 30 or below")

In [None]:
from llama_index.core.response.notebook_utils import display_source_node

# Retrieve by age and show each hit: file name, rendered source node,
# then a two-line divider.
query = "Who are some founders that are 30 or below"
divider = "-" * 100

filtered_nodes = retriever.retrieve(query)
for node in filtered_nodes:
    print(node.metadata['file_name'])
    display_source_node(node, source_length=1000)
    print(divider, divider, sep="\n")


In [None]:
from llama_index.core.response.notebook_utils import display_source_node

# Retrieve by location and show each hit: file name, rendered source node,
# then a two-line divider.
query = "Who are some founders that are in Seattle, WA, USA"
divider = "-" * 100

filtered_nodes = retriever.retrieve(query)
for node in filtered_nodes:
    print(node.metadata['file_name'])
    display_source_node(node, source_length=1000)
    print(divider, divider, sep="\n")


In [None]:
from llama_index.core.response.notebook_utils import display_source_node

# Retrieve by education and show each hit: file name, rendered source node,
# then a two-line divider.
query = "Who are some founders that went to IIT"
divider = "-" * 100

filtered_nodes = retriever.retrieve(query)
for node in filtered_nodes:
    print(node.metadata['file_name'])
    display_source_node(node, source_length=1000)
    print(divider, divider, sep="\n")
