### 1. Import import libs, set api keys

In [1]:
import openai
import os
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
embed_model = "text-embedding-ada-002"

res = openai.Embedding.create(
    input=[
        "It is a priviliage to be study in Peking University",
        "Math is a very hard subject"
    ], engine=embed_model
)

In [83]:
len(res['data'][0]['embedding'])

ApiAttributeError: QueryResponse has no attribute 'data' at ['['received_data']']['data']

In [82]:
res.keys()
res['data'][0]['embedding']

TypeError: 'NoneType' object is not callable

### Now we iniate the vector database via Pinecone API

In [8]:
import pinecone

index_name = 'notion-database'

# initialize connection to pinecone (get API key at app.pinecone.io)
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment="us-east1-gcp"
)

# check if index already exists (it shouldn't if this is first time)
if index_name not in pinecone.list_indexes():
    # if does not exist, create index
    pinecone.create_index(
        index_name,
        dimension=len(res['data'][0]['embedding']),
        # creates an idex of dimension 1536
        metric='cosine',
        # metric='euclidean',
        metadata_config={'indexed': ['channel_id', 'published']}
    )
# connect to index
index = pinecone.Index(index_name)
# view index stats
index.describe_index_stats()

# an error was met and solved upon retring && upgrading jupyter notebook with `pip install notebook --upgrade`

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Preparing the Data/Vectors

In [14]:
"""This is the logic for ingesting Notion data into LangChain."""
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter
# import faiss
# from langchain.vectorstores import FAISS
# from langchain.embeddings import OpenAIEmbeddings
# import pickle

from tqdm.auto import tqdm

# Here we load in the data in the format that Notion exports it in.
ps = list(Path("Notion_DB/").glob("**/*.md"))

data = []
sources = []
for p in ps:
    with open(p) as f:
        data.append(f.read())
    sources.append(p)


# here the content of each .md file is stored as an element in the list `data`, which is very huge
# and the path of each .md file is stored as an element in the list `sources`

## Next we split the documents, as needed, into smaller chunks.

In [61]:
# We do this due to the context limits of the LLMs.

# chunk size is 1000, which means each chunk of text will be 1000 characters long, and that the separator is a new line
text_splitter = CharacterTextSplitter(chunk_size=1000, separator="\n")
docs = []
metadatas = []
for i, d in enumerate(data):
    # where i, d is the index and content of each .md file respectively
    splits = text_splitter.split_text(d)
    docs.extend(splits)
    metadatas.extend([{"source": sources[i]}] * len(splits))
print(len(docs))

# question, will the data be too big/unspecific for each chunk?
# now len(docs) should be the number of vectors this is going to create

214


## Next, we vectorize the docs using the embedding model and then upload them into Pinecone

it seems to me that we don't really need such a vector database so powerful as Pinecone, any vector database should suffice.

In [84]:
from tqdm.auto import tqdm
import datetime
from time import sleep

# batch_size = 100  # how many embeddings we create and insert at once
vector_lists = []
id_batch = [x for x in range(0, len(docs))]
for i in tqdm(range(0, len(docs))):
    # get texts to encode
    texts = docs[i]
    # create embeddings (try-except added to avoid RateLimitError)
    res = openai.Embedding.create(input=texts, engine=embed_model)
    
    # try:
    #     res = openai.Embedding.create(input=texts, engine=embed_model)
    # except:
    #     done = False
    #     while not done:
    #         sleep(5)
    #         try:
    #             res = openai.Embedding.create(input=texts, engine=embed_model)
    #             done = True
    #         except:
    #             pass
    embed = [res['data'][0]['embedding']] # the coordinates of the vector, in semantic or embedding space.
    
    vector = list(zip(id_batch, embed))
    # no metadata is added in this version
    vector_lists.append(vector)
# print(vector_lists) # use this line with caution, its VERY long (dim * number of vector, a list of 153,600 elements)

100%|██████████| 214/214 [08:25<00:00,  2.36s/it]  


In [None]:
# index.upsert([
#     ("A", [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]),
#     ("B", [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]),
#     ("C", [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]),
#     ("D", [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]),
#     ("E", [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5])
# ])

okay we forgot to upload the vectors to Pinecone lol

In [121]:
# len(vector_lists[200][0][1])
# vector_lists[200][0][1]
# vector_lists[200][0][1]

# index.upsert(vector_lists)
for i in range(len(vector_lists)):
    index.upsert((i, vector_lists[i][0][1]))

ValueError: Invalid vector value passed: cannot interpret type <class 'int'>

## And Now, We CAN FINALLY QUERY THE DATABASE!!

In [107]:

index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [67]:
query = "What is attachment style?"

query_emb = openai.Embedding.create(
    input=[query],
    engine=embed_model
)

# retrieve from Pinecone
query_coord = query_emb['data'][0]['embedding']

# get relevant contexts (including the questions)
query_res = index.query(query_coord, top_k=10, include_metadata=True)

In [68]:
print(query_res)

{'matches': [], 'namespace': ''}
