### 1. Import import libs, set api keys

In [12]:
import openai
import os
openai.api_key = os.environ["OPENAI_API_KEY"]

In [4]:
embed_model = "text-embedding-ada-002"

res = openai.Embedding.create(
    input=[
        "It is a priviliage to be study in Peking University",
        "Math is a very hard subject"
    ], engine=embed_model
)

In [5]:
len(res['data'][0]['embedding'])

1536

In [6]:
res.keys()
type(res['data'][0]['embedding'])

list

### Now we iniate the vector database via Pinecone API

In [7]:
import pinecone

index_name = 'notion-database'

# initialize connection to pinecone (get API key at app.pinecone.io)
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment="us-east1-gcp"
)

# check if index already exists (it shouldn't if this is first time)
if index_name not in pinecone.list_indexes():
    # if does not exist, create index
    pinecone.create_index(
        index_name,
        dimension=len(res['data'][0]['embedding']),
        # creates an idex of dimension 1536
        metric='cosine',
        # metric='euclidean',
        metadata_config={'indexed': ['channel_id', 'published']}
    )
# connect to index
index = pinecone.Index(index_name)
# view index stats
index.describe_index_stats()

# an error was met and solved upon retring && upgrading jupyter notebook with `pip install notebook --upgrade`

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Preparing the Data/Vectors

In [8]:
"""This is the logic for ingesting Notion data into LangChain."""
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter
# import faiss
# from langchain.vectorstores import FAISS
# from langchain.embeddings import OpenAIEmbeddings
# import pickle

from tqdm.auto import tqdm

# Here we load in the data in the format that Notion exports it in.
ps = list(Path("Notion_DB/").glob("**/*.md"))

data = []
sources = []
for p in ps:
    with open(p) as f:
        data.append(f.read())
    sources.append(p)


# here the content of each .md file is stored as an element in the list `data`, which is very huge
# and the path of each .md file is stored as an element in the list `sources`

## Next we split the documents, as needed, into smaller chunks.

In [9]:
# We do this due to the context limits of the LLMs.

# chunk size is 1000, which means each chunk of text will be 1000 characters long, and that the separator is a new line
text_splitter = CharacterTextSplitter(chunk_size=1000, separator="\n")
docs = []
metadatas = []
for i, d in enumerate(data):
    # where i, d is the index and content of each .md file respectively
    splits = text_splitter.split_text(d)
    docs.extend(splits)
    metadatas.extend([{"source": sources[i]}] * len(splits))
print(len(docs))

# question, will the data be too big/unspecific for each chunk?
# now len(docs) should be the number of vectors this is going to create

214


## Next, we vectorize the docs using the embedding model and then upload them into Pinecone

it seems to me that we don't really need such a vector database so powerful as Pinecone, any vector database should suffice.

In [15]:
from tqdm.auto import tqdm
import time

# batch_size = 100  # how many embeddings we create and insert at once
vector_lists = []
id_batch = [x for x in range(0, len(docs))]
coord_list = []

for i in tqdm(range(0, len(docs))):
    if i!=0 and i%60==0:
        time.sleep(60)
        
    # get texts to encode
    texts = docs[i]
    # create embeddings (try-except added to avoid RateLimitError)
    res = openai.Embedding.create(input=texts, engine=embed_model)
    
    coord = res['data'][0]['embedding'] # the coordinates of the vector, in semantic or embedding space.
    coord_list.append(coord)
    # no metadata is added in this version

    
# print(vector_lists) # use this line with caution, its VERY long (dim * number of vector, a list of 153,600 elements)

  0%|          | 0/214 [00:00<?, ?it/s]

In [17]:
# to fix a bug in the above cell
id_batch = [str(x) for x in range(0, len(docs))]
vectors = list(zip(id_batch, coord_list))
index.upsert(vectors)

{'upserted_count': 214}

okay we forgot to upload the vectors to Pinecone lol

## And Now, We CAN FINALLY QUERY THE DATABASE!!

In [18]:

index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 214}},
 'total_vector_count': 214}

In [52]:
query = "What is attachment style?"

query_emb = openai.Embedding.create(
    input=[query],
    engine=embed_model
)

# retrieve from Pinecone
query_coord = query_emb['data'][0]['embedding']

# get relevant contexts (including the questions)
query_res = index.query(query_coord, top_k=3, include_metadata=True)

In [53]:
print(query_res)

{'matches': [{'id': '2', 'score': 0.853277624, 'values': []},
             {'id': '1', 'score': 0.851444602, 'values': []},
             {'id': '13', 'score': 0.849302828, 'values': []}],
 'namespace': ''}


In [54]:
content_ids = [
        int(x['id']) for x in query_res['matches']
    ]
print(content_ids)

[2, 1, 13]


In [55]:
contents = [docs[i] for i in content_ids]
contents_str = "\n\n".join(contents)

In [56]:
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain

prompt = PromptTemplate(
    input_variables=["question","contents"],
    template=''' Answer this question: "{question}" using the contents below
    Contents:
    {contents}
    Answer:
    ''',
)

llm=OpenAI(temperature=0.9)

chain = LLMChain(llm=llm, prompt=prompt)

answer = chain.run(question=query,contents=contents_str)

with open ("answer.txt", "w") as f:
    f.write(answer)



InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 4946 tokens (4690 in your prompt; 256 for the completion). Please reduce your prompt; or completion length.

In [57]:
with open ("contents.txt", "w") as h:
    h.write(contents_str)