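"""CreateVectorDB.py

Build a Pinecone serverless index and populate it with OpenAI
text-embedding-ada-002 embeddings of the video transcripts stored in
`dataframe.pickle`.
"""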
import os
import time

import pandas as pd
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm

# read the OpenAI API key from the environment (keys are created in the
# OpenAI platform dashboard)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# load the scraped posts; each row is expected to have creatorid, post_id,
# thumbnail_url, date_utc, caption_hashtags, and transcript columns
data = pd.read_pickle('dataframe.pickle')

model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY,
)
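# optional sanity check (a minimal sketch; assumes a valid key and network
# access): ada-002 embeddings are 1536-dimensional, which must match the
# index dimension configured below
# assert len(embed.embed_query("hello world")) == 1536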
# initialize the connection to Pinecone (get an API key at app.pinecone.io)
api_key = os.getenv("PINECONE_API_KEY")

# configure the client
pc = Pinecone(api_key=api_key)
spec = ServerlessSpec(cloud="aws", region="us-west-2")
## INITIALIZE AN EMBEDDING INDEX IN PINECONE ##
index_name = "shopassist-ada-002-v2"  # previous version: "shopassist-ada-002-v1"

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# check whether the index already exists (it shouldn't on a first run)
if index_name not in existing_indexes:
    # if it does not exist, create it
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of text-embedding-ada-002
        metric='cosine',
        spec=spec,
    )
    # wait for the index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to the index
index = pc.Index(index_name)
time.sleep(1)

# view index stats
index.describe_index_stats()
## END INITIALIZE AN EMBEDDING INDEX IN PINECONE ##
## LOAD DATA INTO THE PINECONE INDEX ##
data.drop_duplicates(subset=['post_id'], inplace=True)
data = data[data["transcript"].notna()]

batch_size = 100

for i in tqdm(range(0, len(data), batch_size)):
    # get the end of this batch
    i_end = min(len(data), i + batch_size)
    batch = data.iloc[i:i_end]
    # first build the metadata fields for each record
    metadatas = [{
        'creatorid': record['creatorid'],
        'post_id': record['post_id'],
        'thumbnail_url': record['thumbnail_url'],
        # Pinecone metadata values must be strings, numbers, booleans, or
        # lists of strings, so serialize the date as an ISO string
        'date_utc': record['date_utc'].date().isoformat(),
        'caption_hashtags': record['caption_hashtags'],
        'text': record['transcript'],
    } for _, record in batch.iterrows()]
    # get the list of contexts / documents
    documents = batch['transcript'].tolist()
    # create document embeddings
    embeds = embed.embed_documents(documents)
    # vector IDs in Pinecone must be strings
    ids = batch['post_id'].astype(str).tolist()
    # add everything to Pinecone
    index.upsert(vectors=list(zip(ids, embeds, metadatas)))

index.describe_index_stats()
## END LOAD DATA INTO THE PINECONE INDEX ##
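
# quick retrieval check (a minimal sketch; the query string and k are
# illustrative, not part of the original pipeline): PineconeVectorStore
# wraps the populated index for similarity search, reading the "text"
# metadata field stored above
vectorstore = PineconeVectorStore(index=index, embedding=embed, text_key="text")
results = vectorstore.similarity_search("example shopping question", k=3)
for doc in results:
    print(doc.metadata['post_id'], doc.page_content[:80])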