### Setup

In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
import os
import pinecone
from dotenv import load_dotenv
from uuid import uuid4
from langchain.vectorstores import Pinecone

# Load environment variables (python-dotenv reads them from a .env file);
# later cells read OPENAI_API_KEY, PINECONE_API_KEY and PINECONE_ENVIRONMENT
# through os.getenv.
load_dotenv()

  from tqdm.autonotebook import tqdm


True

### Build vectorstore

In [3]:
# Local directory that holds the workout PDFs. Built with os.path.join so the
# notebook runs on any OS; on Windows this is still the "..\workout_data" path.
workout_data_path = os.path.join(os.pardir, "workout_data")

# Keep only PDF files: every entry is later handed to PyPDFLoader, so a stray
# non-PDF file in the folder would break the loading cell. Sorting makes the
# document order reproducible, because os.listdir returns entries in arbitrary
# order.
workout_data_files = sorted(
    os.path.join(workout_data_path, file_name)
    for file_name in os.listdir(workout_data_path)
    if file_name.lower().endswith(".pdf")
)

# print(len(workout_data_files))
# print(workout_data_files)

In [12]:
docs = []
base_url = "https://www.muscleandstrength.com/sites/default/files/workouts/"

# Read every PDF and collect the documents returned by the loader
# (one per page).
for pdf_path in workout_data_files:
    docs.extend(PyPDFLoader(pdf_path).load())

# Replace the local path stored in "source" with the public URL of the file and
# add a "title" (the file name without its extension) to each document.
for doc in docs:
    # os.path.basename understands the path separator of the current OS; the
    # previous split("\\") only extracted the file name for Windows-style paths.
    file_name = os.path.basename(doc.metadata["source"])
    doc.metadata["source"] = base_url + file_name
    # splitext removes whatever extension the file has instead of assuming it
    # is exactly four characters long (".pdf").
    doc.metadata["title"] = os.path.splitext(file_name)[0]

In [15]:
# for d in docs:
#     print(d)

In [17]:
# Initialize OpenAI's embedding model; the API key is read from the
# environment (None when the variable is not set).
embedding_model_name = "text-embedding-ada-002"

embedding_model = OpenAIEmbeddings(
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    model=embedding_model_name,
)

In [50]:
# Embed the first document as a smoke test and show (number of vectors,
# vector length); the vector length is reused below as the index dimension.
sample_text = docs[0].page_content
embedding = embedding_model.embed_documents([sample_text])
len(embedding), len(embedding[0])

(1, 1536)

In [36]:
# Initialize the Pinecone client with the credentials found in the environment.
pinecone.init(
    environment=os.environ.get("PINECONE_ENVIRONMENT"),
    api_key=os.environ.get("PINECONE_API_KEY"),
)

# List the indexes that are currently active.
active_indexes = pinecone.list_indexes()
print(active_indexes)

[]


In [37]:
# Name of the Pinecone index used by this notebook.
index_name = "workouts"

# Create the index only when no index with that name exists yet.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    embedding_dimension = len(embedding[0])  # 1536 dim of text-embedding-ada-002
    pinecone.create_index(name=index_name, dimension=embedding_dimension, metric="cosine")

In [40]:
# Load the newly created index through the gRPC client and display its stats.
# NOTE: this may raise an error signaling that the index is not ready yet.
index = pinecone.GRPCIndex(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [48]:
# Prepare the texts to embed and the metadata stored next to each vector.
# Plain comprehensions replace the former tqdm-wrapped loop: tqdm is never
# imported in this notebook, so that loop raised NameError on a fresh kernel,
# and building these lists needs no progress bar.
texts = [doc.page_content for doc in docs]
metadatas = [
    {
        "text": doc.page_content,  # raw text, later returned as page_content
        "title": doc.metadata["title"],
        "source": doc.metadata["source"],
    }
    for doc in docs
]

print(len(metadatas), len(texts))

# Generate an ID for each document. The IDs are random, so running this cell
# again produces a different set of IDs for the same documents.
ids = [str(uuid4()) for _ in texts]

# Embed all documents.
text_embeddings = embedding_model.embed_documents(texts)
print(len(text_embeddings), len(text_embeddings[0]))

  0%|          | 0/39 [00:00<?, ?it/s]

39 39
39 1536


In [51]:
# Upload every (id, embedding, metadata) triple to the vectorstore.
vector_records = list(zip(ids, text_embeddings, metadatas))
index.upsert(vectors=vector_records)

upserted_count: 39

In [52]:
# Display the index stats again to confirm that the vectors were stored.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 39}},
 'total_vector_count': 39}

### Query vectorstore

In [62]:
# Metadata key whose value the vectorstore returns as "page_content".
text_field = "text"

# Load the vectorstore: re-open the index with pinecone.Index (this rebinds
# `index`, which was a GRPCIndex in the cells above) and wrap it together with
# the embedding model in a LangChain Pinecone vectorstore.
index = pinecone.Index(index_name)
vectorstore = Pinecone(index, embedding_model, text_field)

In [63]:
# Retrieve the three stored documents most similar to the query.
query = "legs workout"

vectorstore.similarity_search(query=query, k=3)

[Document(page_content='Exercise Sets Reps\nAb Crunch 3 15\nLying Leg Raise 3 15\nSide Oblique Crunch (Each Side) 3 15 Each\nGlute Kick Back 3 15\nDumbbell Romanian Deadlift 3 15\nReverse Lunge 3 15\nExercise Sets Reps\nDumbbell Squat 3 15\nDumbbell Lunge (Each Side) 3 15 Each\nDumbbell Lying Leg Curl (On The Floor) 3 15\nBodyweight Single Leg Deadlift 3 15\nSeated Calf Raise 3 15\nStanding Calf Raise 3 15\nExercise Sets Reps\nDumbbell Bench Press (On The Floor) 3 15\nBent-Over Dumbbell Row 3 15\nDumbbell Pullover 3 15\nLateral Raise 3 15\nLying Dumbbell Extension 3 15\nHammer Dumbbell Curl 3 15\nMUSCLEANDSTRENGTH.COM\n3 DAY FULL BODY WOMEN’S\nDUMBBELL ONLY WORKOUT\nMain Goal:  Build Muscle\nTraining Level:  Beginner  \nDays Per Week: 3 Days\nProgram Duration:  8 Weeks\nClick here for the full workout!  Equipment:  Bodyweight, Dumbbells\nTarget Gender:  Female\nAuthor:  Roger “Rock” Lockridge\nDay 1 - Abs/Glutes\nWorkout\nSummary\nDay 2 - Lower Body\nDay 3 - Upper Body', metadata={'sou