In [1]:
# Load environment variables

from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv("/Users/apple/Documents/LLM/.env"))

True

In [2]:
from langchain import OpenAI
from langchain import PromptTemplate
import os

# Loaders
from langchain.schema import Document

# Splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Model
from langchain.chat_models import ChatOpenAI

# Embedding Support
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

# Summarizer we'll use for Map Reduce
from langchain.chains.summarize import load_summarize_chain

# Data Science
import numpy as np
from sklearn.cluster import KMeans

In [17]:
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", "\t", " ", ""], chunk_size=2000, chunk_overlap=2000)
book_text = open("/Users/apple/Documents/LLM/book.txt", "r").read()
docs = text_splitter.create_documents([book_text])

In [18]:
num_documents = len(docs)

print (f"Now our book is split up into {num_documents} documents")

Now our book is split up into 514 documents


In [19]:
map_prompt = """
You will be given a single passage of a book on physiology. This section will be enclosed in triple backticks (```).
You are a physiology student learning about physiology and your goal is to distill the knowledge in the passage into its key point(s) to be used later.
You are specifically interested in numbers, equations, and metrics to be used as guides.
Output these points as a bullet point list.

```{text}```
FULL SUMMARY:
"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

In [20]:
llm3 = ChatOpenAI(temperature=0,
                 max_tokens=400,
                 model='gpt-3.5-turbo'
                )

In [21]:
map_chain = load_summarize_chain(llm=llm3,
                             chain_type="stuff",
                             prompt=map_prompt_template)

In [23]:
import concurrent.futures

# Function to get a summary of the chunk using map_chain.run
def get_chunk_summary(doc):
    return map_chain.run([doc])

# Make an empty list to hold your summaries
summary_list = []

# Function to process the documents concurrently
def process_docs_concurrently(docs):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit the tasks to the executor and store the future objects
        future_to_doc = {executor.submit(get_chunk_summary, doc): doc for doc in docs}
        for future in concurrent.futures.as_completed(future_to_doc):
            doc = future_to_doc[future]
            try:
                chunk_summary = future.result()
                # Append that summary to your list
                summary_list.append(chunk_summary)
            except Exception as e:

# Call the function to process the documents concurrently
process_docs_concurrently(docs)

Summary for doc page_content='THE MUSCLE & STRENGTH PYRAMID: NUTRITION\nstress you out eventually or at the very least take your time and energy away from more important things in life.\nWhat will stress you out even more than the process of trying to be overly detailed and accurate is what happens once you run out of energy to do this, and you can’t do it consistently anymore. Then, you start to bounce back between the extremes of losing control completely and overeating, and rigidly tracking until you lose it again. Living in the two extremes is something to avoid, and to do so we really want to make sure that we have a balance of these three factors.\nWe want to be only as accurate as we need to be in order to be consistent enough that we can get to our goals. Different goals are going to require different levels of accuracy, but they all require consistency, which means adapting your flexibility to your goal. In research, dietary restraint is highly associated with folks who can lo

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Thu, 03 Aug 2023 13:25:01 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7f0ee24c6f248e4a-TLV', 'alt-svc': 'h3=":443"; ma=86400'}.


Summary for doc page_content='To sum things up, you should really only be doing an extended diet to drop a weight class (acutely cutting 2% of your body weight or less is a different story) if you are a high-level lifter and doing so would give you a realistic shot at a national or international record or title, or qualify you for international competition.\nBut what about going up a weight class?\nWell, greater levels of muscle mass are highly associated with being stronger and having greater competitive success in powerlifting [20]. Meaning, progressing in your powerlifting career should come with hypertrophy. For those who began with low to moderate levels of body fat, this will often mean going up 1–2 weight classes over one’s career. How much weight you gain and thus, how many classes you end up competing in over the course of your career also depends on what age you began competing, and your genetic propensity for putting on muscle. Lastly, it also depends on whether you tend to 

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=60).


Summary for doc page_content='As discussed in the preface of this book, the Muscle and Strength Nutrition Pyramid was originally presented as a video series that I created for the 3D Muscle Journey YouTube channel back in 2013 (with the Training Pyramid in 2015). Many of the ideas are similar, and watching those videos may be useful for some people’s absorption of the material. However, be weary in that some of the information may be outdated, which is why I have created this updated guide with current recommendations in accordance with more recent scientific findings.\nContributors To The Pyramids\nHere are the links to access other resources available from myself, my coaching team, and my co-authors, Andrea and Andy. Without these ideas, videos and people, the creation of this pair of books would not have been possible.\nRESOURCES FROM 3D MUSCLE JOURNEY\nThe home base for all 5 of the crew from Team 3D Muscle Journey. Here you can find our podcast, blog articles, videos, and informat

In [41]:
bullets = []

for i, doc in enumerate(docs):
    for bullet in summary_list[i].split('-'):
        if len(bullet) > 0:
            bullet_doc =  Document(page_content=bullet.rstrip().strip(), metadata={"chunk": i})
            bullets.append(bullet_doc)

bullets

[Document(page_content='Accuracy and consistency are important factors in nutrition for achieving goals.', metadata={'chunk': 0}),
 Document(page_content='Different goals require different levels of accuracy.', metadata={'chunk': 0}),
 Document(page_content='Flexible dietary restraint is associated with successful weight loss and maintenance.', metadata={'chunk': 0}),
 Document(page_content='The level of accuracy needed depends on the specific situation and goal.', metadata={'chunk': 0}),
 Document(page_content='Consistency is important in achieving goals', metadata={'chunk': 1}),
 Document(page_content='Different goals require different levels of accuracy', metadata={'chunk': 1}),
 Document(page_content='Flexible dietary restraint is associated with successful weight loss and maintenance', metadata={'chunk': 1}),
 Document(page_content='The appropriate amount of flexibility depends on the situation and goal', metadata={'chunk': 1}),
 Document(page_content='Accuracy can lead to more co

In [42]:
# Import and instantiate OpenAI embeddings

from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

In [43]:
# Import and initialize Pinecone client

import os
import pinecone
from langchain.vectorstores import Pinecone


pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),  
    environment=os.getenv('PINECONE_ENV')  
)

In [45]:
# Upload vectors to Pinecone

index_name = "physiology-books-kb"
search = Pinecone.from_documents(bullets, embeddings, index_name=index_name)

In [46]:
# Do a simple vector similarity search

query = "What is muscle protein synthesis?"
result = search.similarity_search(query)

result

[Document(page_content='Muscle protein breakdown and muscle protein synthesis determine whether or not lean tissue is gained.', metadata={'chunk': 108.0}),
 Document(page_content='Muscle protein breakdown and synthesis determine whether lean tissue is gained or lost.', metadata={'chunk': 110.0}),
 Document(page_content='related differences in muscle protein synthesis in response to resistance exercise.', metadata={'chunk': 152.0}),
 Document(page_content='It is unclear whether the lower muscle protein synthesis rates during an energy deficit can be overcome by additional dietary protein or if it simply takes more protein to stimulate muscle protein synthesis to the same degree as in a surplus.', metadata={'chunk': 117.0})]