# Set up environment


## Install dependencies

In [None]:
!pip install -qU \
    marvin \
    llama-index \
    beautifulsoup4 \
    requests \
    pinecone-client \
    openai \
    llama-index-readers-youtube-transcript

## Set API Keys

In [None]:
# API Keys
import openai
import os
from pinecone import Pinecone, ServerlessSpec
from getpass import getpass
import sys
import os

os.environ["OPENAI_API_KEY"] = getpass("Please enter your OpenAI API key: ")
os.environ["PINECONE_API_KEY"] = getpass("Please enter your Pinecone API key: ")
os.environ["MARVIN_OPENAI_API_KEY"] = getpass("Please enter your Marvin API key: ")
os.environ["CHATPDF_API_KEY"] = getpass("Please enter your ChatPDF API key: ")

# Load and classify the data

## Import Blogs

In [None]:
# Define URLs to scrape
import requests

def generate_urls(base_url, category, start_page=1, max_pages=None):
    urls = []
    page_number = start_page
    while True:
        url = f"{base_url}/category/{category}/page/{page_number}/"
        # Check if the page exists:
        response = requests.get(url)
        if response.status_code == 200:
            urls.append(url)
            if max_pages and len(urls) >= max_pages:
                break
        else:
            break  # Stop if the page does not exist
        page_number += 1
    return urls

def generate_simple_urls(base_url, max_pages=None):
    urls = []
    page_number = 1
    while True:
        url = f"{base_url}page/{page_number}/"
        # Check if the page exists:
        response = requests.get(url)
        if response.status_code == 200:
            urls.append(url)
            if max_pages and len(urls) >= max_pages:
                break
        else:
            break  # Stop if the page does not exist
        page_number += 1
    return urls

# Example usage
base_url_aviva = "https://avivaromm.com"
base_url_brighten = "https://drbrighten.com/articles/"

# Generate URLs while checking for actual page existence
urls_hormone = generate_urls(base_url_aviva, "balance-your-hormones", max_pages=50)
urls_menstrual = generate_urls(base_url_aviva, "menstrual-sexual-health", max_pages=50)
urls_conception = generate_urls(base_url_aviva, "fertility-conception", max_pages=50)
urls_pregnancy = generate_urls(base_url_aviva, "natural-pregnancy", max_pages=50)
urls_thyroid = generate_urls(base_url_aviva, "thyroid-support", max_pages=50)
urls_mood = generate_urls(base_url_aviva, "heal-mind-mood", max_pages=50)
urls_gut = generate_urls(base_url_aviva, "gut-immunity", max_pages=50)
urls_herbal = generate_urls(base_url_aviva, "herbal-medicine", max_pages=50)

urls_brighten = generate_simple_urls(base_url_brighten, max_pages=50)

all_urls = urls_brighten + urls_hormone + urls_menstrual + urls_conception + urls_pregnancy + urls_thyroid + urls_mood + urls_gut + urls_herbal

In [None]:
import requests
from bs4 import BeautifulSoup
from llama_index.core import Document

def scrape_blog_posts(base_url):
    session = requests.Session()
    response = session.get(base_url)
    soup = BeautifulSoup(response.text, "html.parser")
    blogs_documents = []

    for article in soup.find_all("article"):  # Limit the number of articles
        title = article.find("h2").text.strip()
        url = article.find("a")["href"]

        article_response = session.get(url)
        article_soup = BeautifulSoup(article_response.text, "html.parser")
        content = article_soup.find("div", class_="entry-content").text.strip()

        # Creating a new document for this article
        blog = Document(title=title, text=content, metadata={"source": url, "document_type": "professional opinions"})
        blogs_documents.append(blog)  # Use append for single items

    session.close()
    return blogs_documents

blogs_documents = []
for url in all_urls:
    blogs_documents += scrape_blog_posts(url)  # Use += for appending lists


In [None]:
len(blogs_documents)

708

In [None]:
blogs_documents[0].metadata

{'source': 'https://drbrighten.com/endometriosis-relief-naturally/',
 'document_type': 'professional opinions'}

## Import Youtube

In [None]:
from google.colab import drive
import pandas as pd
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
from urllib.parse import quote, urlparse, parse_qs
# Uncomment this line if you're running the code in Google Colab to mount your Google Drive.
# drive.mount('/content/drive')

# Read data from a Google Sheet
sheet_id = '14Y91UTR4VXngNDwAL0gaBaPSSy5kcXNBYiuQ3WQ5VZM'
sheet_name = 'professional opinions' #replace
encoded_sheet_name = quote(sheet_name)  # Encoding the sheet name for URL usage
url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={encoded_sheet_name}'
df = pd.read_csv(url)

# Extract the YouTube URLs from the first column (assuming no header, uncomment the next line if there is a header)
yturls = df.iloc[:, 0].tolist()

# Function to extract video IDs from YouTube URLs
def extract_video_id(url):
    parsed_url = urlparse(url)
    video_id = parse_qs(parsed_url.query).get('v', [None])[0]
    return video_id

video_ids = [extract_video_id(url) for url in yturls]

# Filter out any None values from video_ids
video_ids = [video_id for video_id in video_ids if video_id is not None]

# Correct the creation of YouTube URLs, removing the incorrect escaping of backslashes
ytlinks = [f"https://www.youtube.com/watch?v={video_id}" for video_id in video_ids]


# Collect categorized documents from academic videos
loader = YoutubeTranscriptReader()
youtube_documents = loader.load_data(ytlinks=ytlinks)

In [None]:
youtube_documents[20].metadata

{'video_id': 'Lsp4-JDmW44'}

In [None]:
len(youtube_documents)

122

## Import Papers

In [None]:
import os
import requests
from llama_index.core import Document

# Step 1: Define the directory path where your PDF files are stored
directory_path = os.path.join(os.getcwd(), "academic_papers")

# Step 2: API setup
upload_url = "https://api.chatpdf.com/v1/sources/add-file"
query_url = "https://api.chatpdf.com/v1/chats/message"
api_key = os.environ.get("CHATPDF_API_KEY")

# Step 3: Headers for the API requests
upload_headers = {
    'x-api-key': api_key
}
query_headers = {
    'x-api-key': api_key,
    "Content-Type": "application/json"
}

papers_documents = []

# Step 4: Iterate over PDF files in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".pdf"):
        file_path = os.path.join(directory_path, filename)

        # Step 5: Upload the PDF file
        with open(file_path, 'rb') as file:
            files = [
                ('file', (filename, file, 'application/octet-stream'))
            ]
            response = requests.post(upload_url, headers=upload_headers, files=files)
            if response.status_code == 200:
                source_id = response.json()['sourceId']
                print(f'File uploaded successfully. Source ID: {source_id}')

                # Step 6: Prepare and send a single query with multiple requests
                data = {
                    'sourceId': source_id,
                    'messages': [
                        {
                            'role': "user",
                            'content': "give a long summary with key findings, any biology explanations, and implications."
                        }
                    ]
                }
                citation_data = {
                    'sourceId': source_id,
                    'messages': [
                        {
                            'role': "user",
                            'content': "What is the article's MLA citation? Please only provide the citation in the response"
                        }
                    ]
                }

                try:
                    response = requests.post(query_url, headers=query_headers, json=data)
                    citation = requests.post(query_url, headers=query_headers, json=citation_data)

                    if 'content' in response.json() and 'content' in citation.json():
                        papers_documents.append(Document(text=response.json()['content'], metadata={"source": citation.json()['content'], "document_type": "academia"}))
                    else:
                        print(f"Skipping file {filename} due to missing 'content' key in response or citation.")

                except KeyError as e:
                    print(f"KeyError occurred for file {filename}: {str(e)}. Skipping to the next file.")
                    continue

File uploaded successfully. Source ID: src_hNzDcfm9Nw5flDfFAl8Du
File uploaded successfully. Source ID: src_yU0MxZ6DMPgeniCd32CKC
File uploaded successfully. Source ID: src_jgnhyEk0nwXZrQYG09rI4
File uploaded successfully. Source ID: src_beyK6zAxc18J2YmU6VYWx
File uploaded successfully. Source ID: src_n10b5S5ApiOaOgYKoKsu5
File uploaded successfully. Source ID: src_DEpLi9BITny4gfXRZ2iBh
File uploaded successfully. Source ID: src_VZIzK6PuLP0vscP3pumiJ
File uploaded successfully. Source ID: src_OXdpf3HgJDpI8k4lOOoLN
File uploaded successfully. Source ID: src_xofPuJWpcpb7UPUkGuvEA
File uploaded successfully. Source ID: src_k1S1dIe8pfZNFKuBOGbKL
File uploaded successfully. Source ID: src_8vUSVPKucvqOR5t3JMPUi
File uploaded successfully. Source ID: src_tBA4PBhXVRQxsNRfUlQlK
File uploaded successfully. Source ID: src_pQxs8WucJZtyQcR6yUaUH
File uploaded successfully. Source ID: src_Wo0orH8k4vv2NJigB8o6Q
File uploaded successfully. Source ID: src_4eo5b55NLS6JItgiXAD5Y
File uploaded successfull

KeyError: 'content'

In [None]:
# import after 133 where it broke
import os
import requests
from llama_index.core import Document

# Step 1: Define the directory path where your PDF files are stored
directory_path = os.path.join(os.getcwd(), "academic_papers")

# Step 2: API setup
upload_url = "https://api.chatpdf.com/v1/sources/add-file"
query_url = "https://api.chatpdf.com/v1/chats/message"
api_key = os.environ.get("CHATPDF_API_KEY")

# Step 3: Headers for the API requests
upload_headers = {
    'x-api-key': api_key
}
query_headers = {
    'x-api-key': api_key,
    "Content-Type": "application/json"
}

papers_documents = []
files_processed = 0

# Step 4: Iterate over PDF files in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".pdf"):
        if files_processed >= 239:
            file_path = os.path.join(directory_path, filename)

            # Step 5: Upload the PDF file
            with open(file_path, 'rb') as file:
                files = [
                    ('file', (filename, file, 'application/octet-stream'))
                ]
                response = requests.post(upload_url, headers=upload_headers, files=files)
                if response.status_code == 200:
                    source_id = response.json()['sourceId']
                    print(f'File uploaded successfully. Source ID: {source_id}')

                    # Step 6: Prepare and send a single query with multiple requests
                    data = {
                        'sourceId': source_id,
                        'messages': [
                            {
                                'role': "user",
                                'content': "give a long summary with key findings, any biology explanations, and implications."
                            }
                        ]
                    }
                    citation_data = {
                        'sourceId': source_id,
                        'messages': [
                            {
                                'role': "user",
                                'content': "What is the article's MLA citation? Please only provide the citation in the response"
                            }
                        ]
                    }

                    try:
                        response = requests.post(query_url, headers=query_headers, json=data)
                        citation = requests.post(query_url, headers=query_headers, json=citation_data)

                        if 'content' in response.json() and 'content' in citation.json():
                            papers_documents.append(Document(text=response.json()['content'], metadata={"source": citation.json()['content'], "document_type": "academia"}))
                        else:
                            print(f"Skipping file {filename} due to missing 'content' key in response or citation.")

                    except KeyError as e:
                        print(f"KeyError occurred for file {filename}: {str(e)}. Skipping to the next file.")
                        continue
        else:
            files_processed += 1

File uploaded successfully. Source ID: src_ke4L9jgF5fxLdtKh75PtW
File uploaded successfully. Source ID: src_cC8GOol2NIn0TPK81skqm
File uploaded successfully. Source ID: src_rJiXqxhtVYzIb4qg6eep9
Skipping file fgwh-03-910220.pdf due to missing 'content' key in response or citation.
File uploaded successfully. Source ID: src_fS0obtjrZtpojycGporBy
Skipping file pone.0212673.pdf due to missing 'content' key in response or citation.
File uploaded successfully. Source ID: src_cbExnj1l3cT5SLOe9FkYK
Skipping file 1984-0462-rpp-40-e2020494.pdf due to missing 'content' key in response or citation.
File uploaded successfully. Source ID: src_LaiLbvSLXeIQ5f5Pi9opq
Skipping file jcm-11-03222.pdf due to missing 'content' key in response or citation.
File uploaded successfully. Source ID: src_6aburFBh6VUTjApJJk6dQ
Skipping file s41746-019-0152-7.pdf due to missing 'content' key in response or citation.


In [None]:
len(papers_documents)

2

## Import Tiktok

In [1]:
# Import tiktoks and ig reels from twelve labs output
from google.colab import drive
import pandas as pd
from llama_index.core import Document
# Uncomment this line if you're running the code in Google Colab to mount your Google Drive.
# drive.mount('/content/drive')

# Read data from a Google Sheet
sheet_id = '14Y91UTR4VXngNDwAL0gaBaPSSy5kcXNBYiuQ3WQ5VZM'
sheet_name = 'empirical evidence' #replace
encoded_sheet_name = quote(sheet_name)  # Encoding the sheet name for URL usage
url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={encoded_sheet_name}'
df = pd.read_csv(url)

# Extract the YouTube URLs from the first column (assuming no header, uncomment the next line if there is a header)
tiktok_documents = []

for doc in df:  # Limit the number of articles
        # Creating a new document for this article
        doc = Document(title=df.title, text=df.content, metadata={"source": df.url, "document_type": "empirical evidence"})
        tiktok_documents.append(doc)  # Use append for single items

session.close()

## Classify documents and store in seperate indicies

In [None]:
from llama_index.core import Document

def create_category_dict(documents, youtube = False):
  category_documents = {}
  for document in documents:
      content = document.text
      categories = classify_content(content)
      # Creating a new document for this article
      if youtube == True:
          new_document = Document(text=content, metadata={"categories": categories, "document_type": "professional opinions", "source": f"https://www.youtube.com/watch?v={document.metadata['video_id']}"})
      else:
        new_document = Document(text=content, metadata={"categories": categories, "source": document.metadata['source'], "document_type": document.metadata['document_type']})

      # Categorize this document under each category it belongs to
      for category in categories:
          if category not in category_documents:
              category_documents[category] = []
          category_documents[category].append(new_document)
  return category_documents

In [None]:
import marvin
# Function to classify the content and assign categories
def classify_content(content):
    categories = marvin.classify(
        content,
        ['fitness/wellness', 'mood/feeling', 'diet/nutrition', 'general'],
        instructions="Classify the content into the first three provided categories. If it doesn't fit any category, assign it to 'general'."
    )
    if not isinstance(categories, list):
        categories = [categories]
    return categories

In [None]:
def print_category_dict(category_dict):
  for key, value in category_dict.items():
    print (f"{key}: {len(value)} documents")

In [None]:
category_blogs = create_category_dict(blogs_documents)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    await self._connection.aclose()
  File "/usr/local/lib/python3.10/dist-packages/httpcore/_async/http11.py", line 265, in aclose
    await self._network_stream.aclose()
  File "/usr/local/lib/python3.10/dist-packages/httpcore/_backends/anyio.py", line 55, in aclose
    await self._stream.aclose()
  File "/usr/local/lib/python3.10/dist-packages/anyio/streams/tls.py", line 193, in aclose
    await self.transport_stream.aclose()
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 1261, in aclose
    self._transport.close()
  File "/usr/lib/python3.10/asyncio/selector_events.py", line 706, in close
    self._loop.call_soon(self._call_connection_lost, None)
  File "/usr/lib/python3.10/asyncio/base_events.py", line 753, in call_soon
    self._check_closed()
  File "/usr/lib/python3.10/asyncio/base_events.py", line 515, in _check_closed
    raise RuntimeError('Event loop is closed')
RuntimeError

In [None]:
print_category_dict(category_blogs)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
category_youtube = create_category_dict(youtube_documents, True)

In [None]:
# prompt: get the first element in the first key from category_youtube

print(category_youtube[list(category_youtube.keys())[0]][0])


Doc ID: c0a38b78-2ec1-4a2f-9409-e3ded0e24c63
Text: well hey guys i'm a board-certified dermatologist and in this
video we're going to be going over eight signs of polycystic ovary
syndrome or pcos this is a pretty common condition that affects women
of reproductive age it's characterized by elevated levels of the male
hormones androgens as well as irregular periods and in some cases the
ovaries ...


In [None]:
print_category_dict(category_blogs)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
category_papers = create_category_dict(papers_documents)

In [None]:
for key, value in category_papers.items():
    print (f"{key}: {len(value)} documents")

general: 2 documents


In [None]:
len(papers_documents)

133

In [None]:
category_tiktoks = create_category_dict(tiktok_documents)

# Upsert to Pinecone


In [None]:
from llama_index.core import VectorStoreIndex
from pinecone.grpc import PineconeGRPC
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import StorageContext, load_index_from_storage
import os

# Initialize Pinecone Connection
pc = PineconeGRPC(api_key= os.environ.get("PINECONE_API_KEY"))

def upsert_documents(category, category_dict):
    # Format category name for use in Pinecone index naming
    formatted_category = category.replace('/', '-').replace(' ', '-').lower()
    index_name = f"moonsync-index-{formatted_category}"
    pinecone_index = pc.Index(index_name)
    print(index_name)

    # Initialize VectorStore
    vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Retrieve documents for the current category
    documents = category_dict.get(category, [])

    # Initialize the vector index
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context
        )

In [None]:
#!pip install protoc_gen_openapiv2
#!pip install llama-index-vector-stores-pinecone

In [None]:
def print_upsert_nodes(category_dict):
  for category, docs in category_dict.items():
    print(f"Category '{category}': {len(docs)} documents")

In [None]:
for category in category_blogs:
    upsert_documents(category, category_blogs)

print_upsert_nodes(category_blogs)

moonsync-index-diet-nutrition


Upserted vectors:   0%|          | 0/842 [00:00<?, ?it/s]

moonsync-index-fitness-wellness


Upserted vectors:   0%|          | 0/1714 [00:00<?, ?it/s]

moonsync-index-general


Upserted vectors:   0%|          | 0/314 [00:00<?, ?it/s]

moonsync-index-mood-feeling


Upserted vectors:   0%|          | 0/374 [00:00<?, ?it/s]

Category 'diet/nutrition': 209 documents
Category 'fitness/wellness': 386 documents
Category 'general': 49 documents
Category 'mood/feeling': 64 documents


ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-3289' coro=<AsyncClient.aclose() done, defined at /usr/local/lib/python3.10/dist-packages/httpx/_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/httpx/_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "/usr/local/lib/python3.10/dist-packages/httpx/_transports/default.py", line 385, in aclose
    await self._pool.aclose()
  File "/usr/local/lib/python3.10/dist-packages/httpcore/_async/connection_pool.py", line 313, in aclose
    await self._close_connections(closing_connections)
  File "/usr/local/lib/python3.10/dist-packages/httpcore/_async/connection_pool.py", line 305, in _close_connections
    await connection.aclose()
  File "/usr/local/lib/python3.10/dist-packages/httpcore/_async/connection.py", line 171, in aclose
    await self._connection.aclose()
  File "/usr/local/lib/pyt

In [None]:
for category in category_youtube:
    upsert_documents(category, category_youtube)

print_upsert_nodes(category_youtube)

moonsync-index-fitness-wellness


Upserted vectors:   0%|          | 0/236 [00:00<?, ?it/s]

moonsync-index-general


Upserted vectors:   0%|          | 0/296 [00:00<?, ?it/s]

moonsync-index-mood-feeling


Upserted vectors:   0%|          | 0/97 [00:00<?, ?it/s]

moonsync-index-diet-nutrition


Upserted vectors:   0%|          | 0/65 [00:00<?, ?it/s]

Category 'fitness/wellness': 48 documents
Category 'general': 27 documents
Category 'mood/feeling': 27 documents
Category 'diet/nutrition': 20 documents


In [None]:
for category in category_papers:
    upsert_documents(category, category_papers)

print_upsert_nodes(category_papers)

moonsync-index-general


Upserted vectors:   0%|          | 0/2 [00:00<?, ?it/s]

Category 'general': 2 documents


In [None]:
for category in category_tiktoks:
    upsert_documents(category, category_tiktoks)

print_upsert_nodes(category_tiktoks)