# Set up environment


## Install dependencies

In [1]:
!pip install -qU \
    marvin \
    llama-index \
    beautifulsoup4 \
    requests \
    pinecone-client \
    openai \
    llama-index-readers-youtube-transcript

## Set API Keys

In [2]:
# API Keys
import openai
import os
from pinecone import Pinecone, ServerlessSpec
from getpass import getpass
import sys
import os


# Environment variable -> prompt shown when that variable is not set yet.
# MARVIN_OPENAI_API_KEY is the variable name this notebook uses for Marvin's key.
_API_KEY_PROMPTS = (
    ("OPENAI_API_KEY", "Please enter your OpenAI API key: "),
    ("PINECONE_API_KEY", "Please enter your Pinecone API key: "),
    ("MARVIN_OPENAI_API_KEY", "Please enter your Marvin API key: "),
    ("CHATPDF_API_KEY", "Please enter your ChatPDF API key: "),
)

# Only prompt for keys that are missing (or empty), so re-running this cell or
# running the notebook with the variables pre-set neither re-prompts nor
# overwrites existing credentials.
for _env_name, _prompt in _API_KEY_PROMPTS:
    if not os.environ.get(_env_name):
        os.environ[_env_name] = getpass(_prompt)

KeyboardInterrupt: Interrupted by user

# Load and classify the data

## Import Blogs

In [None]:
# Define URLs to scrape
import requests

def generate_urls(base_url, category, start_page=1, max_pages=None, timeout=10):
    """Collect the paginated listing URLs that exist for one blog category.

    Pages ``{base_url}/category/{category}/page/{n}/`` are requested in order,
    starting at ``start_page``. Collection stops at the first page that does not
    answer with HTTP 200, at the first request that fails, or once ``max_pages``
    URLs have been collected.

    Args:
        base_url: Site root; a trailing slash is tolerated.
        category: Category slug used in the URL path.
        start_page: First page number to probe.
        max_pages: Maximum number of URLs to return; a falsy value means no limit.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        list[str]: The URLs that answered with HTTP 200, in page order.
    """
    # Avoid "//category/..." when the caller passes a root ending in "/".
    root = base_url.rstrip("/")
    urls = []
    page_number = start_page
    while True:
        url = f"{root}/category/{category}/page/{page_number}/"
        try:
            # A timeout keeps one unresponsive server from hanging the notebook.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as error:
            # Keep what was collected so far instead of aborting the whole crawl.
            print(f"Stopping pagination at {url}: {error}")
            break
        if response.status_code != 200:
            break  # Stop if the page does not exist
        urls.append(url)
        if max_pages and len(urls) >= max_pages:
            break
        page_number += 1
    return urls

def generate_simple_urls(base_url, max_pages=None, timeout=10):
    """Collect the paginated listing URLs ``{base_url}page/{n}/`` that exist.

    Pages are requested in order starting at page 1. Collection stops at the
    first page that does not answer with HTTP 200, at the first request that
    fails, or once ``max_pages`` URLs have been collected.

    Args:
        base_url: Listing root, with or without a trailing slash.
        max_pages: Maximum number of URLs to return; a falsy value means no limit.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        list[str]: The URLs that answered with HTTP 200, in page order.
    """
    # Without the separator a root such as ".../articles" would become ".../articlespage/1/".
    root = base_url if base_url.endswith("/") else base_url + "/"
    urls = []
    page_number = 1
    while True:
        url = f"{root}page/{page_number}/"
        try:
            # A timeout keeps one unresponsive server from hanging the notebook.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as error:
            # Keep what was collected so far instead of aborting the whole crawl.
            print(f"Stopping pagination at {url}: {error}")
            break
        if response.status_code != 200:
            break  # Stop if the page does not exist
        urls.append(url)
        if max_pages and len(urls) >= max_pages:
            break
        page_number += 1
    return urls

# Sites to crawl
base_url_aviva = "https://avivaromm.com"
base_url_brighten = "https://drbrighten.com/articles/"

# Probe each avivaromm.com category (up to 50 existing listing pages apiece),
# in the same order the per-category variables are listed below.
(
    urls_hormone,
    urls_menstrual,
    urls_conception,
    urls_pregnancy,
    urls_thyroid,
    urls_mood,
    urls_gut,
    urls_herbal,
) = [
    generate_urls(base_url_aviva, slug, max_pages=50)
    for slug in (
        "balance-your-hormones",
        "menstrual-sexual-health",
        "fertility-conception",
        "natural-pregnancy",
        "thyroid-support",
        "heal-mind-mood",
        "gut-immunity",
        "herbal-medicine",
    )
]

# drbrighten.com uses a single un-categorized article listing
urls_brighten = generate_simple_urls(base_url_brighten, max_pages=50)

# Every listing page to scrape, drbrighten.com first
all_urls = [
    *urls_brighten,
    *urls_hormone,
    *urls_menstrual,
    *urls_conception,
    *urls_pregnancy,
    *urls_thyroid,
    *urls_mood,
    *urls_gut,
    *urls_herbal,
]

In [None]:
import requests
from bs4 import BeautifulSoup
from llama_index.core import Document

def scrape_blog_posts(base_url, timeout=10):
    """Scrape every article linked from one listing page into Documents.

    For each ``<article>`` on the listing page, the text of its first ``<h2>``
    is used as the title and the ``href`` of its first ``<a>`` as the article
    URL. The article page is then fetched and the text of its
    ``div.entry-content`` becomes the document text.

    Args:
        base_url: URL of the listing page.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        list[Document]: A new list with one Document per article that could be
        scraped. Articles with a missing heading, link or content container, or
        whose page could not be fetched, are skipped.
    """
    # Collect into a local list: the function must not mutate (or return) a
    # notebook-level list, otherwise the caller's extend() appends it to itself.
    documents = []

    try:
        response = requests.get(base_url, timeout=timeout)
    except requests.RequestException as error:
        print(f"Skipping listing page {base_url}: {error}")
        return documents

    soup = BeautifulSoup(response.text, "html.parser")
    for article in soup.find_all("article"):
        heading = article.find("h2")
        link = article.find("a")
        # find() returns None when the tag is absent; skip incomplete teasers.
        if heading is None or link is None or not link.get("href"):
            continue
        title = heading.text.strip()
        url = link["href"]

        try:
            article_response = requests.get(url, timeout=timeout)
        except requests.RequestException as error:
            print(f"Skipping article {url}: {error}")
            continue
        article_soup = BeautifulSoup(article_response.text, "html.parser")
        body = article_soup.find("div", class_="entry-content")
        if body is None:
            continue
        content = body.text.strip()

        # Creating a new document for this article; the title is stored in the
        # metadata alongside the source URL.
        blog = Document(
            text=content,
            metadata={"title": title, "source": url, "document_type": "professional opinions"},
        )
        documents.append(blog)
    return documents

# Accumulate the documents scraped from every listing page
blogs_documents = []
for url in all_urls:
    blogs_documents += scrape_blog_posts(url)

In [None]:
# Sanity check: size of the blogs_documents list built by the scraping loop
len(blogs_documents)

## Import YouTube

In [None]:
from google.colab import drive
import pandas as pd
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
from urllib.parse import quote, urlparse, parse_qs
# Uncomment this line if you're running the code in Google Colab to mount your Google Drive.
# drive.mount('/content/drive')

# Read data from a Google Sheet, exported as CSV through the gviz endpoint
sheet_id = '14Y91UTR4VXngNDwAL0gaBaPSSy5kcXNBYiuQ3WQ5VZM'
sheet_name = 'more content' #replace
encoded_sheet_name = quote(sheet_name)  # Encoding the sheet name for URL usage
url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={encoded_sheet_name}'
# NOTE: pd.read_csv treats the first CSV row as the header by default, so that row
# is not part of `yturls`; pass header=None here if the sheet has no header row.
df = pd.read_csv(url)

# Extract the YouTube URLs from the first column. Blank cells are read as NaN
# (a float), which would crash URL parsing later, so drop them and keep strings.
yturls = df.iloc[:, 0].dropna().astype(str).tolist()

# Function to extract video IDs from YouTube URLs
def extract_video_id(url):
    """Return the YouTube video ID contained in ``url``, or ``None``.

    Recognised forms:
      * any URL carrying a ``v=<id>`` query parameter (``/watch?v=<id>``)
      * ``youtu.be/<id>`` short links
      * ``youtube.com`` / ``youtube-nocookie.com`` paths of the form
        ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>``

    Args:
        url: The link to inspect. Non-string values (for example NaN from an
            empty spreadsheet cell) yield ``None`` instead of raising.

    Returns:
        str | None: The video ID, or ``None`` when no ID can be found.
    """
    if not isinstance(url, str):
        return None

    parsed_url = urlparse(url.strip())

    # Standard watch links: the ID is the `v` query parameter.
    video_id = parse_qs(parsed_url.query).get('v', [None])[0]
    if video_id:
        return video_id

    host = (parsed_url.hostname or "").lower()
    segments = [segment for segment in parsed_url.path.split("/") if segment]

    # Short links: the ID is the first path segment.
    if host in ("youtu.be", "www.youtu.be"):
        return segments[0] if segments else None

    # Embed / shorts / live links: the ID follows the marker segment.
    if host.endswith(("youtube.com", "youtube-nocookie.com")):
        if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            return segments[1]

    return None

# Resolve every sheet entry to a video ID and discard entries that have none
video_ids = [
    candidate
    for candidate in map(extract_video_id, yturls)
    if candidate is not None
]

# Rebuild canonical watch URLs from the extracted IDs
ytlinks = [f"https://www.youtube.com/watch?v={video_id}" for video_id in video_ids]

# Load one transcript document per video link
loader = YoutubeTranscriptReader()
youtube_documents = loader.load_data(ytlinks=ytlinks)

In [None]:
# Sanity check: number of transcript documents returned by the loader
len(youtube_documents)

## Import Papers

In [57]:
import os
import requests
from llama_index.core import Document

# Folder (inside the current working directory) that holds the PDF files to ingest
directory_path = os.path.join(os.getcwd(), "academic_papers")

# ChatPDF credentials and endpoints; the key is read from the environment
api_key = os.environ.get("CHATPDF_API_KEY")
upload_url = "https://api.chatpdf.com/v1/sources/add-file"
query_url = "https://api.chatpdf.com/v1/chats/message"

# Uploads only send the API key; chat queries additionally declare a JSON body
upload_headers = {"x-api-key": api_key}
query_headers = {**upload_headers, "Content-Type": "application/json"}

# Prompts sent to ChatPDF for every uploaded paper
_SUMMARY_PROMPT = "give a long summary with key findings, any biology explanations, and implications."
_CITATION_PROMPT = "What is the article's MLA citation? Please only provide the citation in the response"


def _ask_chatpdf(source_id, prompt, timeout=120):
    """Send one user message about an uploaded source and return the reply text.

    Raises requests.HTTPError on a non-2xx answer and KeyError when the reply
    has no 'content' field, so a failed query never reaches the Document below.
    """
    payload = {
        'sourceId': source_id,
        'messages': [{'role': "user", 'content': prompt}],
    }
    reply = requests.post(query_url, headers=query_headers, json=payload, timeout=timeout)
    reply.raise_for_status()
    return reply.json()['content']


papers_documents = []
# Iterate over the PDF files in the directory; sorted() makes the processing
# order reproducible because os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(directory_path)):
    if not filename.lower().endswith(".pdf"):
        continue
    file_path = os.path.join(directory_path, filename)

    # Upload the PDF file; the handle is only needed for the upload itself.
    with open(file_path, 'rb') as file:
        files = [
            ('file', (filename, file, 'application/octet-stream'))
        ]
        response = requests.post(upload_url, headers=upload_headers, files=files, timeout=120)

    if response.status_code != 200:
        # Report the failure instead of silently dropping the paper.
        print(f'Upload failed for {filename}: {response.status_code} {response.text}')
        continue

    source_id = response.json()['sourceId']
    print(f'File uploaded successfully. Source ID: {source_id}')

    # Ask for the summary and the citation; skip the paper if either query fails.
    try:
        summary = _ask_chatpdf(source_id, _SUMMARY_PROMPT)
        citation = _ask_chatpdf(source_id, _CITATION_PROMPT)
    except (requests.RequestException, KeyError, ValueError) as error:
        print(f'Skipping {filename}: ChatPDF query failed ({error!r})')
        continue

    papers_documents.append(
        Document(text=summary, metadata={"source": citation, "document_type": "academia"})
    )

File uploaded successfully. Source ID: src_gpvbPVwZFmOhGU6qcVzf2
File uploaded successfully. Source ID: src_u9AgbQt1U2Je3wmZqpLKm


In [58]:
# Sanity check: number of paper documents appended by the upload loop
len(papers_documents)

2

## Classify documents and store in separate indices

In [None]:
def create_category_dict(documents, youtube=False):
    """Classify documents and group them by category.

    Each document's text is classified with ``classify_content``. A new Document
    is created that keeps the original metadata (for example ``source`` and
    ``document_type``) and adds the assigned ``categories``. The new document is
    listed under every category it was assigned.

    Args:
        documents: Iterable of objects exposing ``.text`` and ``.metadata``.
        youtube: When true, the documents are treated as YouTube transcripts:
            ``document_type`` is set to "professional opinions" and ``source``
            to the watch URL built from ``metadata['video_id']``.

    Returns:
        dict: Maps each category to the list of new Documents assigned to it.
    """
    category_documents = {}
    for document in documents:
        content = document.text
        categories = classify_content(content)

        # Start from a copy of the original metadata so provenance set when the
        # document was created is not lost by re-wrapping it.
        metadata = dict(document.metadata or {})
        metadata["categories"] = categories
        if youtube:
            metadata["document_type"] = "professional opinions"
            metadata["source"] = f"https://www.youtube.com/watch?v={document.metadata['video_id']}"

        # Creating a new document for this article
        new_document = Document(text=content, metadata=metadata)

        # Categorize this document under each category it belongs to
        for category in categories:
            category_documents.setdefault(category, []).append(new_document)
    return category_documents

In [None]:
import marvin
# Function to classify the content and assign categories
def classify_content(content, labels=None, instructions=None):
    """Classify ``content`` with ``marvin.classify`` and return the result as a list.

    Args:
        content: Text to classify.
        labels: Candidate categories. Defaults to the notebook's four
            categories ('fitness/wellness', 'mood/feeling', 'diet/nutrition',
            'general').
        instructions: Extra guidance passed to ``marvin.classify``. When both
            ``labels`` and ``instructions`` are omitted, the default guidance
            written for the default categories is used; with custom labels and
            no instructions, no guidance is sent.

    Returns:
        list: The assigned categories; a single result is wrapped in a list.
    """
    using_default_labels = labels is None
    if using_default_labels:
        labels = ['fitness/wellness', 'mood/feeling', 'diet/nutrition', 'general']
    # The default guidance refers to the default label set, so it is only
    # applied when that label set is in use.
    if instructions is None and using_default_labels:
        instructions = "Classify the content into the first three provided categories. If it doesn't fit any category, assign it to 'general'."

    categories = marvin.classify(content, labels, instructions=instructions)

    # Callers iterate over the result, so always hand back a list.
    if isinstance(categories, (list, tuple)):
        return list(categories)
    return [categories]

In [None]:
def print_category_dict(category_dict):
    """Print one summary line per category with its document count.

    Args:
        category_dict: Mapping of category name to the list of documents
            assigned to that category.
    """
    for category, docs in category_dict.items():
        # Only the category name belongs in the line; interpolating the whole
        # mapping here would dump every document on every line.
        print(f"Category '{category}': {len(docs)} documents")

In [None]:
# Classify the scraped blog documents, then print a summary line per category
category_blogs = create_category_dict(blogs_documents)
print_category_dict(category_blogs)

In [None]:
# Classify the YouTube transcripts (youtube=True), then print a summary line per category
category_youtube = create_category_dict(youtube_documents, True)
print_category_dict(category_youtube)

In [None]:
# Classify the paper summaries, then print a summary line per category
category_papers = create_category_dict(papers_documents)
print_category_dict(category_papers)

# Upsert to Pinecone


In [45]:
from llama_index.core import VectorStoreIndex
from pinecone.grpc import PineconeGRPC
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import StorageContext, load_index_from_storage
import os

# Initialize Pinecone Connection
# The API key is read from the PINECONE_API_KEY environment variable
# (os.environ.get returns None when it is not set).
pc = PineconeGRPC(api_key= os.environ.get("PINECONE_API_KEY"))

def upsert_documents(category, category_dict):
    """Index one category's documents into its Pinecone index.

    The index name is ``moonsync-index-<category>``, with the category
    lower-cased and '/' and ' ' replaced by '-'. The index handle is obtained
    with ``pc.Index``; this function does not create the index.

    Args:
        category: Category whose documents should be upserted.
        category_dict: Mapping of category name to a list of documents.

    Returns:
        The ``VectorStoreIndex`` built over the Pinecone vector store, or
        ``None`` when the category has no documents (nothing is sent then).
    """
    # Format category name for use in Pinecone index naming
    formatted_category = category.replace('/', '-').replace(' ', '-').lower()
    index_name = f"moonsync-index-{formatted_category}"

    # Retrieve documents for the current category
    documents = category_dict.get(category, [])
    if not documents:
        # Nothing to upsert: avoid opening an index connection for no work.
        print(f"{index_name}: no documents to upsert")
        return None

    pinecone_index = pc.Index(index_name)
    print(index_name)

    # Initialize VectorStore
    vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Build the vector index over the documents and hand it back so the caller
    # can query it without reconnecting.
    return VectorStoreIndex.from_documents(
        documents, storage_context=storage_context
    )

In [56]:
def print_upsert_nodes(category_dict):
    """Print, for every category in ``category_dict``, how many documents it holds."""
    for category in category_dict:
        document_count = len(category_dict[category])
        print(f"Category '{category}': {document_count} documents")

'i3OYlaoj-BM'

In [None]:
# Upsert each blog category into its Pinecone index, then print per-category counts
for category in category_blogs:
    upsert_documents(category, category_blogs)

print_upsert_nodes(category_blogs)

In [None]:
# Upsert each YouTube category into its Pinecone index, then print per-category counts
for category in category_youtube:
    upsert_documents(category, category_youtube)

print_upsert_nodes(category_youtube)

In [None]:
# Upsert each paper category into its Pinecone index, then print per-category counts
for category in category_papers:
    upsert_documents(category, category_papers)

print_upsert_nodes(category_papers)