In [8]:
!pip install -qU \
    google-cloud-aiplatform \
    google-cloud-storage \
    llama-index \
    llama-index-embeddings-vertex \
    llama-index-llms-vertex \
    llama-index-vector_stores-vertexaivectorsearch \
    llama-index-llms-fireworks \
    llama-index-embeddings-huggingface

In [9]:
# Imports
import os

from google.cloud import aiplatform, storage
from llama_index.core import (
    Document,
    PromptTemplate,
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    SummaryIndex,
    VectorStoreIndex,
)
from llama_index.core.agent import ReActAgent
from llama_index.core.base.base_query_engine import BaseQueryEngine
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.objects import ObjectIndex
from llama_index.core.prompts import LangchainPromptTemplate
from llama_index.core.prompts.base import BasePromptTemplate
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.embeddings.vertex import VertexTextEmbedding
from llama_index.llms.vertex import Vertex
from llama_index.vector_stores.vertexaivectorsearch import VertexAIVectorStore

In [10]:
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


In [11]:

import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [3]:
from llama_index.llms.vertex import Vertex

In [4]:
PROJECT_ID = "angelic-bee-193823"  
LOCATION = "us-central1"  

import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION)

In [6]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
llm = Vertex("gemini-pro")

Settings.embed_model = embed_model
Settings.llm = llm

In [11]:
from typing import Dict, List
from llama_index.core.tools import FunctionTool
import requests
import json
from bs4 import BeautifulSoup

def get_devfest_schedule() -> Dict[str, List[Dict]]:
    """
    Scrape and return the DevFest Lagos schedule as JSON data.
    Returns a dictionary with days as keys and lists of session information as values.
    
    Example output:
    {
        "day1": [
            {
                "title": "Opening Keynote",
                "speaker": "John Doe",
                "time": "9:00 AM",
                "track": "General",
                "room": "Main Hall",
                "session_type": "General"
            },
            ...
        ],
        "day2": [...]
    }
    """
    # Initialize schedule structure
    schedule = {
        'day1': [],
        'day2': []
    }
    
    try:
        # Set up session and get page content
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        })
        
        # Get HTML content
        response = session.get("https://devfestlagos.com/schedule")
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find schedule container
        schedule_container = soup.find('div', class_='schedule_scheduleItemsContainer__wkWNt')
        
        if schedule_container:
            # Extract general sessions
            general_events = schedule_container.find_all('div', class_='EventBlock_event__UsJua')
            for event in general_events:
                session_data = {
                    'title': event.find('h3').text.strip(),
                    'time': event.find('div', class_='EventBlock_time__RQGQz').text.strip() if event.find('div', class_='EventBlock_time__RQGQz') else "Time not specified",
                    'room': event.find('div', class_='EventBlock_venue__wjpVu').find('span').text.strip() if event.find('div', class_='EventBlock_venue__wjpVu') and event.find('div', class_='EventBlock_venue__wjpVu').find('span') else "Main Hall",
                    'speaker': "N/A",
                    'track': "General",
                    'session_type': "General",
                    'day': "Day 1"
                }
                schedule['day1'].append(session_data)
            
            # Extract breakout sessions
            breakout_container = schedule_container.find('div', class_='EventCategory_eventSchedule__events__cCu22')
            if breakout_container:
                breakout_events = breakout_container.find_all('div', class_='EventCategory_eventSchedule__event__AhbY3')
                for event in breakout_events:
                    session_data = {
                        'title': event.find('h3', class_='EventCategory_eventSchedule__event-title__F2air').text.strip() if event.find('h3', class_='EventCategory_eventSchedule__event-title__F2air') else "No Title",
                        'speaker': event.find('p', class_='EventCategory_eventSchedule__event-facilitator__nWvuU').text.strip() if event.find('p', class_='EventCategory_eventSchedule__event-facilitator__nWvuU') else "Not specified",
                        'time': event.find('div', class_='EventCategory_eventSchedule__event-time__f_zfq').find('span', class_='text-sm').text.strip() if event.find('div', class_='EventCategory_eventSchedule__event-time__f_zfq') else "Time not specified",
                        'room': "Breakout Room",
                        'track': "Breakout",
                        'session_type': "Breakout",
                        'day': "Day 1"
                    }
                    schedule['day1'].append(session_data)
    
    except Exception as e:
        print(f"Error scraping schedule: {str(e)}")
        
    return schedule

# Create the Llama Index Function Tool
devfest_schedule_tool = FunctionTool.from_defaults(
    fn=get_devfest_schedule,
    name="get_devfest_schedule",
    description="Get the complete DevFest Lagos schedule as JSON data"
)

# Example usage in Jupyter notebook
if __name__ == "__main__":
    # Get the schedule
    schedule_data = get_devfest_schedule()
    
    # Display as JSON in notebook
    from IPython.display import JSON, display
    display(JSON(schedule_data))
    
    # Save to file
    with open('devfest_schedule.json', 'w', encoding='utf-8') as f:
        json.dump(schedule_data, f, ensure_ascii=False, indent=2)

<IPython.core.display.JSON object>

In [6]:
# Example usage in Jupyter notebook
if __name__ == "__main__":
    # Get the schedule
    schedule_data = get_devfest_schedule()
    
    # Display as JSON in notebook
    from IPython.display import JSON, display
    display(JSON(schedule_data))
    
    # Save to file
    with open('devfest_schedule.json', 'w', encoding='utf-8') as f:
        json.dump(schedule_data, f, ensure_ascii=False, indent=2)

<IPython.core.display.JSON object>

In [7]:
schedule_data

{'day1': [{'title': 'Registration',
   'time': '8:00 AM - 9:00 AM',
   'room': 'outside',
   'speaker': 'N/A',
   'track': 'General',
   'session_type': 'General',
   'day': 'Day 1'},
  {'title': 'Welcome to DevFest Lagos, Ground Rules and Safety',
   'time': '9:05 AM - 9:10 AM',
   'room': 'room 1',
   'speaker': 'N/A',
   'track': 'General',
   'session_type': 'General',
   'day': 'Day 1'},
  {'title': 'Keynote',
   'time': '9:15 AM - 9:45 AM',
   'room': 'room 1',
   'speaker': 'N/A',
   'track': 'General',
   'session_type': 'General',
   'day': 'Day 1'},
  {'title': 'TBD',
   'time': '9:50 AM - 10:30 AM',
   'room': 'room 1',
   'speaker': 'N/A',
   'track': 'General',
   'session_type': 'General',
   'day': 'Day 1'},
  {'title': 'Building Resilience Systems - Flexibility and Adaptability',
   'time': '9:50 AM - 10:30 AM',
   'room': 'room 2',
   'speaker': 'N/A',
   'track': 'General',
   'session_type': 'General',
   'day': 'Day 1'},
  {'title': 'TBD',
   'time': '10:35 AM - 11:

In [14]:
from typing import Dict, List
from llama_index.core.tools import FunctionTool
from llama_index.core.schema import Document
import requests
import json
from bs4 import BeautifulSoup
from dataclasses import dataclass
from IPython.display import JSON, display

@dataclass
class DevFestSession:
    """Data class for storing session information"""
    title: str
    speaker: str
    time: str
    track: str
    day: str
    room: str
    session_type: str

def get_devfest_schedule() -> Dict[str, List[Dict]]:
    """
    Scrape and return the DevFest Lagos schedule as JSON data.
    Returns a dictionary with days as keys and lists of session information as values.
    """
    # Initialize schedule structure
    schedule = {
        'day1': [],
        'day2': []
    }
    
    try:
        # Set up session and get page content
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        })
        
        # Get HTML content
        response = session.get("https://devfestlagos.com/schedule")
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find schedule container
        schedule_container = soup.find('div', class_='schedule_scheduleItemsContainer__wkWNt')
        
        if schedule_container:
            # Extract general sessions
            general_events = schedule_container.find_all('div', class_='EventBlock_event__UsJua')
            for event in general_events:
                session_data = {
                    'title': event.find('h3').text.strip(),
                    'time': event.find('div', class_='EventBlock_time__RQGQz').text.strip() if event.find('div', class_='EventBlock_time__RQGQz') else "Time not specified",
                    'room': event.find('div', class_='EventBlock_venue__wjpVu').find('span').text.strip() if event.find('div', class_='EventBlock_venue__wjpVu') and event.find('div', class_='EventBlock_venue__wjpVu').find('span') else "Main Hall",
                    'speaker': "N/A",
                    'track': "General",
                    'session_type': "General",
                    'day': "Day 1"
                }
                schedule['day1'].append(session_data)
            
            # Extract breakout sessions
            breakout_container = schedule_container.find('div', class_='EventCategory_eventSchedule__events__cCu22')
            if breakout_container:
                breakout_events = breakout_container.find_all('div', class_='EventCategory_eventSchedule__event__AhbY3')
                for event in breakout_events:
                    session_data = {
                        'title': event.find('h3', class_='EventCategory_eventSchedule__event-title__F2air').text.strip() if event.find('h3', class_='EventCategory_eventSchedule__event-title__F2air') else "No Title",
                        'speaker': event.find('p', class_='EventCategory_eventSchedule__event-facilitator__nWvuU').text.strip() if event.find('p', class_='EventCategory_eventSchedule__event-facilitator__nWvuU') else "Not specified",
                        'time': event.find('div', class_='EventCategory_eventSchedule__event-time__f_zfq').find('span', class_='text-sm').text.strip() if event.find('div', class_='EventCategory_eventSchedule__event-time__f_zfq') else "Time not specified",
                        'room': "Breakout Room",
                        'track': "Breakout",
                        'session_type': "Breakout",
                        'day': "Day 1"
                    }
                    schedule['day1'].append(session_data)
    
    except Exception as e:
        print(f"Error scraping schedule: {str(e)}")
        
    return schedule

def convert_to_documents(schedule_data: Dict[str, List[Dict]]) -> List[Document]:
    """
    Convert schedule data into LlamaIndex Document objects
    """
    documents = []
    
    for day, sessions in schedule_data.items():
        for session in sessions:
            # Create formatted content
            content = f"""
            Title: {session['title']}
            Speaker: {session['speaker']}
            Time: {session['time']}
            Track: {session['track']}
            Room: {session['room']}
            Session Type: {session['session_type']}
            Day: {session['day']}
            """
            
            # Create metadata for better querying
            metadata = {
                "day": session['day'],
                "track": session['track'],
                "session_type": session['session_type'],
                "speaker": session['speaker'],
                "time": session['time'],
                "room": session['room']
            }
            
            # Create Document object
            doc = Document(
                text=content,
                metadata=metadata
            )
            documents.append(doc)
    
    return documents

class DevFestScheduleTool:
    """Tool for handling DevFest schedule data"""
    
    def __init__(self):
        self.schedule_data = None
        self.documents = None
    
    def get_schedule(self) -> Dict[str, List[Dict]]:
        """Get schedule data"""
        if not self.schedule_data:
            self.schedule_data = get_devfest_schedule()
        return self.schedule_data
    
    def get_documents(self) -> List[Document]:
        """Get schedule as Document objects"""
        if not self.documents:
            schedule_data = self.get_schedule()
            self.documents = convert_to_documents(schedule_data)
        return self.documents
    
    def save_schedule(self, filename: str = 'devfest_schedule.json'):
        """Save schedule to JSON file"""
        schedule_data = self.get_schedule()
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(schedule_data, f, ensure_ascii=False, indent=2)
    
    def display_schedule(self):
        """Display schedule in notebook"""
        schedule_data = self.get_schedule()
        display(JSON(schedule_data))

# Create the Llama Index Function Tool
devfest_schedule_tool = FunctionTool.from_defaults(
    fn=get_devfest_schedule,
    name="get_devfest_schedule",
    description="Get the complete DevFest Lagos schedule as JSON data and convert to Document objects for indexing"
)

# Example usage in Jupyter notebook
if __name__ == "__main__":
    # Create tool instance
    tool = DevFestScheduleTool()
    
    # Get and display schedule
    print("Fetching DevFest schedule...")
    tool.display_schedule()
    
    # Get documents
    print("\nConverting to Documents...")
    documents = tool.get_documents()
    print(f"Created {len(documents)} Document objects")
    
    # Save schedule
    tool.save_schedule()
    print("\nSchedule saved to devfest_schedule.json")
    
    # Example of document content
    if documents:
        print("\nExample Document content:")
        print(documents[0].text)
        print("\nExample Document metadata:")
        print(documents[0].metadata)

Fetching DevFest schedule...


<IPython.core.display.JSON object>


Converting to Documents...
Created 18 Document objects

Schedule saved to devfest_schedule.json

Example Document content:

            Title: Registration
            Speaker: N/A
            Time: 8:00 AM - 9:00 AM
            Track: General
            Room: outside
            Session Type: General
            Day: Day 1
            

Example Document metadata:
{'day': 'Day 1', 'track': 'General', 'session_type': 'General', 'speaker': 'N/A', 'time': '8:00 AM - 9:00 AM', 'room': 'outside'}


In [15]:
# Basic usage
tool = DevFestScheduleTool()

# Get JSON data
schedule_data = tool.get_schedule()
tool.display_schedule()

# Get Document objects
documents = tool.get_documents()

# Save to file
tool.save_schedule()

# Access individual documents
print(f"First session: {documents[0].text}")

<IPython.core.display.JSON object>

First session: 
            Title: Registration
            Speaker: N/A
            Time: 8:00 AM - 9:00 AM
            Track: General
            Room: outside
            Session Type: General
            Day: Day 1
            


In [16]:
# Get documents
tool = DevFestScheduleTool()
documents = tool.get_documents()

# Create index
index = VectorStoreIndex.from_documents(documents)

# Create query engine
query_engine = index.as_query_engine()

# Query the schedule
response = query_engine.query("What sessions are about web development?")
print(response)

## Web Development Sessions at the Conference

Based on the provided context, there is one session specifically focused on web development:

**Building Web Apps That Work Offline and Beyond**

* **Speaker:** Confidence Okoghenun
* **Time:** 1:50 PM - 2:35 PM
* **Track:** Breakout
* **Room:** Breakout Room
* **Day:** Day 1

This session will likely cover topics such as:

* Building web applications that can function offline
* Techniques for enhancing the performance and functionality of web apps
* Strategies for creating web apps that are accessible and user-friendly

While the title of the session mentions "offline" functionality, it also suggests that it will delve into broader aspects of web development beyond just offline capabilities. 

**Additionally, the "Empowering Development at Every Scale with FlutterFlow" session may also be relevant to web development, as FlutterFlow is a platform for building cross-platform apps, including web apps.** 

This session is scheduled for:

* **

In [14]:
documents

[Document(id_='9131b074-fe68-4f25-a30a-143ce2d2ae23', embedding=None, metadata={'day': 'Day 1', 'track': 'General', 'session_type': 'General', 'speaker': 'N/A', 'time': '8:00 AM - 9:00 AM', 'room': 'outside'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='\n            Title: Registration\n            Speaker: N/A\n            Time: 8:00 AM - 9:00 AM\n            Track: General\n            Room: outside\n            Session Type: General\n            Day: Day 1\n            ', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='fcdb667a-90b2-4e99-9b30-57a8de53391b', embedding=None, metadata={'day': 'Day 1', 'track': 'General', 'session_type': 'General', 'speaker': 'N/A', 'time': '9:05 AM - 9:10 AM', 'room': 'room 1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='\n      

In [15]:
  # build index
devfest_index = VectorStoreIndex.from_documents(schedule_data)

AttributeError: 'str' object has no attribute 'get_doc_id'