In [1]:
# # Install with pip install firecrawl-py
from firecrawl import FirecrawlApp
import requests
import json
import os 
from tiktoken import encoding_for_model
from supabase import create_client, Client
from dotenv import load_dotenv  

load_dotenv()

FIRECRAWL_API = os.getenv("FIRECRAWL_API_KEY")
app = FirecrawlApp(api_key=FIRECRAWL_API)

async def count_tokens(text, model="gpt-4o"):
    encoder = encoding_for_model(model)
    tokens = encoder.encode(text)
    return len(tokens)

JINA_API_KEY = os.getenv('JINA_API_KEY')
async def get_embedding(text):
    """ JINA EMBEDDINGS """
    url = 'https://api.jina.ai/v1/embeddings'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {JINA_API_KEY}'
    }

    data = {
        "model": "jina-embeddings-v3",
        "task": "retrieval.passage",
        "dimensions": 1024,
        "late_chunking": False,
        "embedding_type": "float",
        "input": text
    }
    response = requests.post(url, headers=headers, data=json.dumps(data))
    return response.json()#['data'][0]['embedding']

async def sliding_window_chunking(text, max_window_size=900, overlap=200):
    encoder = encoding_for_model("gpt-4o")  # Use the same model as in count_tokens
    tokens = encoder.encode(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_window_size
        chunk_tokens = tokens[start:end]
        chunk = encoder.decode(chunk_tokens)
        chunks.append(chunk)
        start += max_window_size - overlap
    return chunks

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
SUPABASE_ANON_KEY = os.getenv("SUPABASE_KEY")

supabase: Client = create_client(SUPABASE_URL, SUPABASE_ANON_KEY)

# map_result = app.map_url('periperi.co.uk/', params={
# 	'includeSubdomains': True
# })

async def insert_to_db(data):
    print("inserting to db")
    supabase.table('user_web_data').insert(data).execute()


### Decide what to embed. The "chunks" will be:
- transcripts
- summaries
- lead
- call type

In [12]:
# Get conversation logs from Supabase and format them
def get_conversation_logs():
    response = supabase.table('conversation_logs').select('*').execute()
    formatted_logs = []
    
    for log in response.data:
        # Combine summary, lead and call_type for header
        header_parts = []
        if "transcript" in log:
            if log.get('summary'):
                header_parts.append(str("# Call summary: " + log['summary']))
            if log.get('lead'):
                header_parts.append(str("# Lead?: " + log['lead']))
            if log.get('call_type'):
                header_parts.append(str("# Call Type?: " + log['call_type']))
            
        header = " ".join(header_parts)
        
        # Create formatted dictionary
        formatted_log = {
            log['id']: {
                "header": header,
                "content": "# Transcript: " + log.get('transcript', '')
            }
        }
        formatted_logs.append(formatted_log)
    
    return formatted_logs

conversation_logs = get_conversation_logs()
conversation_logs


[{'7166f6a8-aadd-48ee-bf1a-20edfa5effcc': {'header': '# Call summary: • Corporate event for a 2-year milestone\n• Occasion inquired about by assistant\n• Budget range requested by assistant\n• Clarification sought on decision-making role of caller',
   'content': '# Transcript: [{"assistant_message": " Hello there, you can ask me anything about how we host events, and ensure a memorable experience. So, what is the occasion?"}, {"user_message": "Hello. So the occasion is a, it\'s a corporate event for a 2 year"}, {"assistant_message": " Great! Corporate events are our specialty. May I know your budget range for this event and are you the decision-maker or part of a team planning this?"}]'}},
 {'fbb3a1fd-26b5-4b9c-b97b-082622b82143': {'header': '',
   'content': '# Transcript: [{"assistant_message": " Hello there, you can ask me anything about how we host events, and ensure a memorable experience. So, what is the occasion?"}, {"user_message": "Hello?"}, {"user_message": "Yep. So we\'ve g

In [5]:
sb_insert = {
    "url": "",
    "header": "",
    "content": "",
    "token_count": 0,
    "jina_embedding": "",
    "user_id": "user_2mmXezcGmjZCf88gT2v2waCBsXv"
}

peri_sites = ["https://www.periperi.co.uk/our-price-fees-guide/"]

""" provide a list of urls to scrape, index and embed """
go = False
for site in peri_sites:
    # if site == "https://www.periperi.co.uk/clearscore-corporate-event-showcase":
    #     go = True
    # print(site)
    
    # if go:
    try:
        response = app.scrape_url(url=site, params={
            'formats': [ 'markdown' ],
            'waitFor': 1000
        })

        content = [item for item in response['markdown'].split('\n\n') if not item.startswith('[![]')]
        content = "\n\n".join(content)
        header = "## Title: " + response['metadata']['title'] + " ## Description: " + response['metadata']['description']
        chunks = await sliding_window_chunking(content)

        for chunk in chunks: 
            print(f"processing chunk {chunks.index(chunk)} of {len(chunks)}")
            sb_insert['url'] = site
            sb_insert['header'] = header
            sb_insert['content'] = chunk
            chunk = header + chunk
            jina_response = await get_embedding(chunk)
            sb_insert['jina_embedding'] = jina_response['data'][0]['embedding']
            sb_insert['token_count'] = jina_response['usage']['total_tokens']

            await insert_to_db(sb_insert)

    except KeyError as e:
        print(f"KeyError occurred for site {site}: {str(e)}")
        print("Proceeding without description")
        chunks = await sliding_window_chunking(content)
        header = "## Title: " + response['metadata']['title'] 

        for chunk in chunks: 
            print(f"processing chunk {chunks.index(chunk)} of {len(chunks)}")
            sb_insert['url'] = site
            sb_insert['header'] = header
            sb_insert['content'] = chunk
            chunk = header + chunk
            jina_response = await get_embedding(chunk)
            sb_insert['jina_embedding'] = jina_response['data'][0]['embedding']
            sb_insert['token_count'] = jina_response['usage']['total_tokens']

            await insert_to_db(sb_insert)
        continue

KeyError occurred for site https://www.periperi.co.uk/our-price-fees-guide/: 'description'
Proceeding without description
processing chunk 0 of 2
inserting to db


APIError: {'code': '42501', 'details': None, 'hint': None, 'message': 'new row violates row-level security policy for table "user_web_data"'}