In [9]:
# # Install with pip install firecrawl-py
from firecrawl import FirecrawlApp
import requests
import json
import os 
from tiktoken import encoding_for_model
from supabase import create_client, Client
from dotenv import load_dotenv  
from typing import List, Dict
load_dotenv()

FIRECRAWL_API = os.getenv("FIRECRAWL_API_KEY")
app = FirecrawlApp(api_key=FIRECRAWL_API)

async def count_tokens(text, model="gpt-4o"):
    encoder = encoding_for_model(model)
    tokens = encoder.encode(text)
    return len(tokens)

JINA_API_KEY = os.getenv('JINA_API_KEY')
async def get_embedding(text):
    """ JINA EMBEDDINGS """
    url = 'https://api.jina.ai/v1/embeddings'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {JINA_API_KEY}'
    }

    data = {
        "model": "jina-embeddings-v3",
        "task": "retrieval.passage",
        "dimensions": 1024,
        "late_chunking": False,
        "embedding_type": "float",
        "input": text
    }
    response = requests.post(url, headers=headers, data=json.dumps(data))
    return response.json()#['data'][0]['embedding']

async def sliding_window_chunking(text, max_window_size=900, overlap=200):
    encoder = encoding_for_model("gpt-4o")  # Use the same model as in count_tokens
    tokens = encoder.encode(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_window_size
        chunk_tokens = tokens[start:end]
        chunk = encoder.decode(chunk_tokens)
        chunks.append(chunk)
        start += max_window_size - overlap
    return chunks

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
SUPABASE_ANON_KEY = os.getenv("SUPABASE_KEY")

supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)

# map_result = app.map_url('periperi.co.uk/', params={
# 	'includeSubdomains': True
# })



### Decide what to embed. The "chunks" will be:
- transcripts
- summaries
- lead
- call type

In [None]:
# Get conversation logs from Supabase and format them
def get_conversation_logs():
    response = supabase.table('conversation_logs').select('*').execute()
    formatted_logs = []
    
    for log in response.data:
        # Skip if summary starts with "no" (case insensitive)
        if log['summary'] and log['summary'].lower().startswith('• no'):
            continue
            
        # Combine summary, lead and call_type for header
        header_parts = []
        if log['summary'] is not None:
            if log.get('summary'):
                header_parts.append(str("# Call summary: " + log['summary']))
            if log.get('lead'):
                header_parts.append(str("# Lead?: " + log['lead']))
            if log.get('call_type'):
                header_parts.append(str("# Call Type?: " + log['call_type']))
                       
        header = " ".join(header_parts)
        
        # Create formatted dictionary
        formatted_log = {
            log['id']: {
                "header": header,
                "content": "# Transcript: " + log.get('transcript', '')
            }
        }
        formatted_logs.append(formatted_log)
    
    return formatted_logs

conversation_logs = get_conversation_logs()
conversation_logs


In [None]:
import pandas as pd

# First, let's normalize the data
normalized_data = {}
for item in conversation_logs:  # assuming your list is called conversation_logs
    for key, value in item.items():
        normalized_data[key] = value

# Create DataFrame
logs = pd.DataFrame.from_dict(normalized_data, orient='index')

logs

In [21]:
log

{'4d37d8dc-b47c-4bfb-bab7-e34068169a96': {'header': '# Call summary: - No conversation was held about oxygen\n- Caller asked about how Voxigen works, not oxygen\n- Assistant provided information about Voxigen\'s features and capabilities\n- Caller seemed to have trouble hearing or connecting, asking "Hello? Hello?"\n- Caller requested to be put through to someone else # Call Type?: web',
  'content': '# Transcript: [{"assistant_message": " I\'m here to help you discover Voxigen. Anything that\'s caught your eye on our landing page?"}, {"user_message": "Yeah, can you tell me how oxygen works?"}, {"assistant_message": null}, {"tool": {"name": "question_and_answer", "content": "Found matching products/services: [{\'id\': 1413, \'url\': \'https://voxigen.io/zh/content/terms-of-service\', \'header\': \'## Title: VoxiGen ## Description: VoxiGen\', \'content\': \'\\u2190 \\u8fd4\\u56de\\\\n\\\\n# VoxiGen\\u670d\\u52a1\\u6761\\u6b3e\\\\n\\\\n\\u60a8\\u597d :)\\\\n\\\\n\\u6b22\\u8fce\\u4f7f\\u7

In [20]:
# Get the ID where it stopped
stopped_id = list(stopped.keys())[0]

# Filter conversation_logs to start after the stopped ID
found_stopped = False
for log in conversation_logs:
    log_id = list(log.keys())[0]
    
    # Skip until we find the stopped ID
    if not found_stopped:
        if log_id == stopped_id:
            found_stopped = True
        continue
    
    # Process remaining logs
    log_data = log[log_id]
    jina_response = await get_embedding(str(log_data))

    sb_insert['id'] = log_id
    sb_insert['user_id'] = log_id
    sb_insert['jina_embedding'] = jina_response['data'][0]['embedding']
    sb_insert['token_count'] = jina_response['usage']['total_tokens']
    await insert_to_db(sb_insert)

KeyError: 'data'

In [11]:
sb_insert = {
    "id": "",  # Add ID field
    "token_count": 0,
    "jina_embedding": "",
    "user_id": "user_2mmXezcGmjZCf88gT2v2waCBsXv"
}


async def insert_to_db(data):
    print("inserting to db")
    # Use upsert instead of insert to handle existing IDs
    supabase.table('conversation_logs').upsert(data).execute()



""" provide a list of urls to scrape, index and embed """
for log in conversation_logs:
    # Get the ID from the first (and only) key in the dictionary
    log_id = list(log.keys())[0]
    log_data = log[log_id]
    
    jina_response = await get_embedding(str(log_data))

    sb_insert['id'] = log_id  # Set the ID for the row
    sb_insert['user_id'] = log_id  # Update user_id to use log_id
    sb_insert['jina_embedding'] = jina_response['data'][0]['embedding']
    sb_insert['token_count'] = jina_response['usage']['total_tokens']
    await insert_to_db(sb_insert)



inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
inserting to db
insertin

KeyError: 'data'

In [15]:
stopped

{'171d834d-6a1f-434e-99ab-ff348783e58c': {'header': '# Call summary: • A1 Performance offers individual programs for athletes, coaches, parents, and managers\n• Team programs include operational support and long-term development options\n• Programs for clubs and associations offer partial or comprehensive development\n• Content and duration of programs are customized based on client discussions\n• Programs provide comprehensive mental preparation, environment analysis, vision/culture setting\n• They include development of key staff, age-specific mental strategies for athletes, and practical development for coaches\n• Club-player-parent cooperation and scouting support through mental diagnostics are part of the programs # Call Type?: web',
  'content': '# Transcript: [{"assistant_message": " Hi, we\'re all about helping athletes achieve A1 performance. What brings you here?"}, {"user_message": "Can you tell me what programs that you offer?"}, {"assistant_message": null}, {"tool": {"name