In [6]:
import os
import matplotlib.pyplot as plt
import numpy as np
import openai

from utils import *

import KG_full as AKG
from supabase import create_client, Client
from typing import Optional
from dotenv import load_dotenv

In [7]:
# Configuration cell: read every tunable from the environment (populated from
# a local .env file, if present) and build the optional Supabase client.
load_dotenv()

# --- OpenAI settings ---------------------------------------------------------
COMPLETIONS_MODEL = os.getenv("OPENAI_API_MODEL", "gpt-4")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
my_api_key = os.getenv("OPENAI_API_KEY")
if not my_api_key:
    # Fail early with an actionable message. Previously a missing key surfaced
    # as "TypeError: str expected, not NoneType" when it was written back into
    # os.environ, which gave no hint about the real cause.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )
openai.api_key = my_api_key

# The key was read from os.environ, so writing it back would be a no-op;
# simply expose it under the name the later cells use.
OPENAI_API_KEY = my_api_key

# --- Supabase settings -------------------------------------------------------
SUPABASE_URL = os.getenv("SUPABASE_URL")
# Either variable name is accepted for the key; SUPABASE_KEY wins if both exist.
SUPABASE_KEY = os.getenv("SUPABASE_KEY") or os.getenv("SUPABASE_ANON_KEY")
SUPABASE_TABLE = os.getenv("SUPABASE_TABLE_NAME", "new_documents")
# Optional RPC/function name; when unset the loader reads the table directly.
SUPABASE_QUERY_NAME = os.getenv("SUPABASE_QUERY_NAME")
SUPABASE_TEXT_COLUMN = os.getenv("SUPABASE_TEXT_COLUMN", "content")
SUPABASE_METADATA_COLUMN = os.getenv("SUPABASE_METADATA_COLUMN", "metadata")
SUPABASE_EMBEDDING_COLUMN = os.getenv("SUPABASE_EMBEDDING_COLUMN", "embedding")

# Create the Supabase client only when both URL and key are provided; `sb`
# stays None otherwise and the loading cell falls back to local files.
sb: Optional[Client] = None
if SUPABASE_URL and SUPABASE_KEY:
    sb = create_client(SUPABASE_URL, SUPABASE_KEY)

In [8]:
# Load the text chunks for the knowledge graph.
#
# With a Supabase client (`sb`) available, rows are fetched from the RPC named
# by SUPABASE_QUERY_NAME (if set) or, failing that, straight from
# SUPABASE_TABLE. Without a client, the local "raw_data" directory is chunked.
#
# Names produced for later cells:
#   texts             - list of non-empty chunk texts
#   sources           - list (parallel to texts) of source labels, or None
#   cached_embeddings - list (parallel to texts) of stored embeddings, or None
#                       when no complete set of embeddings is available
if sb is not None:
    import json

    rows = []
    # Prefer RPC if a query/function name is provided
    if SUPABASE_QUERY_NAME:
        try:
            rpc_result = sb.rpc(SUPABASE_QUERY_NAME).execute()
            rows = rpc_result.data or []
        except Exception as e:
            # Deliberate best-effort: any RPC failure falls through to the
            # plain table select below.
            print(f"RPC '{SUPABASE_QUERY_NAME}' failed, falling back to table select. Error: {e}")
    if not rows:
        # Select content, metadata, embedding (if configured), and id to derive sources
        select_cols = f"id,{SUPABASE_TEXT_COLUMN},{SUPABASE_METADATA_COLUMN}"
        if SUPABASE_EMBEDDING_COLUMN:
            select_cols += f",{SUPABASE_EMBEDDING_COLUMN}"
        # Guard against a missing payload the same way the RPC path does, so
        # the loop below never iterates over None.
        rows = sb.table(SUPABASE_TABLE).select(select_cols).execute().data or []

    # Extract texts, sources, and optionally cached embeddings
    texts = []
    sources = []
    cached_embeddings = []

    for r in rows:
        # Handle both RPC row shapes and table rows
        text = r.get(SUPABASE_TEXT_COLUMN, r.get("content", ""))
        if text is None or text == "":
            continue  # rows without usable text contribute nothing

        meta = r.get(SUPABASE_METADATA_COLUMN, r.get("metadata")) or {}
        # If metadata is a JSON string, try to parse it
        if isinstance(meta, str):
            try:
                meta = json.loads(meta)
            except (TypeError, ValueError):
                meta = {}
        # Anything that is not a mapping (e.g. a JSON list) carries no source.
        if not isinstance(meta, dict):
            meta = {}

        # Try a few reasonable keys in metadata for source; fall back to id
        src = (
            meta.get("source")
            or meta.get("file_name")
            or meta.get("filename")
            or r.get("id")
        )

        texts.append(text)
        sources.append(str(src) if src is not None else None)
        # One slot per text keeps cached_embeddings aligned with texts.
        # NOTE(review): the stored embedding is passed through untouched; if
        # the backend returns vectors as strings they need parsing before use.
        cached_embeddings.append(
            r.get(SUPABASE_EMBEDDING_COLUMN) if SUPABASE_EMBEDDING_COLUMN else None
        )

    n_cached = sum(e is not None for e in cached_embeddings)
    has_embeddings = n_cached > 0

    # The cache is only usable when *every* chunk has an embedding. The lists
    # grow in lockstep, so comparing their lengths (as before) could never
    # detect a partially populated cache.
    if texts and n_cached == len(texts):
        print(f"Found {n_cached} cached embeddings in Supabase")
    elif n_cached:
        cached_embeddings = None
        print(
            f"Only {n_cached} of {len(texts)} chunks have cached embeddings; "
            "ignoring the partial cache, will compute them fresh"
        )
    else:
        cached_embeddings = None
        print("No cached embeddings found, will compute them fresh")
else:
    # No Supabase client: chunk the local files instead.
    directory = "raw_data"
    texts, sources = load_and_process_files(directory,
                                            chunk_size=200,
                                            separator=None)
    cached_embeddings = None

In [9]:
# Build the autoKG object from the loaded chunks.
# MAIN_TOPIC can be overridden through the environment.
main_topic = os.getenv("MAIN_TOPIC", "Knowledge graph, Language Model")

# NOTE(review): embedding=True presumably makes autoKG compute embeddings for
# `texts` during construction — confirm against KG_full.
KG_class = AKG.autoKG(texts=texts,
                      source=sources,
                      embedding_model=EMBEDDING_MODEL,
                      llm_model=COMPLETIONS_MODEL,
                      openai_api_key=OPENAI_API_KEY,
                      main_topic=main_topic,
                      embedding=True)

# Cached embeddings are currently *not* handed to autoKG: nothing in this cell
# assigns them anywhere. The previous message claimed they were being used,
# which was untrue, so report the real situation instead.
# TODO: if KG_full exposes a way to inject precomputed embeddings, construct
# with embedding=False and assign `cached_embeddings` here.
if cached_embeddings is not None:
    print(
        "Cached embeddings from Supabase are available but are not applied to "
        "autoKG yet; they are ignored."
    )

In [10]:
# Step 1 - de-duplicate the text chunks.
# The call returns three values; only the size of the first is reported.
to_keep, to_delete, remains = KG_class.remove_same_text(
    use_nn=True,
    n_neighbors=25,
    thresh=1e-6,
    update=True,
)
print(len(to_keep))

In [11]:
# Step 2 - extract keywords with two clustering methods.
# Both passes share every setting except the clustering method, the
# add_keywords flag and (first pass only) an explicit verbose=False.
_shared_cluster_kwargs = dict(
    max_texts=15,
    select_mtd='similarity',
    prompt_language='English',
    num_topics=10,
    max_length=3,
    post_process=True,
)

# Pass 1: 'NgJordanWeiss' clustering, called with add_keywords=False.
core_list_1, all_tokens = KG_class.cluster(
    15,
    clustering_method='NgJordanWeiss',
    add_keywords=False,
    verbose=False,
    **_shared_cluster_kwargs,
)
print("Number of keywords selected:", len(core_list_1))
print("Token used:", all_tokens)

# Pass 2: 'k_means' clustering, called with add_keywords=True.
# `all_tokens` is rebound here, so it reflects this pass from now on.
core_list_2, all_tokens = KG_class.cluster(
    15,
    clustering_method='k_means',
    add_keywords=True,
    **_shared_cluster_kwargs,
)
print("Number of keywords selected:", len(core_list_2))
print("Token used:", all_tokens)

print("Number of keywords:", len(KG_class.keywords))

In [12]:
# Run autoKG's sub-entry filter; its return value is discarded.
# NOTE(review): presumably this prunes keywords that are sub-entries of other
# keywords (going by the method name) — confirm in KG_full.
_ = KG_class.sub_entry_filter()
# Report the size of the keyword list after the filter has run.
print("Number of keywords:", len(KG_class.keywords))

In [13]:
# Run autoKG's final keyword filter. The call returns a pair: the first item
# is discarded and the second is reported below as the token usage.
# NOTE(review): a token count suggests this step queries the LLM — confirm in KG_full.
_, all_tokens = KG_class.final_keywords_filter()
print("Token used:", all_tokens)
# Report the size of the keyword list after the filter has run.
print("Number of keywords:", len(KG_class.keywords))


In [14]:
# Build the graph (argument 30), then run the per-keyword core-text
# segmentation. return_mat=True is passed and three values are unpacked;
# `A` is reused by the histogram cell further down.
KG_class.make_graph(30)
pred_mat, U_mat, A = KG_class.coretexts_seg_individual(
    k=30,
    trust_num=5,
    negative_multiplier=7,
    seg_mtd='laplace',
    return_mat=True,
    connect_threshold=0.2,
)

In [15]:
# Call get_dist_mat() for its effect on KG_class (the return value is unused).
# NOTE(review): presumably this computes and stores a distance matrix — confirm in KG_full.
KG_class.get_dist_mat()
# Print whatever check_completion() reports about the KG object.
print(KG_class.check_completion())

In [16]:
# For every column of A, count the strictly positive entries, then plot the
# distribution of those counts.
# NOTE(review): A comes from coretexts_seg_individual and is densified here;
# for very large graphs this may be memory-hungry.
deg_mat = np.sum(np.array(A.todense()) > 0, axis=0)

# Explicit figure/axes with a title and labels so the plot stands on its own
# when the notebook is skimmed.
fig, ax = plt.subplots()
ax.hist(deg_mat)
ax.set(
    title="Distribution of positive entries per column of A",
    xlabel="Positive entries per column",
    ylabel="Number of columns",
)
plt.show()

In [18]:
# Persist the knowledge graph (texts included) to KG_data/ref_KG.npy.
# The target directory is created first: nothing earlier in the notebook
# creates it, and creating it is harmless if save_data already does so.
os.makedirs('KG_data', exist_ok=True)
KG_class.save_data(os.path.join('KG_data', 'ref_KG.npy'), include_texts=True)
