In [30]:
import os

import html2text
import lancedb
import pandas as pd
from dotenv import load_dotenv
from IPython.display import Markdown, display
from llama_index.core import (
    Document,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core.node_parser import HTMLNodeParser
from llama_index.core.schema import MetadataMode
from llama_index.vector_stores.lancedb import LanceDBVectorStore
from snowflake.sqlalchemy import URL
from sqlalchemy import create_engine
from transformers import (
    MarkupLMConfig,
    MarkupLMFeatureExtractor,
    MarkupLMModel,
    MarkupLMProcessor,
    MarkupLMTokenizerFast,
)

  from IPython.core.display import display, Markdown


In [3]:
# Read the .env file one directory up so the os.environ lookups below resolve.
load_dotenv(dotenv_path='../.env')

True

In [4]:
# Snowflake schema and table that the query below reads from.
SNOWFLAKE_SCHEMA = "OB_DOCS_SITE_SCHEMA"
SNOWFLAKE_TABLE = "DIM_DOCS_SITE_GA"

In [5]:
# Assemble the Snowflake connection URL. Credentials, account, database and
# role are read from environment variables (a missing one raises KeyError);
# only the warehouse and the schema are fixed in the notebook.
snowflake_url = URL(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT_IDENTIFIER"],
    warehouse='google_analytics_wh',
    database=os.environ["SNOWFLAKE_DATABASE"],
    schema=SNOWFLAKE_SCHEMA,
    role=os.environ["SNOWFLAKE_OB_CONTENT_UNIVERSE_MF_TASK_ROLE"],
)
engine = create_engine(snowflake_url)

In [6]:
# Load the whole table into a DataFrame; schema and table names come from the
# constants defined earlier.
select_all_sql = f'SELECT * FROM {SNOWFLAKE_SCHEMA}.{SNOWFLAKE_TABLE}'
df = pd.read_sql(select_all_sql, engine)

In [7]:
# Eyeball three random rows (no random_state, so the rows differ on every run).
df.sample(3)

Unnamed: 0,full_page_url,ga_query_start_date,ga_query_end_date,content,title,scrape_date,published_date,average_session_duration,stddev_average_session_duration,max_average_session_duration,avg_engagement_rate,avg_bounce_rate,sum_sessions
45,https://outerbounds.com/docs/debug-errors-with...,2024-04-07 06:50:23.636465,yesterday,"<main class=""docMainContainer_gTbr""><div class...",Debug Metaflow Errors with Resume | Outerbounds,2024-04-09 06:51:33.364070,,2.891434,4.089104,5.782867,1.0,0.0,2
79,https://outerbounds.com/docs/intro-tutorial-S1E1/,2024-04-07 06:50:23.636465,yesterday,"<main class=""docMainContainer_gTbr""><div class...",Your First Flow | Outerbounds,2024-04-09 06:51:33.364070,,35.505272,59.023406,159.507853,1.0,0.0,8
82,https://outerbounds.com/docs/intro-tutorial-ov...,2024-04-07 06:50:23.636465,yesterday,"<main class=""docMainContainer_gTbr""><div class...",Introduction to Metaflow Tutorial | Outerbounds,2024-04-09 06:51:33.364070,,1.591639,4.427878,19.188963,0.909091,0.090909,22


In [61]:
# Discard every row whose `content` value is missing.
df = df.dropna(subset=['content'])

In [7]:
# db = lancedb.connect("./.lancedb")
# db.create_table(LANCEDB_TABLENAME, data=df)
# db[LANCEDB_TABLENAME].head()

In [26]:
# h = html2text.HTML2Text()
# h.ignore_links = False
# # def _convert_html_to_markdown(html):
# #     return h.handle(example.content.values[0])
# example = df.sample(1)
# md = h.handle(example.content.values[0])
# display(Markdown(md)) 

In [31]:
# md_files = []
# for i, row in df.iterrows():
#     md_files.append(h.handle(row.content))

In [62]:
# Sanity-check the columns that feed the Document objects built in the next cell.
# pandas represents missing values as NaN/NaT as well as None, and
# `value is not None` is True for NaN, so an identity check against None can
# pass vacuously. Use the NA-aware isna() over the whole column instead.
required_columns = ["full_page_url", "title", "content"]
missing_per_column = df[required_columns].isna().sum()
assert (missing_per_column == 0).all(), (
    "rows with missing required fields:\n"
    f"{missing_per_column[missing_per_column > 0]}"
)

In [63]:
def _to_document(page_url, page_title, page_html):
    """Wrap one page in a Document: HTML as text, URL and title as metadata."""
    return Document(
        text=page_html,
        metadata={"full_page_url": page_url, "title": page_title},
        # The keyword is spelled `seperator` on purpose: the preview printed
        # below shows the "::" separator being applied with this spelling.
        metadata_seperator="::",
        metadata_template="\n{key}=>{value}",
        text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
    )


documents = [
    _to_document(page.full_page_url, page.title, page.content)
    for page in df.itertuples(index=False)
]

# Preview the first 200 characters of the first document as rendered for the
# LLM and for the embedding model.
for banner, mode in (
    ("The LLM sees this: \n\n\n", MetadataMode.LLM),
    ("\n\n\nThe Embedding model sees this: \n", MetadataMode.EMBED),
):
    preview = documents[0].get_content(metadata_mode=mode)[:200]
    print(banner, preview, "\n... [Truncated]")

The LLM sees this: 


 Metadata: full_page_url=>https://outerbounds.com/docs/data-science-welcome/::
title=>Metaflow Resources for Data Science | Outerbounds
-----
Content: <main class="docMainContainer_gTbr"><div class="co 
... [Truncated]



The Embedding model sees this: 
 Metadata: full_page_url=>https://outerbounds.com/docs/data-science-welcome/::
title=>Metaflow Resources for Data Science | Outerbounds
-----
Content: <main class="docMainContainer_gTbr"><div class="co 
... [Truncated]


In [65]:
# HTML-aware node parser with its default settings; used in the next cell to
# split the HTML documents into nodes.
parser = HTMLNodeParser()

In [66]:
# Split every Document into nodes; these are what gets indexed further down.
nodes = parser.get_nodes_from_documents(documents)

In [67]:
# Compare the number of source documents with the number of nodes produced.
len(documents), len(nodes)

(144, 2534)

In [69]:
# Back the index with a LanceDB store kept in the local ./.lancedb directory
# and build a vector index over the parsed nodes.
# NOTE(review): re-running this cell may add the same nodes to an existing
# table a second time — confirm how the store treats a table that already exists.
vector_store = LanceDBVectorStore(uri="./.lancedb")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)

In [42]:
# Confirm the index object was created (the repr shows only type and address).
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x298cc4110>

In [43]:
# Query engine over the index, created with no options overridden.
query_engine = index.as_query_engine()

In [44]:
# Ask a sample question against the indexed documentation pages.
response = query_engine.query("How to deploy Outerbounds?")

In [45]:
# Show the raw Response object.
# NOTE: this repr includes each source node's full embedding vector, which
# makes the output very long; `response.response` holds just the answer text.
response

Response(response='aws eks update-kubeconfig --name <cluster name> configure', source_nodes=[NodeWithScore(node=TextNode(id_='4e66ba17-2d1b-4171-acf8-716a95c0cf0e', embedding=[0.015595958568155766, 0.02286861464381218, 0.0021746333222836256, -0.01994863525032997, -0.01099767442792654, 0.016687538474798203, -0.04270809143781662, -0.02634802833199501, -0.029936598613858223, -0.015664182603359222, 0.013051210902631283, -0.0015572080155834556, -0.0075319064781069756, -0.013658402487635612, -0.01158439926803112, 0.005526126828044653, 0.01904808171093464, -0.021408624947071075, 0.0030496034305542707, -0.039433348923921585, -0.04191669449210167, 0.004366322420537472, 0.004004736430943012, -0.005607995670288801, -0.02692110650241375, 0.0005726534291170537, 0.004997392650693655, -0.03569468483328819, 0.004721086006611586, -0.016196327283978462, 0.018638739362359047, 0.011140944436192513, -0.022841325029730797, -0.015541379339993, -0.009831047616899014, -0.008568908087909222, 0.01408139057457447

### Resume with regular LanceDB

In [70]:
# Open the same ./.lancedb directory directly with the lancedb client.
# NOTE(review): lancedb is already imported in the first cell; the repeat here
# presumably lets this section run on its own after a kernel restart — confirm.
import lancedb
db = lancedb.connect("./.lancedb")

In [74]:
# Inspect the schema and first rows of the 'vectors' table.
vectors_table = db.open_table('vectors')
vectors_table.head()

pyarrow.Table
id: string
doc_id: string
vector: fixed_size_list<item: float>[1536]
  child 0, item: float
text: string
metadata: struct<_node_content: string, _node_type: string, doc_id: string, document_id: string, full_page_url: string, ref_doc_id: string, title: string>
  child 0, _node_content: string
  child 1, _node_type: string
  child 2, doc_id: string
  child 3, document_id: string
  child 4, full_page_url: string
  child 5, ref_doc_id: string
  child 6, title: string
----
id: [["4e66ba17-2d1b-4171-acf8-716a95c0cf0e"],["0859c34b-51d8-4bf6-aad5-3e6eff5af145","84db4c0e-7591-44b7-9060-4b1661c9b70c","af222eac-85f9-4ad2-91ea-91dca46a8571","4fa868a1-0996-4371-adad-218325fb834f"]]
doc_id: [["1c60f112-41d3-42cf-8d2d-5ef49bd4aa69"],["4e72869e-8942-492b-b0a0-7f47f49b8e77","4e72869e-8942-492b-b0a0-7f47f49b8e77","4e72869e-8942-492b-b0a0-7f47f49b8e77","4e72869e-8942-492b-b0a0-7f47f49b8e77"]]
vector: [[[0.015595959,0.022868615,0.0021746333,-0.019948635,-0.010997674,...,-0.0045573493,0.00698

### MarkupLM experiment

In [21]:
# Assemble the MarkupLM processor from a default feature extractor and the
# tokenizer loaded from the "microsoft/markuplm-base" checkpoint.
feature_extractor = MarkupLMFeatureExtractor()
tokenizer = MarkupLMTokenizerFast.from_pretrained("microsoft/markuplm-base")
processor = MarkupLMProcessor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [34]:
# Take the HTML of one randomly chosen page (unseeded, so it changes per run)
# and encode it as PyTorch tensors.
html_string = df['content'].sample(1).iloc[0]
encoding = processor(html_string, return_tensors="pt")

In [35]:
# List the fields the processor produced for the sampled page.
print(encoding.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'xpath_tags_seq', 'xpath_subs_seq'])


In [36]:
# Check the container type and how many fields it holds.
type(encoding), len(encoding)

(transformers.tokenization_utils_base.BatchEncoding, 5)

In [37]:
# Peek at the tensor stored under each field.
encoding.items()

dict_items([('input_ids', tensor([[   0, 4148,   42,  ...,  110, 3041,    2]])), ('token_type_ids', tensor([[0, 0, 0,  ..., 0, 0, 0]])), ('attention_mask', tensor([[1, 1, 1,  ..., 1, 1, 1]])), ('xpath_tags_seq', tensor([[[216, 216, 216,  ..., 216, 216, 216],
         [124,  50,  50,  ..., 216, 216, 216],
         [124,  50,  50,  ..., 216, 216, 216],
         ...,
         [124,  50,  50,  ..., 216, 216, 216],
         [124,  50,  50,  ..., 216, 216, 216],
         [216, 216, 216,  ..., 216, 216, 216]]])), ('xpath_subs_seq', tensor([[[1001, 1001, 1001,  ..., 1001, 1001, 1001],
         [   0,    0,    0,  ..., 1001, 1001, 1001],
         [   0,    0,    0,  ..., 1001, 1001, 1001],
         ...,
         [   0,    0,    0,  ..., 1001, 1001, 1001],
         [   0,    0,    0,  ..., 1001, 1001, 1001],
         [1001, 1001, 1001,  ..., 1001, 1001, 1001]]]))])

In [38]:
# Build a MarkupLM model from a default configuration object.
# NOTE(review): presumably this yields randomly initialised weights rather than
# the "microsoft/markuplm-base" checkpoint the tokenizer was loaded from —
# confirm whether MarkupLMModel.from_pretrained(...) was intended.
configuration = MarkupLMConfig()
model = MarkupLMModel(configuration)

In [44]:
# encoding = processor("<html> <head> <title>Page Title</title> </head> </html>", return_tensors="pt")
# outputs = model(**encoding)

In [33]:
# Display the current encoding.
# NOTE: the saved output below has only 4 input ids, so it comes from an
# earlier run on a short snippet, not from the sampled page encoded above.
encoding

{'input_ids': tensor([[    0, 21823, 13497,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]]), 'xpath_tags_seq': tensor([[[216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216,
          216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216,
          216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216,
          216, 216, 216, 216, 216, 216, 216, 216],
         [109, 104, 200, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216,
          216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216,
          216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216,
          216, 216, 216, 216, 216, 216, 216, 216],
         [109, 104, 200, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216,
          216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216,
          216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216,
          216, 216, 21

In [53]:
# Re-encode the sampled page and report the shape of each field.
enc = processor(html_string, return_tensors="pt")
tuple(
    enc[field].shape
    for field in (
        "input_ids",
        "attention_mask",
        "token_type_ids",
        "xpath_tags_seq",
        "xpath_subs_seq",
    )
)

(torch.Size([1, 5135]),
 torch.Size([1, 5135]),
 torch.Size([1, 5135]),
 torch.Size([1, 5135, 50]),
 torch.Size([1, 5135, 50]))

In [47]:
# Encode a minimal HTML snippet to compare against the full-page encoding.
enc_demo = processor("<html> <head> <title>Page Title</title> </head> </html>", return_tensors="pt")

In [52]:
# Shape of each field for the minimal snippet.
tuple(
    enc_demo[field].shape
    for field in (
        "input_ids",
        "attention_mask",
        "token_type_ids",
        "xpath_tags_seq",
        "xpath_subs_seq",
    )
)

(torch.Size([1, 4]),
 torch.Size([1, 4]),
 torch.Size([1, 4]),
 torch.Size([1, 4, 50]),
 torch.Size([1, 4, 50]))

In [50]:
# Fields present in the snippet encoding.
enc_demo.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'xpath_tags_seq', 'xpath_subs_seq'])