In [0]:
%pip install PyPDF2 langchain transformers sentencepiece

from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import LlamaTokenizer
from pyspark.sql.functions import col, explode
import pandas as pd

# Function to split text into chunks
def split_text_into_chunks(text: str, chunk_size: int, chunk_overlap: int) -> list:
    """Split ``text`` with LangChain's ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The list produced by ``RecursiveCharacterTextSplitter.split_text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Load the Llama tokenizer identified by the Hugging Face repo id below.
# NOTE(review): `from_pretrained` with a repo id presumably downloads the
# tokenizer files on first use — confirm the cluster has network access.
tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

In [0]:
%pip install PyPDF2 langchain transformers sentencepiece

from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import LlamaTokenizer
from pyspark.sql.functions import col, explode
import pandas as pd

# Function to split text into chunks
def split_text_into_chunks(text: str, chunk_size: int, chunk_overlap: int) -> list:
    """Split ``text`` with LangChain's ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The list produced by ``RecursiveCharacterTextSplitter.split_text``.
    """
    configured_splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    pieces = configured_splitter.split_text(text)
    return pieces

# Load the Llama tokenizer identified by the Hugging Face repo id below.
# NOTE(review): this cell is a verbatim repeat of the first cell of the
# notebook, so the tokenizer is loaded a second time here.
tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

In [0]:
# Load the Airbnb listings whose `location` contains both "united states" and
# "california"; the match is a case-insensitive substring test (lower + LIKE).
airbnb_raw = spark.sql("""
                       select * from sample_data.bright_initiative.airbnb_properties_information
                       where lower(location) like '%united states%'
                        and lower(location) like '%california%'
                       """)

# Render the filtered listings in the cell output.
display(airbnb_raw)

In [0]:
# NOTE(review): the previous cell already ends with this same display() call.
display(airbnb_raw)

In [0]:
%python
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

# Define a UDF to tokenize and combine the description
def tokenize_and_combine(description):
    """Tokenize ``description`` and join the tokens with single spaces.

    Uses the notebook-level ``tokenizer`` object.

    Args:
        description: Text to tokenize, or ``None``.

    Returns:
        The space-joined tokens, or ``""`` when ``description`` is ``None``.
    """
    if description is not None:
        return " ".join(tokenizer.tokenize(description))
    return ""

# Wrap the tokenizer helper as a Spark UDF that yields a string column
combine_udf = udf(f=tokenize_and_combine, returnType=StringType())

# Add "combined_description", computed from the "description" column
airbnb_combined = airbnb_raw.withColumn("combined_description", combine_udf(col("description")))

display(airbnb_combined)

In [0]:
%python
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType

# Define a function to split text into chunks with overlap
def split_text_into_chunks(text, chunk_size=500, overlap=50):
    """Slice ``text`` into fixed-width, overlapping character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so each
    window begins with the last ``overlap`` characters of the one before it.
    Windowing stops as soon as a window reaches the end of ``text``; this avoids
    emitting a trailing chunk that only repeats text already contained in the
    previous chunk.

    Args:
        text: String to split. ``None`` and ``""`` both give an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks as a list of strings, in their original order.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a zero step fails inside ``range`` and a negative step
            silently yields no chunks at all.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    if not text:
        return []

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        chunks.append(text[start:start + chunk_size])
        # This window already covers the tail; a further window would lie
        # entirely inside the overlap region.
        if start + chunk_size >= text_length:
            break
    return chunks

# Spark UDF returning an array of strings; the helper receives 500 and 50 as
# its second and third positional arguments
split_chunks_udf = udf(
    f=lambda text: split_text_into_chunks(text, 500, 50),
    returnType=ArrayType(StringType()),
)

# Add "chunked_description", computed from the "combined_description" column
airbnb_chunked = airbnb_combined.withColumn("chunked_description", split_chunks_udf(col("combined_description")))

display(airbnb_chunked)

In [0]:
%python
from pyspark.sql.functions import explode

# Produce one row per element of "chunked_description", stored in "exploded_chunk"
airbnb_exploded = airbnb_chunked.withColumn("exploded_chunk", explode(col("chunked_description")))

display(airbnb_exploded)

In [0]:
# Persist the exploded chunks as a Delta table, overwriting any existing table contents
(
    airbnb_exploded.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("access_assist.curated.airbnb_exploded")
)

In [0]:
# Keep at most 2000 rows of the exploded chunks.
# NOTE(review): limit() without an explicit ordering does not guarantee which
# rows are kept — add an orderBy if a stable sample is needed.
airbnb_exploded_limited = airbnb_exploded.limit(2000)

# Write the limited DataFrame to the access_assist.curated table.
# The full `airbnb_exploded` frame used to be written here, which left
# `airbnb_exploded_limited` unused and made this table a copy of the full one.
airbnb_exploded_limited.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("access_assist.curated.airbnb_exploded2")

In [0]:
%sql
-- Turn on the Delta change data feed for the exploded-chunks table.
-- NOTE(review): presumably required so a Delta Sync vector search index can follow table changes — confirm.
ALTER TABLE `access_assist`.`curated`.`airbnb_exploded` SET TBLPROPERTIES (delta.enableChangeDataFeed = true)

In [0]:
%python
%pip install databricks-vectorsearch
%pip install -U langchain-community
# NOTE(review): both installs are unpinned, and the restart below is followed by
# more code in this same cell. Databricks guidance is to keep installs plus the
# restart in their own cell near the top — confirm the rest of this cell still
# runs after the interpreter restarts.
dbutils.library.restartPython()

from databricks.vector_search.client import VectorSearchClient
from langchain.vectorstores import DatabricksVectorSearch
from langchain.embeddings import DatabricksEmbeddings

# Vector Search endpoint and fully qualified index name read by get_retriever()
vs_endpoint_name = "community_bricks_endpoint"
# NOTE(review): "idex" looks like a typo for "index" — only change it if the deployed index is renamed as well
vs_index_full_name = "access_assist.curated.airbnb_exploded_vc_idex"

# Embedding client pointed at the "databricks-bge-large-en" endpoint
embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")

def get_retriever(persist_dir: str = None):
    """Build a LangChain retriever on top of the Databricks Vector Search index.

    Looks up the index named by the notebook-level ``vs_index_full_name`` on the
    endpoint ``vs_endpoint_name`` and wraps it in ``DatabricksVectorSearch``
    together with the notebook-level ``embedding_model``.

    Args:
        persist_dir: Accepted but not used by this function.

    Returns:
        The object returned by ``as_retriever`` with ``search_kwargs={"k": 5}``.
    """
    index = VectorSearchClient().get_index(vs_endpoint_name, vs_index_full_name)
    # NOTE(review): "__db_exploded_chunk_vector" reads like a generated vector
    # column rather than the text column ("exploded_chunk") — confirm against
    # the index definition.
    store = DatabricksVectorSearch(
        index,
        embedding=embedding_model,
        text_column="__db_exploded_chunk_vector",
    )
    return store.as_retriever(search_kwargs={"k": 5})

# NOTE(review): despite its name, `vectorstore` holds the retriever returned by get_retriever()
vectorstore = get_retriever()
# Smoke-test retrieval with an accessibility-focused query
similar_documents = vectorstore.invoke("""
                                       I am looking for an accessible hotel for disabled people with wheelchair access, an elevator, and rooms equipped with grab bars in the bathroom. It should also be close to public transportation and have accessible parking.
                                       """)
print(f"Relevant documents: {similar_documents}")

In [0]:
%python
# Ensure the Spark session is initialized at the start of the notebook
spark = SparkSession.builder.getOrCreate()

# Install necessary packages
%pip install numpy==1.21.6
%pip install databricks-vectorsearch
%pip install -U langchain-community
dbutils.library.restartPython()

from databricks.vector_search.client import VectorSearchClient
from langchain.vectorstores import DatabricksVectorSearch
from langchain.embeddings import DatabricksEmbeddings

# Vector Search endpoint and fully qualified index name read by get_retriever()
vs_endpoint_name = "community_bricks_endpoint"
# NOTE(review): "idex" looks like a typo for "index" — only change it if the deployed index is renamed as well
vs_index_full_name = "access_assist.curated.airbnb_exploded_vc_idex"

# Embedding client pointed at the "databricks-bge-large-en" endpoint
embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")

def get_retriever(persist_dir: str = None):
    """Build a LangChain retriever on top of the Databricks Vector Search index.

    Looks up the index named by the notebook-level ``vs_index_full_name`` on the
    endpoint ``vs_endpoint_name`` and wraps it in ``DatabricksVectorSearch``
    together with the notebook-level ``embedding_model``.

    Args:
        persist_dir: Accepted but not used by this function.

    Returns:
        The object returned by ``as_retriever`` with ``search_kwargs={"k": 5}``.
    """
    search_client = VectorSearchClient()
    index_handle = search_client.get_index(vs_endpoint_name, vs_index_full_name)
    # NOTE(review): "__db_exploded_chunk_vector" reads like a generated vector
    # column rather than the text column ("exploded_chunk") — confirm against
    # the index definition.
    return DatabricksVectorSearch(
        index_handle,
        embedding=embedding_model,
        text_column="__db_exploded_chunk_vector",
    ).as_retriever(search_kwargs={"k": 5})

# NOTE(review): despite its name, `vectorstore` holds the retriever returned by get_retriever()
vectorstore = get_retriever()
# Smoke-test retrieval with an accessibility-focused query
similar_documents = vectorstore.invoke("""
                                       I am looking for an accessible hotel for disabled people with wheelchair access, an elevator, and rooms equipped with grab bars in the bathroom. It should also be close to public transportation and have accessible parking.
                                       """)
print(f"Relevant documents: {similar_documents}")

In [0]:
!pip install numpy==1.21.0from langchain.chat_models import ChatDatabricks


chat_model = ChatDatabricks(endpoint="databricks-llama-4-maverick", max_tokens=300)
print(f"Test chat model: {chat_model.invoke('What is Generative AI?')}")

In [0]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatDatabricks

# Prompt text for the QA chain; {context} and {question} are the two
# placeholders declared as input variables on the PromptTemplate below.
TEMPLATE = """You are an assistant trying to find accessible places to stay .Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know

<context>
{context}
</context>

Question: {question}

Answer:
"""

# Wrap the raw template so LangChain knows which variables to substitute.
prompt = PromptTemplate(
    template=TEMPLATE,
    input_variables=["context", "question"]
)

# Retrieval-augmented QA chain: uses the notebook-level `chat_model`, a fresh
# retriever from get_retriever(), the custom prompt above, and the "stuff"
# chain type. Source documents are requested alongside the answer.
chain = RetrievalQA.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever=get_retriever(),
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True
)

In [0]:
# Run the RetrievalQA chain once; the input dict carries the question under "query".
# NOTE(review): this sample question is about Generative AI rather than
# accessible places to stay — confirm it is the intended smoke test.
question = {"query": "How does Generative AI impact humans"}
answer = chain.invoke(question)
# The chain was built with return_source_documents=True, so the printed result
# presumably includes the retrieved documents as well as the answer text.
print(answer)