In [None]:
%pip install faiss-cpu sentence-transformers pandas
# After installing, try to restart the Python process via `dbutils` (presumably so the
# freshly installed packages become importable — confirm for your runtime).
# Where `dbutils` is not defined the name lookup raises NameError, and the restart is
# skipped with a notice instead of failing the cell.
try:
    dbutils.library.restartPython()
except NameError:
    print("dbutils not defined. Skipping restart.")

In [None]:
# Reuse the `spark` session when the environment already provides one. The bare name
# lookup raises NameError when it does not; in that case a session is obtained through
# SparkSession.builder.getOrCreate() and bound to `spark`.
try:
    spark
except NameError:
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()

# Keep the environment's `display` when one exists; otherwise define a plain-text stand-in.
try:
    display
except NameError:
    def display(x):
        """Plain-text stand-in for a notebook's rich ``display``.

        Args:
            x: Object to show. If it exposes a callable ``show()`` (as a Spark
                DataFrame does), that method is used so the rows are printed;
                any other object is passed to ``print``.
        """
        # print() on a Spark DataFrame only yields its schema repr, not its data,
        # so prefer the object's own show() when it has one.
        show = getattr(x, "show", None)
        if callable(show):
            show()
        else:
            print(x)

from pyspark.sql.functions import udf, col

# Register a small, hard-coded policy table as a temporary view so it can be read by name.
spark.sql("""
CREATE OR REPLACE TEMP VIEW corporate_policy AS
SELECT * FROM VALUES
    (1, 'Remote Work', 'Employees are allowed to work remotely up to 3 days a week with manager approval.'),
    (2, 'Expense Policy', 'Travel expenses under $50 do not require receipts. All international travel requires VP approval.'),
    (3, 'PTO Policy', 'Employees accrue 1.5 days of Paid Time Off (PTO) per month. Unused PTO rolls over up to 10 days.'),
    (4, 'IT Security', 'Passwords must be changed every 90 days. Multi-factor authentication (MFA) is required for all VPN access.')
AS data(id, topic, content)
""")

# Load every column of the view as a DataFrame and show it.
df_knowledge = spark.table("corporate_policy")
display(df_knowledge)

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Collect the Spark DataFrame into a pandas DataFrame for local processing.
pdf_knowledge = df_knowledge.toPandas()

# Encode all documents in a single batched call rather than one encode() call per row.
# encode() on a list returns one vector per input; list() splits the result so each
# row of the 'embedding' column holds its own 1-D vector.
pdf_knowledge['embedding'] = list(
    embedding_model.encode(pdf_knowledge['content'].tolist())
)
print("Embeddings generated successfully.")

In [None]:
import faiss
import numpy as np

# Assemble the per-row embedding vectors into one matrix (one row per document).
embeddings_matrix = np.vstack(pdf_knowledge['embedding'].tolist())

# Create a flat L2 (Euclidean distance) FAISS index sized to the embedding width,
# then load every document vector into it.
dimension = embeddings_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_matrix)
print(f"Index built with {index.ntotal} vectors.")

In [None]:
def retrieve_context(user_query, k=1):
    """Return the stored policy text most relevant to ``user_query``.

    The query is embedded with ``embedding_model``, the ``k`` nearest vectors
    are looked up in ``index``, and the matching ``content`` values are read
    from ``pdf_knowledge`` by row position.

    Args:
        user_query: Natural-language question to search for.
        k: Number of nearest documents to retrieve; must be at least 1.

    Returns:
        The ``content`` of the best match when exactly one document is found
        (always the case for the default ``k=1`` on a non-empty index). When
        several documents are found, their contents joined by a blank line,
        best match first. An empty string when the index yields no match.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    query_vector = embedding_model.encode([user_query])
    _distances, indices = index.search(query_vector, k)

    # FAISS pads the result with -1 when fewer than k vectors are available.
    # Drop the padding: iloc[-1] would otherwise silently return the last row.
    hit_positions = [int(i) for i in indices[0] if i >= 0]
    if not hit_positions:
        return ""

    contents = pdf_knowledge.iloc[hit_positions]['content'].tolist()
    if len(contents) == 1:
        # Single hit: hand back the stored value itself, exactly as before.
        return contents[0]
    return "\n\n".join(contents)

# Smoke test: print the context retrieved for a sample question.
print(retrieve_context(user_query="Can I work from home?"))