# Example 37: Product Search

Index a product catalog into Elasticsearch with Spark, then query it through Trino as a basis for faceted search.

## 1. Environment Setup (Spark, ES, Trino)

In [None]:
# 1. Install Java 17 (Required for Trino)
!apt-get install openjdk-17-jdk-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"

# 2. Install PySpark & Dependencies
!pip install -q pyspark==3.5.0 elasticsearch==8.11.0

# 3. Download & Start Elasticsearch 7.17 (Background)
%%bash
if [ ! -d "elasticsearch-7.17.9" ]; then
  wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.17.9-linux-x86_64.tar.gz
  tar -xzf elasticsearch-7.17.9-linux-x86_64.tar.gz
  chown -R daemon:daemon elasticsearch-7.17.9
fi
# Start as daemon
sudo -u daemon ./elasticsearch-7.17.9/bin/elasticsearch -d -E discovery.type=single-node -E http.port=9200 -E xpack.security.enabled=false

# 4. Download & Start Trino (Presto) Server
if [ ! -d "trino-server-422" ]; then
  wget -q https://repo1.maven.org/maven2/io/trino/trino-server/422/trino-server-422.tar.gz
  tar -xzf trino-server-422.tar.gz
  # Config
  mkdir -p trino-server-422/etc/catalog
  echo 'coordinator=true' > trino-server-422/etc/node.properties
  echo 'node-scheduler.include-coordinator=true' >> trino-server-422/etc/node.properties
  echo 'http-server.http.port=8080' >> trino-server-422/etc/node.properties
  echo 'query.max-memory=5GB' >> trino-server-422/etc/node.properties
  echo 'discovery.uri=http://127.0.0.1:8080' > trino-server-422/etc/config.properties
  # JVM Config
  echo '-server' > trino-server-422/etc/jvm.config
  echo '-Xmx2G' >> trino-server-422/etc/jvm.config
  # Elasticsearch Catalog
  echo 'connector.name=elasticsearch' > trino-server-422/etc/catalog/es.properties
  echo 'elasticsearch.host=localhost' >> trino-server-422/etc/catalog/es.properties
  echo 'elasticsearch.port=9200' >> trino-server-422/etc/catalog/es.properties
  echo 'elasticsearch.default-schema-name=default' >> trino-server-422/etc/catalog/es.properties
  # TPCH Catalog
  echo 'connector.name=tpch' > trino-server-422/etc/catalog/tpch.properties
fi
# Start Trino
./trino-server-422/bin/launcher start

# 5. Install Trino CLI
if [ ! -f "trino" ]; then
  wget -q https://repo1.maven.org/maven2/io/trino/trino-cli/422/trino-cli-422-executable.jar -O trino
  chmod +x trino
fi

print("Environment Setup Complete. Waiting for services to startup...")
import time
time.sleep(30) # Wait for ES and Trino

In [None]:
from pyspark.sql import SparkSession
import random

!wget -q -nc https://repo1.maven.org/maven2/org/elasticsearch/elasticsearch-spark-30_2.12/8.11.0/elasticsearch-spark-30_2.12-8.11.0.jar
spark = SparkSession.builder.config("spark.driver.extraClassPath", "elasticsearch-spark-30_2.12-8.11.0.jar").getOrCreate()

products = []
categories = ["Electronics", "Clothing", "Home"]
for i in range(50):
    products.append({
        "sku": f"SKU-{i}",
        "name": f"Product {i}",
        "category": random.choice(categories),
        "price": float(random.randint(10, 500))
    })

df = spark.createDataFrame(products)
df.write.format("org.elasticsearch.spark.sql").option("es.nodes","localhost").option("es.resource","products/catalog").mode("overwrite").save()
print("Catalog indexed.")

In [None]:
# Query expensive electronics
!./trino --server localhost:8080 --catalog es --schema default --execute "SELECT name, price FROM products WHERE category='Electronics' AND price > 200"