# Construct Sample Dataset

In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from pyspark.sql import SparkSession

# Load variables from a .env file into the process environment.
load_dotenv()

# Credentials for the object store, read from the environment.
access_key = os.getenv("DIGITALOCEAN_SPACES_ACCESS_KEY")
secret_key = os.getenv("DIGITALOCEAN_SPACES_SECRET_KEY")

# Fail fast: os.getenv returns None for an unset variable, and that None would
# otherwise be passed straight into the Spark S3A configuration below.
_missing_env = [
    name
    for name, value in (
        ("DIGITALOCEAN_SPACES_ACCESS_KEY", access_key),
        ("DIGITALOCEAN_SPACES_SECRET_KEY", secret_key),
    )
    if not value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

In [2]:
# Build (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName('ai-powered-search')
    # Delta Lake SQL extension and catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # All Maven packages in a single setting: Delta plus the hadoop-aws / AWS SDK
    # jars needed for s3a:// paths. The key was previously set twice, and the
    # second value replaced the first, so only this full list was ever used.
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0,org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262")
    # S3A connector pointed at the DigitalOcean Spaces endpoint.
    .config("spark.hadoop.fs.s3a.access.key", access_key)
    .config("spark.hadoop.fs.s3a.secret.key", secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", "https://fra1.digitaloceanspaces.com")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Memory limits for the executor, the driver and collected results.
    .config("spark.executor.memory", "12g")
    .config("spark.driver.memory", "12g")
    .config("spark.driver.maxResultSize", "12g")
    .enableHiveSupport()
    .getOrCreate()
)

:: loading settings :: url = jar:file:/home/michael/Github/EnteRAG/venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/michael/.ivy2/cache
The jars for the packages stored in: /home/michael/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-52cc8cc2-38ca-4965-84bc-71f9b453bb76;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.0.0 in central
	found io.delta#delta-storage;3.0.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
downloading https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar ...
	[SUCCESSFUL ] org.apache.hadoop#hadoop-aws;3.3.4!hadoop-aws.jar (16ms)
downloading https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle

In [3]:
# Smoke test for the Spark session: build a tiny in-memory DataFrame.
columns = ["Name", "Age"]
data = [
    ("John", 25),
    ("Alice", 30),
    ("Bob", 28),
]
df = spark.createDataFrame(data, schema=columns)
# Print up to five rows of it.
df.show(n=5)

                                                                                

+-----+---+
| Name|Age|
+-----+---+
| John| 25|
|Alice| 30|
|  Bob| 28|
+-----+---+



In [16]:
# Bucket and object path of the Goodreads embeddings Parquet file.
bucket_name = "michael-data-lake"
file_path = f"s3a://{bucket_name}/goodreads/20240519_embeddings.parquet"

# Open the Parquet file as a Spark DataFrame.
books = spark.read.format("parquet").load(file_path)

In [12]:
# Preview the first five rows of the Spark DataFrame.
books.show(n=5)

[Stage 14:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+--------------+------------------+--------------------+-----+
|               title|             authors|         description|                link|average_rating|text_reviews_count|          embeddings|index|
+--------------------+--------------------+--------------------+--------------------+--------------+------------------+--------------------+-----+
| The Face of Another|Kobo Abe;E. Dale ...|Like an elegantly...|https://www.goodr...|          3.78|               118|[-0.3300421833992...|    0|
| The Face of Another|Kobo Abe;E. Dale ...|Like an elegantly...|https://www.goodr...|          3.78|               118|[-0.0653477609157...|    1|
|The Devil's Grave...|           Anonymous|A exhilarating ad...|https://www.goodr...|          3.94|                 7|[0.34402674436569...|    2|
|The Devil's Grave...|           Anonymous|A exhilarating ad...|https://www.goodr...|          3.94|                 7

                                                                                

In [6]:
# Total number of rows in the source dataset.
n_books = books.count()
print(n_books)

1370919


                                                                                

In [17]:
# Draw a random sample of roughly SAMPLE_SIZE books. `sample` takes a fraction,
# so the resulting row count is approximate rather than exactly SAMPLE_SIZE.
SAMPLE_SIZE = 5000
SAMPLE_SEED = 42

total_books = books.count()
if total_books == 0:
    # Previously this surfaced as a ZeroDivisionError when computing the fraction.
    raise ValueError("Cannot sample books: the source dataset is empty.")

# Cap at 1.0 so a dataset smaller than SAMPLE_SIZE is kept whole instead of
# producing an out-of-range sampling fraction.
sample_fraction = min(1.0, SAMPLE_SIZE / total_books)
books_sample = books.sample(fraction=sample_fraction, seed=SAMPLE_SEED)

# turn to pandas
# NOTE: this rebinds `books` from a Spark DataFrame to a pandas DataFrame, so
# re-run the cell that reads the Parquet file before re-running this one.
books = books_sample.toPandas()

                                                                                

In [18]:
# Show the first five rows of the sampled pandas DataFrame.
books.head(5)

Unnamed: 0,title,authors,description,link,average_rating,text_reviews_count,embeddings,index
0,The Blood Royal (Joe Sandilands #9),Barbara Cleverly,"A story of murder, mystery and espionage (with...",https://www.goodreads.com/book/show/10215672-t...,3.62,86,"[-0.15909269452095032, -0.1371370106935501, 0....",255
1,Wedding at King's Convenience (Kings of Califo...,Maureen Child,Everyone did Jefferson King's bidding. Except ...,https://www.goodreads.com/book/show/11147472-w...,3.37,3,"[0.28028225898742676, 0.00014651630772277713, ...",1405
2,Alcibiades,Plato,The Alcibiades was widely read in antiquity as...,https://www.goodreads.com/book/show/1132944.Al...,3.82,10,"[-0.2626037299633026, 0.15822897851467133, 0.2...",1643
3,Crazy Enough: A Memoir,Storm Large,"Yes,Storm Large is her real name, though she's...",https://www.goodreads.com/book/show/11459673-c...,3.77,131,"[-0.1144125685095787, -0.3899981379508972, 0.0...",1824
4,"The Second Spy (The Books of Elsewhere, #3)",Jacqueline West,"In Olive's third adventure, what lurks below t...",https://www.goodreads.com/book/show/11737314-t...,4.29,93,"[0.2180931568145752, -0.09221331775188446, -0....",2099


In [19]:
# Drop the `index` column before export. errors="ignore" keeps this cell
# re-runnable: a second run no longer raises KeyError once the column is gone.
books = books.drop(columns=["index"], errors="ignore")

# Save to CSV; index=False leaves the pandas row index out of the file.
books.to_csv("../search/books_embeddings.csv", index=False)