# Dataset builder
> Create random vectors and store them on GCS

## Spark cluster info

In [None]:
# Count the worker VMs: one entry per executor reported by the status
# tracker, minus one (presumably the driver's own entry -- confirm).
sc = spark._jsc.sc()
executor_hosts = [info.host() for info in sc.statusTracker().getExecutorInfos()]
n_workers = len(executor_hosts) - 1
n_workers

In [None]:
# Check the Spark version: evaluating the session object as the last
# expression of the cell displays its summary in the notebook output.
spark

## Input data

In [None]:
# Replace the following variables with your values:
# - EMB_N:    number of mock embeddings (rows) to generate.
# - EMB_DIM:  number of components in each generated vector.
# - EMB_PATH: destination the resulting DataFrame is written to as Parquet.
EMB_N = 1_000_000
EMB_DIM = 768
EMB_PATH = "gs://test-project-bucket/testzone/20231015-rnd/"

---

## Generate mock data

In [None]:
# Sequential integer ids 0 .. EMB_N - 1, one per mock embedding, built as a
# plain Python list before being handed to Spark.
data = list(range(EMB_N))

In [None]:
from pyspark.sql import types as t
from pyspark.sql.functions import lit, udf

# Build a DataFrame with an integer "emb_id" column from the id list and
# attach a null "embedding" placeholder column to be filled in afterwards.
df = (
    spark.createDataFrame(data, schema=t.IntegerType())
    .toDF("emb_id")
    .withColumn("embedding", lit(None))
)

In [None]:
# Adjust this value based on your resources.
num_partitions = 6
df = df.repartition(numPartitions=num_partitions)

In [None]:
import random
from math import sqrt

def generate_vector(_empty_value, dim=None):
    """Return a random unit-length vector as a list of floats.

    Each component is drawn with ``random.random()`` (uniform in [0, 1)),
    so every component is non-negative; the vector is then scaled so that
    its Euclidean norm is 1.

    Args:
        _empty_value: Ignored. Present because the function is applied as
            a UDF to the placeholder ``embedding`` column.
        dim: Number of components. When omitted, the notebook-level
            ``EMB_DIM`` is used, which is the original behaviour.

    Returns:
        A list of ``dim`` floats; an empty list when ``dim`` is 0.
    """
    if dim is None:
        # Resolved at call time so that changing EMB_DIM and re-running the
        # generation cell picks up the new value.
        dim = EMB_DIM
    vector = [random.random() for _ in range(dim)]
    norm = sqrt(sum(x**2 for x in vector))
    return [x / norm for x in vector]

# generate_vector returns fresh random numbers on every call. Spark treats
# UDFs as deterministic by default and may then drop or repeat invocations
# during optimization, so the UDF is explicitly marked non-deterministic.
generate_vector_udf = udf(
    generate_vector, t.ArrayType(t.DoubleType())
).asNondeterministic()
# The null placeholder column is passed only to give the UDF an argument;
# its value is ignored and the column is replaced by the generated vectors.
df = df.withColumn("embedding", generate_vector_udf(df["embedding"]))

In [None]:
# Write the generated embeddings to EMB_PATH as Parquet, replacing any
# existing output at that location.
df.write.parquet(EMB_PATH, mode="overwrite")

---

## ~ end