# 00 Setup

- Adds the parent folder of `src/` to `sys.path`. In calling code, import from `src/` with:
    - `from src import <module>`
    - `from src.mymodule import myfunc`
- Starts a Spark session in the local environment only (not needed in Databricks)
- Sets the random seed (Python `random` and NumPy) for all operations in the driver
      

In [0]:
import os
import sys
from pathlib import Path
import numpy as np
import random

In [0]:
# Seed Python's built-in `random` module and NumPy's global RNG with the same
# fixed value (0) so that random draws made through these two generators
# repeat from run to run. Only these two generators are seeded here.
random.seed(0)
np.random.seed(0)
print("üé≤ Set random.seed(0) and np.random.seed(0)")

In [0]:
# Make the directory one level above the current working directory importable,
# so that `src` can be imported as a package from there.
parent_path = str(Path.cwd().parent)
if parent_path not in sys.path:
    # Put it at the front so it is searched before the existing entries.
    sys.path.insert(0, parent_path)

print(f"‚úÖ Added {parent_path} to path")
print("Ready to import from src/")

In [0]:
def create_spark_session():
    """Build (or reuse) a local Delta-enabled SparkSession with Hive support.

    The SQL warehouse directory is ``src.constants.LOCAL_DATA_LAKE_PATH`` and
    the embedded Derby metastore database lives under
    ``src.constants.LOCAL_DATA_METASTORE_PATH``. Both directories are created
    (including parents) if they do not exist yet.

    The pyspark, delta and ``src`` imports are function-local, so defining
    this function does not require those packages to be importable; calling
    it does.

    Returns:
        The session returned by ``getOrCreate()``, after its SparkContext log
        level has been set to ``"ERROR"``.
    """
    from pyspark.sql import SparkSession
    from delta import configure_spark_with_delta_pip
    from src.constants import LOCAL_DATA_LAKE_PATH, LOCAL_DATA_METASTORE_PATH

    # Mirror your cloud naming locally
    data_lake_dir = Path(LOCAL_DATA_LAKE_PATH)
    metastore_dir = Path(LOCAL_DATA_METASTORE_PATH)

    # Ensure directories exist
    for directory in (data_lake_dir, metastore_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # The configs for spark.hadoop.hive and spark.hadoop.datanucleus
    # are added to satisfy and silence benign but annoying warnings
    builder = (
        SparkSession.builder
        .appName("spelling-bee-solver-training-LOCAL")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config("spark.sql.warehouse.dir", str(data_lake_dir))
        .config("spark.hadoop.javax.jdo.option.ConnectionURL", f"jdbc:derby:{metastore_dir}/metastore_db;create=true")
        .config("spark.hadoop.hive.stats.jdbc.timeout", "30")
        .config("spark.hadoop.hive.stats.retries.wait", "3000")
        .config("spark.hadoop.hive.metastore.schema.verification", "false")
        .config("spark.hadoop.hive.metastore.schema.verification.record.version", "true")
        .config("spark.hadoop.datanucleus.autoCreateSchema", "true")
        .config("spark.hadoop.datanucleus.schema.autoCreateTables", "true")
        .config("spark.databricks.delta.optimizeWrite.enabled", "true")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
    )

    print("Initializing Spark (this will be verbose for several seconds)...")
    spark = configure_spark_with_delta_pip(builder).getOrCreate()
    print("Spark initialized! Future operations will be much quieter.")

    # Applied after startup, so it only quiets logging from this point on.
    spark.sparkContext.setLogLevel("ERROR")

    return spark

In [0]:
# Start spark session for local environment only.
# When is_databricks_env() is true this cell assigns nothing to `spark`.
from src.envutils import is_databricks_env

if not is_databricks_env():
    spark = create_spark_session()
    # Plain string literal: the message has no placeholders to interpolate.
    print("‚úÖ Got or created spark session for local environment")

In [0]:
# Config for this notebook, possibly local only.
# Outside Databricks, set spark.sql.parquet.columnarReaderBatchSize to "1024"
# on the session's runtime config. `is_databricks_env` and `spark` are the
# names bound by the previous cell.
# NOTE(review): presumably lowered to reduce memory used per Parquet read
# batch on a local machine — confirm the motivation.
if not is_databricks_env():
    print("üéõÔ∏è updating spark config for this notebook, reducing columnarReaderBatchSize to 1024...")
    spark.conf.set("spark.sql.parquet.columnarReaderBatchSize", "1024")