In [None]:
import pyspark
from delta import *
from delta.tables import DeltaTable
from pyspark.sql.functions import acos, asin, cos, lit, radians, sin, sqrt
from pyspark.sql.types import *

def main():
    """Compute each business's distance to a fixed point and upsert it into Delta.

    Reads the CSV at ``CSV_PATH`` using an explicit four-column schema, adds a
    ``Distance`` column holding the great-circle (haversine) distance between
    each row's (``Latitude``, ``Longitude``) and the reference point
    (``LATITUDE``, ``LONGITUDE``), scaled by an Earth radius of 6371 (km), and
    persists the result at ``TABLE_PATH``:

    * if no Delta table exists at that path, the dataframe is saved as a new
      Delta table and the function returns;
    * otherwise the dataframe is merged into the existing table on
      ``Business_id`` (matched rows are updated, unmatched rows are inserted).

    Returns:
        None.
    """
    # Constants
    LATITUDE = 40.7128
    LONGITUDE = -74.0060
    EARTH_RADIUS_KM = 6371
    TABLE_PATH = "hdfs:///project/data/business_data/delta_table"
    CSV_PATH = "hdfs:///project/data/business_data/yelp_academic_dataset_business_16x.csv"
    COLUMNS = ("Business_id", "Latitude", "Longitude", "Stars", "Distance")

    # Initiate spark with the Delta Lake SQL extension and catalog enabled.
    builder = (
        pyspark.sql.SparkSession.builder.appName("spark_distance")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    )
    spark = configure_spark_with_delta_pip(builder).getOrCreate()

    # Create dataframe from csv file.
    # NOTE(review): no `header` option is passed, so the first line of the file
    # is read as data -- confirm the CSV really has no header row.
    schema = "Business_id STRING, Latitude FLOAT, Longitude FLOAT, Stars FLOAT"
    df = spark.read.csv(CSV_PATH, schema=schema)

    # Calculate haversine distances:
    #   a = sin^2(dlat / 2) + cos(lat_ref) * cos(lat) * sin^2(dlon / 2)
    #   d = 2 * R * asin(sqrt(a))
    # The previous acos-based (spherical law of cosines) expression could hand
    # acos an argument marginally above 1 because of floating-point rounding
    # when a business sits at (or extremely close to) the reference point,
    # yielding NaN instead of ~0. The haversine form is well-conditioned for
    # short distances, and null coordinates still propagate to a null Distance.
    ref_lat = radians(lit(LATITUDE))
    ref_lon = radians(lit(LONGITUDE))
    row_lat = radians("Latitude")
    row_lon = radians("Longitude")
    sin_half_dlat = sin((row_lat - ref_lat) / 2)
    sin_half_dlon = sin((row_lon - ref_lon) / 2)
    hav = (
        sin_half_dlat * sin_half_dlat
        + cos(ref_lat) * cos(row_lat) * sin_half_dlon * sin_half_dlon
    )
    df = df.withColumn("Distance", asin(sqrt(hav)) * 2 * EARTH_RADIUS_KM)

    # If no delta table exists, save and exit
    if not DeltaTable.isDeltaTable(spark, TABLE_PATH):
        df.write.format("delta").save(TABLE_PATH)
        return

    # Upsert delta table. The same column -> "new.<column>" mapping drives
    # both the update and the insert clause, so it is built once.
    # NOTE(review): Delta's merge rejects a source in which several rows match
    # the same target row; if the CSV can repeat a Business_id, deduplicate
    # before merging -- confirm against the data.
    assignments = {name: f"new.{name}" for name in COLUMNS}
    delta_table = DeltaTable.forPath(spark, TABLE_PATH)
    (
        delta_table.alias("old")
        .merge(df.alias("new"), "old.Business_id = new.Business_id")
        .whenMatchedUpdate(set=assignments)
        .whenNotMatchedInsert(values=assignments)
        .execute()
    )


# Run the job only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()