In [14]:
#Query 3 Sedona for LA
import time

from sedona.spark import *
from pyspark.sql.functions import col, broadcast, date_format, to_timestamp, round, format_number, concat, lit, when, count, sum, regexp_replace, trim
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

# Start (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("GeoJSON read")
    .getOrCreate()
)

# Build the Sedona context on top of the Spark session; it is used below
# to read the GeoJSON census-block file.
sedona = SedonaContext.create(spark)
# Read the 2010 census blocks GeoJSON from S3 as a multi-line document and
# turn every entry of the `features` array into its own row.
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
feature_rows = sedona.read.format("geojson").option("multiLine", "true").load(geojson_path)
blocks_df = feature_rows.selectExpr("explode(features) as features").select("features.*")

# Flatten the nested `properties` struct: each of its fields becomes a
# top-level column of the same name, and `geometry` is carried along.
property_names = blocks_df.schema["properties"].dataType.fieldNames()
property_columns = [col(f"properties.{name}").alias(name) for name in property_names]

# Keep only the blocks whose CITY attribute is "Los Angeles".
flattened_df = (
    blocks_df.select(property_columns + ["geometry"])
    .drop("properties")
    .drop("type")
    .filter(col("CITY") == "Los Angeles")
)

# Query 3 (DataFrame API): start the wall-clock timer for the whole query.
start_time = time.time()

# Crime records are read from the Parquet copy of the main dataset for
# quicker results.
crime_parquet_path = "s3://groups-bucket-dblab-905418150721/group12/main_dataset_parquet"

# A row is kept when at least one coordinate is non-zero, i.e. only rows
# with LON == 0 and LAT == 0 at the same time are discarded.
has_location = (col("LON") != 0) | (col("LAT") != 0)

# Only the point geometry built from (LON, LAT) is needed downstream.
crime_df = (
    spark.read.parquet(crime_parquet_path)
    .filter(has_location)
    .withColumn("point", ST_Point("LON", "LAT"))
    .select("point")
)

# Income table (LA, 2015) read from CSV with a header row and inferred types.
income_csv_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
income_df = spark.read.csv(income_csv_path, header=True, inferSchema=True)

# Turn the textual "Estimated Median Income" into an integer by trimming it
# and removing every character that is not a digit.
income_digits = regexp_replace(trim(col("Estimated Median Income")), "[^0-9]", "")

# Keep the numeric column only; the original text column and "Community"
# are not used by the rest of the query.
res_income = (
    income_df
    .withColumn("Estimated_Median_Income", income_digits.cast(IntegerType()))
    .drop("Community", "Estimated Median Income")
)

# Census blocks are usable only when zip code, population and housing count
# are all positive and the community name is not blank.
is_usable_block = (
    (col("ZCTA10") > 0)
    & (col("POP_2010") > 0)
    & (col("HOUSING10") > 0)
    & (trim(col("COMM")) != "")
)

# Aggregate the usable blocks per (zip code, community): summed population,
# summed HOUSING10, and the union of the block geometries.
zip_comm_population = (
    flattened_df
    .select("ZCTA10", "COMM", "POP_2010", "HOUSING10", "geometry")
    .filter(is_usable_block)
    .groupBy("ZCTA10", "COMM")
    .agg(
        sum("POP_2010").alias("Total_Population_Zip_COMM"),
        sum("HOUSING10").alias("Total_Housing"),
        ST_Union_Aggr("geometry").alias("geometry"),
    )
)
    
# Join income and census data on zip code, then roll the result up to one
# row per community (COMM).
zip_matches_income = zip_comm_population["ZCTA10"] == res_income["Zip Code"]

# NOTE(review): res_income is the side that was marked "#BROADCAST", but that
# was only a comment; no join hint is applied here.
joined_income_df = (
    zip_comm_population.join(res_income, zip_matches_income, "inner")
    .drop("ZCTA10", "Zip Code")
    # Income attributed to each (zip, community) slice: housing count times
    # the zip code's estimated median income.
    .withColumn("Total income", col("Total_Housing") * col("Estimated_Median_Income"))
    .drop("Estimated_Median_Income")
    .groupBy("COMM")
    .agg(
        sum("Total_Population_Zip_COMM").alias("Total_Population"),
        sum("Total income").alias("Total Income"),
        ST_Union_Aggr("geometry").alias("geometry"),
    )
)


# Numeric expressions reused for both sorting and display.
crimes_per_person = col("Total crimes") / col("Total_Population")
income_per_person = round(col("Total Income") / col("Total_Population"))

# Spatial join: every crime point is matched to the community polygon that
# contains it, then crimes are counted per community.
results = (
    joined_income_df.join(
        crime_df,  # side that was marked "#BROADCAST"; no hint is actually applied
        ST_Within(crime_df.point, joined_income_df.geometry),
        "inner",
    )
    .groupBy("COMM", "Total_Population", "Total Income")
    .agg(count(col("*")).alias("Total crimes"))
    # Sort on the numeric ratio BEFORE formatting. format_number() returns a
    # string, so ordering by the formatted column compares text
    # ("9.5000" > "10.2000") and gives a wrong ranking.
    .orderBy(crimes_per_person.desc())
    # Same three output columns, in the same order, as before:
    # Location, crime_rate (4 decimals), estimated_median_income ("$" + rounded value).
    .select(
        col("COMM").alias("Location"),
        format_number(crimes_per_person, 4).alias("crime_rate"),
        concat(lit("$"), format_number(income_per_person, 0)).alias("estimated_median_income"),
    )
)
results.show()
end_time = time.time()
# End timing
# NOTE(review): the label below says BROADCAST, but no broadcast hint is applied
# to either join in this cell -- confirm whether a hint was intended.
print(f"Execution Time using BROADCAST : {end_time - start_time} seconds")



An error was encountered:
Session 3643 unexpectedly reached final status 'dead'. See logs:
stdout: 

stderr: 
25/01/19 14:30:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/19 14:30:48 INFO DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at ip-192-168-1-36.eu-central-1.compute.internal/192.168.1.36:8032
25/01/19 14:30:49 INFO Configuration: resource-types.xml not found
25/01/19 14:30:49 INFO ResourceUtils: Unable to find 'resource-types.xml'.
25/01/19 14:30:49 INFO Client: Verifying our application has not requested more than the maximum memory capability of the cluster (12288 MB per container)
25/01/19 14:30:49 INFO Client: Will allocate AM container, with 1384 MB memory including 384 MB overhead
25/01/19 14:30:49 INFO Client: Setting up container launch context for our AM
25/01/19 14:30:49 INFO Client: Setting up the launch environment for our AM container
25/01/19 14:30:49 INFO Cl

In [None]:
#Hint BROADCAST and Explain

In [None]:
#Hint MERGE and Explain

In [None]:
#Hint SHUFFLE_HASH and Explain

In [None]:
#Hint Broadcast and Explain