# Spark and Hadoop MPP Demonstration
Peter Benzoni

### Load Assessments data and print initial info

### Step 1 – Load JSON Data; Step 2 – Pretty Print First JSON Object

In [None]:
from pyspark.sql import SparkSession
from pprint import pprint

# Start (or reuse) the Spark session used by every later cell.
spark = SparkSession.builder.appName("MPP Demo").getOrCreate()

# Step 1: load the assessments JSON file into a DataFrame and preview it.
assessments_1 = spark.read.json("./data/assessments.json")
assessments_1.show()

# Step 2: pretty-print the first record as a plain dict.
# recursive=True also converts nested Row values (struct columns, if the
# data has any) into dicts so pprint can lay out the whole structure; the
# default asDict() would leave them as single-line Row(...) reprs.
first_record = assessments_1.first()
pprint(first_record.asDict(recursive=True), indent=3)

### Step 3 – Recursive Walk First JSON Object

In [None]:
def recursive_walk(obj: object, indent: int = 0) -> None:
    """Print the keys and leaf values of a nested dict/list structure.

    Each dict key is printed as ``key:`` and each list position as
    ``List item N:``; the contents of that entry are printed on the
    following lines, three spaces deeper. Any value that is neither a
    dict nor a list is printed with ``str``.

    Args:
        obj: The value to walk (dict, list, or any printable leaf).
        indent: Number of leading spaces for the current nesting level.
    """
    pad = ' ' * indent
    if isinstance(obj, dict):
        for k, v in obj.items():
            # str(k) so non-string keys (ints, None, ...) are printed
            # instead of raising TypeError on string concatenation.
            print(pad + str(k) + ':')
            recursive_walk(v, indent + 3)
    elif isinstance(obj, list):
        for i, item in enumerate(obj):
            print(pad + 'List item ' + str(i) + ':')
            recursive_walk(item, indent + 3)
    else:
        print(pad + str(obj))
# recursive=True converts nested Row values into dicts; with the default
# asDict() they stay Row objects (neither dict nor list), so the walk
# would print them as a single leaf instead of descending into them.
recursive_walk(first_record.asDict(recursive=True))

### Step 4 – Demonstrate MPP Spark Transform


In [None]:
# Keep only the rows whose `submitted` column equals 1.
is_submitted = assessments_1["submitted"] == 1
assessments_2 = assessments_1.where(is_submitted)

### Step 5 – Spark SQL in Memory Query

In [None]:
# Register the filtered DataFrame as a temporary view, then select the
# submitted rows from that view with Spark SQL.
assessments_2.createOrReplaceTempView("temp_assessments")
submitted_sql = "SELECT * FROM temp_assessments WHERE submitted = 1"
assessments_3 = spark.sql(submitted_sql)

### Step 6 – Write to Hadoop HDFS Parquet

In [None]:
# Write the query result to HDFS as Parquet.
# mode("overwrite") replaces output left by a previous run; the default save
# mode raises an error when the target path already exists, which breaks
# re-running this notebook.
assessments_3.write.mode("overwrite").parquet("hdfs://hadoop:9000/user/root/assessments_parquet")

### Step 7 – Spark SQL Query against Hadoop HDFS

In [None]:
# Read the Parquet dataset back from HDFS, expose it to Spark SQL as the
# `assessments` view, and define a query for the submitted rows.
parquet_path = "hdfs://hadoop:9000/user/root/assessments_parquet"
assessments_df = spark.read.parquet(parquet_path)
assessments_df.createOrReplaceTempView("assessments")
submitted_from_hdfs_sql = "SELECT * FROM assessments WHERE submitted = 1"
result = spark.sql(submitted_from_hdfs_sql)

### Step 8 – MPP Read from Hadoop HDFS Parquet Columnar Table

In [None]:
# Load the Parquet table from HDFS into a DataFrame and display its rows.
hdfs_table_path = "hdfs://hadoop:9000/user/root/assessments_parquet"
parquet_reader = spark.read
assessments = parquet_reader.parquet(hdfs_table_path)
assessments.show()