# Quick Tutorial on JSON Operations

In [1]:
import json

# Sample record mixing a string, a list, a boolean and a number.
demo_data = {
    "name": "Bob",
    "languages": ["English", "Fench"],
    "married": True,
    "age": 32,
}
demo_json_path = './data/json_demo.json'

# Serialize the record to JSON text, then write that text to the file in one call.
with open(demo_json_path, mode='w') as f:
    f.write(json.dumps(demo_data))

In [2]:
# Shell escape: print the file written by the cell above to inspect the serialized JSON text.
!cat ./data/json_demo.json

{"name": "Bob", "languages": ["English", "Fench"], "married": true, "age": 32}

In [3]:
# Read the file's text and parse it back into Python objects.
with open(demo_json_path, mode='r') as f:
    json_data = json.loads(f.read())

# Show the parsed object; the output is its Python repr, not JSON text.
print(json_data)
    

{'name': 'Bob', 'languages': ['English', 'Fench'], 'married': True, 'age': 32}


# Look at the data

In [4]:
# Shell escape: print the raw input file. The captured output shows one JSON object
# per line, and the first record has no "age" key.
!cat ./data/people.json

{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}

# Create SparkSession

In [5]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if one exists; otherwise build a new one
# with default settings.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# JSON to Spark DataFrame         
## Semi-structured Data

In [6]:
# Obtain an SQLContext on top of the SparkContext owned by the existing session.
# NOTE(review): SQLContext is the legacy entry point and is marked deprecated in
# Spark 3.x; the SparkSession (`spark`) offers the same read/sql API - confirm
# whether this wrapper is still needed.
from pyspark.sql import SQLContext

sqlContext = SQLContext.getOrCreate(
    spark.sparkContext
)

In [7]:
# Load the JSON file into a DataFrame through the SparkSession's own reader.
# The later `spark.sql(...)` query already goes through `spark`, so reading through
# it as well keeps a single entry point and avoids the deprecated SQLContext.
people_df = spark.read.json("./data/people.json")

In [8]:
# Expose the DataFrame to SQL under the name "people_table"
# (an existing view with that name is replaced).
people_df.createOrReplaceTempView(name="people_table")

# Fetch every row and column through the session's SQL interface and print the result.
all_people_df = spark.sql(sqlQuery="SELECT * FROM people_table")
all_people_df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



# RDD to Spark DataFrame

In [10]:
# Classic word count on an RDD, then conversion of the result to a DataFrame.
sc = spark.sparkContext
input_data_path = '../week1/big_data_intro.txt'
text_file = sc.textFile(input_data_path)

# Split each line on single spaces, pair each token with 1, then sum the 1s per token.
counts_rdd = (
    text_file
    .flatMap(lambda text_line: text_line.split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

# NOTE(review): the column labels 'name' and 'age' look carried over from the people
# example; the tuples hold (word, occurrence count). Kept as-is to match the output.
counts_df = counts_rdd.toDF(['name','age'])

# Print the first 10 rows without truncating long cell values.
counts_df.show(n=10, truncate=False)

+---------------+---+
|name           |age|
+---------------+---+
|is             |59 |
|term           |7  |
|non-traditional|1  |
|needed         |2  |
|gather         |2  |
|organize       |2  |
|process        |15 |
|large          |18 |
|datasets       |11 |
|of             |111|
+---------------+---+
only showing top 10 rows

