# Quick Tutorial of JSON Operations

In [12]:
# Standard-library modules: json for (de)serialization, os for directory handling.
import json
import os

# A plain Python dictionary that will be serialized to JSON.
demo_data = {"name": "Bob","languages": ["English", "Fench"],"married": True,"age": 32}

demo_json_path = './data/json_demo.json'

# open() does not create missing parent directories, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(demo_json_path), exist_ok=True)

# Serialize the Python object to the JSON file. The explicit encoding avoids
# depending on the platform's default text encoding.
with open(demo_json_path, 'w', encoding='utf-8') as f:
    json.dump(demo_data, f)

In [13]:
# Print the raw contents of the JSON file written by the previous cell.
!cat ./data/json_demo.json

{"name": "Bob", "languages": ["English", "Fench"], "married": true, "age": 32}

In [14]:
# Parse the JSON file back into a Python object, then display the result.
with open(demo_json_path, mode='r') as json_file:
    json_data = json.load(json_file)

print(json_data)

{'name': 'Bob', 'languages': ['English', 'Fench'], 'married': True, 'age': 32}


In [16]:
# Check which Python type json.load produced for the top-level JSON object.
type(json_data)

dict

# Look at the data

In [17]:
# Show the raw input file: it holds one JSON object per line.
!cat ./data/people.json

{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}

# Create SparkSession

In [18]:
from pyspark.sql import SparkSession
# Reuse the active SparkSession if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# JSON to Spark DataFrame
## Semi-structured Data
## Create SQLContext

In [19]:
# Import SQLContext, the entry point used below to read the JSON file.
from pyspark import SQLContext

# Get (or create) the SQLContext bound to the session's SparkContext.
# NOTE(review): SQLContext is documented as deprecated since Spark 3.0 in favor
# of SparkSession (spark.read exposes the same reader) - confirm the Spark
# version in use before switching.
sqlContext = SQLContext.getOrCreate(spark.sparkContext)

In [20]:
# Load the JSON file (one object per line) into a Spark DataFrame.
people_df = sqlContext.read.json("./data/people.json")

In [26]:
# Fetch every row of the DataFrame into this Python process as a list of Row objects.
people_df.rdd.collect()

[Row(age=None, name='Michael'),
 Row(age=30, name='Andy'),
 Row(age=19, name='Justin')]

In [30]:
sc = spark.sparkContext

# Plain Python data: a list of dictionaries, one per person.
people_python_object = [
    {'age': None, 'name': 'Michael'},
    {'age': 30, 'name': 'Andy'},
    {'age': 19, 'name': 'Justin'},
]
print('this is python object', people_python_object)

# Distribute the Python list as a Spark RDD. Printing the RDD shows only a
# handle to it; collect() returns the actual elements as a Python list.
people_rdd = sc.parallelize(people_python_object)
print('this is a rdd in slave node', people_rdd)
print('this is all the rdd data', people_rdd.collect())

# Convert the RDD into a DataFrame using the given column names.
# NOTE(review): this rebinds people_df, which an earlier cell loaded from
# people.json - confirm the overwrite is intended.
# NOTE(review): PySpark reports schema inference from dict rows as deprecated
# in favor of pyspark.sql.Row - verify for the Spark version in use.
people_column_names = ['age', 'name']
people_df = people_rdd.toDF(people_column_names)
people_df.show()

# A DataFrame exposes its rows as an RDD through the .rdd attribute.
people_df_rdd = people_df.rdd

this is python object [{'age': None, 'name': 'Michael'}, {'age': 30, 'name': 'Andy'}, {'age': 19, 'name': 'Justin'}]
this is a rdd in slave node ParallelCollectionRDD[38] at parallelize at PythonRDD.scala:195
this is all the rdd data [{'age': None, 'name': 'Michael'}, {'age': 30, 'name': 'Andy'}, {'age': 19, 'name': 'Justin'}]




+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [31]:
# Register the DataFrame under a table name so SQL statements can refer to it.
people_df.createOrReplaceTempView("people_table")

# Keep only the rows whose age is greater than 20; rows with a null age do not
# satisfy the comparison and are therefore excluded as well.
age_filter_query = "SELECT * FROM people_table WHERE age > 20"
all_people_df = spark.sql(age_filter_query)
all_people_df.show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



# RDD to Spark DataFrame

In [33]:
sc = spark.sparkContext
input_data_path = '../week1/big_data_intro.txt'
text_file = sc.textFile(input_data_path)

# Word count in three stages: split lines into words, pair each word with 1,
# then sum the 1s per word.
# Splitting on a single space turns empty lines and repeated spaces into ''
# tokens, which are counted like any other word (the blank row in the output).
words_rdd = text_file.flatMap(lambda text_line: text_line.split(" "))
word_pairs_rdd = words_rdd.map(lambda token: (token, 1))
counts_rdd = word_pairs_rdd.reduceByKey(lambda left, right: left + right)

# Name the tuple positions and print up to 100 rows without truncating values.
word_count_columns = ['word', 'count']
counts_df = counts_rdd.toDF(word_count_columns)
counts_df.show(100, False)

+---------------+-----+
|word           |count|
+---------------+-----+
|is             |59   |
|term           |7    |
|non-traditional|1    |
|needed         |2    |
|gather         |2    |
|organize       |2    |
|process        |15   |
|large          |18   |
|datasets       |11   |
|of             |111  |
|working        |4    |
|power          |1    |
|storage        |9    |
|single         |7    |
|new            |4    |
|value          |5    |
|this           |13   |
|type           |5    |
|in             |41   |
|years          |1    |
|               |38   |
|we             |7    |
|fundamental    |1    |
|common         |4    |
|concepts       |3    |
|researching    |1    |
|subject        |1    |
|take           |3    |
|high-level     |1    |
|look           |2    |
|at             |8    |
|processes      |4    |
|used           |13   |
|space          |2    |
|What           |2    |
|Data?          |1    |
|down           |2    |
|projects       |6    |
|vendors        

In [38]:
# Expose the word counts to Spark SQL under the name "count_table".
counts_df.createOrReplaceTempView("count_table")

# Select the words that occur more than 20 times; show() prints only the
# first 20 rows by default.
frequent_words_query = "SELECT * FROM count_table WHERE count > 20"
filtered_count_df = spark.sql(frequent_words_query)
filtered_count_df.show()

+----------+-----+
|      word|count|
+----------+-----+
|        is|   59|
|        of|  111|
|        in|   41|
|          |   38|
|       are|   35|
|   systems|   24|
|    Apache|   22|
|      data|  126|
|         a|   66|
|       for|   42|
|       the|  145|
|       and|  103|
|        to|   87|
|      with|   26|
|      that|   32|
| computing|   21|
|       big|   35|
|processing|   26|
|        be|   28|
|       can|   33|
+----------+-----+
only showing top 20 rows

