#### Reading Json using spark

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
import pandas as pd
import numpy as np

In [3]:
spark = SparkSession.builder.appName('JSON Data').getOrCreate()

In [4]:
shows1 = spark.read.json('./DataAnalysisPythonPySpark/data/data/shows/shows-breaking-bad.json')
shows2 = spark.read.json('./DataAnalysisPythonPySpark/data/data/shows/shows-golden-girls.json')
shows3 = spark.read.json('./DataAnalysisPythonPySpark/data/data/shows/shows-silicon-valley.json')
# shows = spark.read.json('./DataAnalysisPythonPySpark/data/data/shows/shows-*.json', multiLine=True)

In [5]:
shows1.count()
shows2.count()
shows3.count()

1

In [8]:
shows1.select("name", "genres").show(1, False)
shows2.select("name", "genres").show(1, False)
shows3.select("name", "genres").show(1, False)

print("Columns: ", shows1.columns)
print("Columns: ", shows2.columns)
print("Columns: ", shows3.columns)

+------------+------------------------+
|name        |genres                  |
+------------+------------------------+
|Breaking Bad|[Drama, Crime, Thriller]|
+------------+------------------------+

+----------------+---------------+
|name            |genres         |
+----------------+---------------+
|The Golden Girls|[Drama, Comedy]|
+----------------+---------------+

+--------------+--------+
|name          |genres  |
+--------------+--------+
|Silicon Valley|[Comedy]|
+--------------+--------+

Columns:  ['_embedded', '_links', 'externals', 'genres', 'id', 'image', 'language', 'name', 'network', 'officialSite', 'premiered', 'rating', 'runtime', 'schedule', 'status', 'summary', 'type', 'updated', 'url', 'webChannel', 'weight']
Columns:  ['_embedded', '_links', 'externals', 'genres', 'id', 'image', 'language', 'name', 'network', 'officialSite', 'premiered', 'rating', 'runtime', 'schedule', 'status', 'summary', 'type', 'updated', 'url', 'webChannel', 'weight']
Columns:  ['_embedde

In [25]:
shows1.select('name', 'genres', 'image', 'summary').show()

+------------+--------------------+--------------------+--------------------+
|        name|              genres|               image|             summary|
+------------+--------------------+--------------------+--------------------+
|Breaking Bad|[Drama, Crime, Th...|{http://static.tv...|<p><b>Breaking Ba...|
+------------+--------------------+--------------------+--------------------+



In [46]:
from pyspark.sql import DataFrame

In [65]:
# [_[0] == _[1] for _ in zip(shows1.dtypes,shows2.dtypes)]
print(shows3.dtypes[19],shows2.dtypes[19])

('webChannel', 'string') ('webChannel', 'string')


In [66]:
shows = shows3.union(shows2)
print("shows1 and shows2 joinned")
# show2 = shows1.union(shows3)
print("shows1 and shows3 joinned")
# shows = show1.union(show2)
print("show1 and show2 joinned")
shows.count()
# shows3.printSchema()

shows1 and shows2 joinned
shows1 and shows3 joinned
show1 and show2 joinned


2

In [71]:
# shows.select('name','genres').show()
array_subset = shows.select('name','genres')
array_subset.show()

+----------------+---------------+
|            name|         genres|
+----------------+---------------+
|  Silicon Valley|       [Comedy]|
|The Golden Girls|[Drama, Comedy]|
+----------------+---------------+



In [72]:
array_subset = array_subset.select(
    'name',
    array_subset.genres[0].alias('dot_and_index'),
    F.col('genres')[0].alias('col_and_index'),
    array_subset.genres.getItem(0).alias('dot_and_method'),
    F.col('genres').getItem(0).alias('col_and_method')
)
array_subset.show(5)

+----------------+-------------+-------------+--------------+--------------+
|            name|dot_and_index|col_and_index|dot_and_method|col_and_method|
+----------------+-------------+-------------+--------------+--------------+
|  Silicon Valley|       Comedy|       Comedy|        Comedy|        Comedy|
|The Golden Girls|        Drama|        Drama|         Drama|         Drama|
+----------------+-------------+-------------+--------------+--------------+



In [80]:
# Multiple operation on array column
array_subset_repeat = array_subset.select(
    'name',
    F.lit('Comedy').alias('one'),
    F.lit('Horror').alias('two'),
    F.lit('Drama').alias('three'),
    F.col('dot_and_index'),
).select(
    'name',
    F.array('one', 'two', 'three').alias("Some_Genres"),
    F.array_repeat('dot_and_index', 5).alias('Repeated_Genres')
)

array_subset_repeat.show(truncate=False)
array_subset_repeat.select(
    'name', F.size('Some_Genres'), F.size('Repeated_Genres')
).show(truncate=False)
print("\nDistinct array elements.")
array_subset_repeat.select(
    'name', F.array_distinct('Some_Genres'),
    F.array_distinct('Repeated_Genres')
).show(truncate=False)

+----------------+-----------------------+----------------------------------------+
|name            |Some_Genres            |Repeated_Genres                         |
+----------------+-----------------------+----------------------------------------+
|Silicon Valley  |[Comedy, Horror, Drama]|[Comedy, Comedy, Comedy, Comedy, Comedy]|
|The Golden Girls|[Comedy, Horror, Drama]|[Drama, Drama, Drama, Drama, Drama]     |
+----------------+-----------------------+----------------------------------------+

+----------------+-----------------+---------------------+
|name            |size(Some_Genres)|size(Repeated_Genres)|
+----------------+-----------------+---------------------+
|Silicon Valley  |3                |5                    |
|The Golden Girls|3                |5                    |
+----------------+-----------------+---------------------+


Distinct array elements.
+----------------+---------------------------+-------------------------------+
|name            |array_distinct(So

In [82]:
# Intersection of columns elements
array_subset_repeat = array_subset_repeat.select(
    'name', F.array_intersect('Some_Genres', 'Repeated_Genres').alias('Genres')
)
array_subset_repeat.show()

+----------------+--------+
|            name|  Genres|
+----------------+--------+
|  Silicon Valley|[Comedy]|
|The Golden Girls| [Drama]|
+----------------+--------+

