In [44]:
# Spark init
import pyspark
from pyspark.sql import SparkSession
mongo_uri = "mongodb://admin:mongopw@mongo:27017/admin?authSource=admin"

spark = SparkSession \
    .builder \
    .master("local") \
    .appName('jupyter-pyspark') \
      .config("spark.mongodb.input.uri", mongo_uri) \
      .config("spark.mongodb.output.uri", mongo_uri) \
      .config("spark.jars.packages","org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")\
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [45]:
#1 
df = spark.read.json("file:///home/jovyan/datasets/json-samples/US-Senators.json")
df.write.format("mongo").mode("overwrite").option("database","labf").option("collection","senators").save()
df.printSchema()

root
 |-- caucus: string (nullable = true)
 |-- congress_numbers: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- current: boolean (nullable = true)
 |-- description: string (nullable = true)
 |-- district: string (nullable = true)
 |-- enddate: string (nullable = true)
 |-- extra: struct (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- contact_form: string (nullable = true)
 |    |-- end-type: string (nullable = true)
 |    |-- fax: string (nullable = true)
 |    |-- how: string (nullable = true)
 |    |-- office: string (nullable = true)
 |    |-- rss_url: string (nullable = true)
 |-- leadership_title: string (nullable = true)
 |-- party: string (nullable = true)
 |-- person: struct (nullable = true)
 |    |-- bioguideid: string (nullable = true)
 |    |-- birthday: string (nullable = true)
 |    |-- cspanid: long (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- gend

In [51]:
df = spark.read.format("mongo").option("database","labf").option("collection","senators").load()

df.filter( df.senator_rank == "junior").select("person").printSchema()

root
 |-- person: struct (nullable = true)
 |    |-- bioguideid: string (nullable = true)
 |    |-- birthday: string (nullable = true)
 |    |-- cspanid: long (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- gender_label: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |    |-- link: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- namemod: string (nullable = true)
 |    |-- nickname: string (nullable = true)
 |    |-- osid: string (nullable = true)
 |    |-- pvsid: string (nullable = true)
 |    |-- sortname: string (nullable = true)
 |    |-- twitterid: string (nullable = true)
 |    |-- youtubeid: string (nullable = true)



In [60]:
df.select("party").distinct().explain()


== Physical Plan ==
*(2) HashAggregate(keys=[party#2054], functions=[])
+- Exchange hashpartitioning(party#2054, 200), ENSURE_REQUIREMENTS, [id=#1083]
   +- *(1) HashAggregate(keys=[party#2054], functions=[])
      +- *(1) Scan MongoRelation(MongoRDD[402] at RDD at MongoRDD.scala:51,Some(StructType(StructField(_id,StructType(StructField(oid,StringType,true)),true), StructField(caucus,StringType,true), StructField(congress_numbers,ArrayType(LongType,true),true), StructField(current,BooleanType,true), StructField(description,StringType,true), StructField(enddate,StringType,true), StructField(extra,StructType(StructField(address,StringType,true), StructField(contact_form,StringType,true), StructField(end-type,StringType,true), StructField(fax,StringType,true), StructField(how,StringType,true), StructField(office,StringType,true), StructField(rss_url,StringType,true)),true), StructField(leadership_title,StringType,true), StructField(party,StringType,true), StructField(person,StructType(Str

In [61]:
df.createOrReplaceTempView("senators")
query = "select distinct party from senators"
spark.sql(query).explain()

== Physical Plan ==
*(2) HashAggregate(keys=[party#2054], functions=[])
+- Exchange hashpartitioning(party#2054, 200), ENSURE_REQUIREMENTS, [id=#1103]
   +- *(1) HashAggregate(keys=[party#2054], functions=[])
      +- *(1) Scan MongoRelation(MongoRDD[402] at RDD at MongoRDD.scala:51,Some(StructType(StructField(_id,StructType(StructField(oid,StringType,true)),true), StructField(caucus,StringType,true), StructField(congress_numbers,ArrayType(LongType,true),true), StructField(current,BooleanType,true), StructField(description,StringType,true), StructField(enddate,StringType,true), StructField(extra,StructType(StructField(address,StringType,true), StructField(contact_form,StringType,true), StructField(end-type,StringType,true), StructField(fax,StringType,true), StructField(how,StringType,true), StructField(office,StringType,true), StructField(rss_url,StringType,true)),true), StructField(leadership_title,StringType,true), StructField(party,StringType,true), StructField(person,StructType(Str

In [25]:
from pyspark.sql.functions import explode, col, lit, struct
x = df.select("extra",explode(df.congress_numbers).alias("cong_number") )
y = x.select("extra.office", "extra.fax", "cong_number", "extra")
# this code makes a struct callled "test" then adds 2 columns "fax" and "addr" lines 5-8
y = y.withColumn("test", struct( 
    (col("extra.fax")).alias("fax"), 
    (col("extra.address")).alias("addr"))
)
y.printSchema()
y.show()

root
 |-- office: string (nullable = true)
 |-- fax: string (nullable = true)
 |-- cong_number: long (nullable = true)
 |-- extra: struct (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- contact_form: string (nullable = true)
 |    |-- end-type: string (nullable = true)
 |    |-- fax: string (nullable = true)
 |    |-- how: string (nullable = true)
 |    |-- office: string (nullable = true)
 |    |-- rss_url: string (nullable = true)
 |-- test: struct (nullable = false)
 |    |-- fax: string (nullable = true)
 |    |-- addr: string (nullable = true)

+--------------------+------------+-----------+--------------------+--------------------+
|              office|         fax|cong_number|               extra|                test|
+--------------------+------------+-----------+--------------------+--------------------+
|239 Dirksen Senat...|202-228-1375|        115|{239 Dirksen Sena...|{202-228-1375, 23...|
|239 Dirksen Senat...|202-228-1375|        116|{239 Dirksen 

In [26]:
#4
nfcan = spark.read.option("multiline",True).json("file:///home/jovyan/datasets/netflix-canceled-2021/*.json")
nfcan.write.format("mongo").mode("overwrite").option("database","labf").option("collection","nfcan").save()

                                                                                

In [28]:
nfcan = spark.read.format("mongo").option("database","labf").option("collection","nfcan").load()
nfcan.printSchema()

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- airdate: string (nullable = true)
 |    |    |    |-- airstamp: string (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- number: long (nullable = true)
 |    |    |    |-- rating: struct (nullable = true)
 |    |    |    |    |-- average: double (nullable = true)
 |    |    |    |-- runtime: long (nullable = true)
 |    |    |    |-- season: long (nullabl

In [67]:
from pyspark.sql.functions import col, explode
#nfcan.printSchema()
# of show name, season number, episode, number, 
# episode name, airdate, and average rating (for the episode).
tmp = nfcan.select( col("name").alias("showname"),explode("_embedded.episodes").alias("episode"))
eps = tmp.select("showname", col("episode.name").alias("epname"), \
                 "episode.season", "episode.number", "episode.airdate", "episode.rating.average")

nfcan.select( col("name").alias("showname")).show()
tmp.show()
eps.show()


In [73]:
nfcan.select( col("name").alias("showname")).show()
tmp.show()
eps.show()


+--------------------+
|            showname|
+--------------------+
|      Peaky Blinders|
|   Kim's Convenience|
|         On My Block|
|    The Last Kingdom|
|        Mr. Iglesias|
|            The Crew|
|              Cursed|
|             Special|
|            #blackAF|
|    Jupiter's Legacy|
|Dad Stop Embarras...|
|             Bonding|
|     Country Comfort|
|        Cowboy Bebop|
|          Zero Chill|
|Julie and the Pha...|
|      The Irregulars|
|          Grand Army|
|         The Duchess|
+--------------------+

+--------------+--------------------+
|      showname|             episode|
+--------------+--------------------+
|Peaky Blinders|{{{https://api.tv...|
|Peaky Blinders|{{{https://api.tv...|
|Peaky Blinders|{{{https://api.tv...|
|Peaky Blinders|{{{https://api.tv...|
|Peaky Blinders|{{{https://api.tv...|
|Peaky Blinders|{{{https://api.tv...|
|Peaky Blinders|{{{https://api.tv...|
|Peaky Blinders|{{{https://api.tv...|
|Peaky Blinders|{{{https://api.tv...|
|Peaky Blinder

In [64]:
# query to get the lowest rated episodes of each season for the cancelled shows. 
# Display show name, season number, episode number, episode name, and rating for that episode. 
eps.createOrReplaceTempView("eps")

In [65]:
query = '''
with source as (
    select showname, epname, season, number, airdate, average,
        min(average) over (partition by showname, season) as lowest_rated_in_season
    from eps
)
select * from source where lowest_rated_in_season = average
order by showname, season
'''
df = spark.sql(query)

In [66]:
df.show()

+--------------------+--------------------+------+------+----------+-------+----------------------+
|            showname|              epname|season|number|   airdate|average|lowest_rated_in_season|
+--------------------+--------------------+------+------+----------+-------+----------------------+
|            #blackAF|  because of slavery|     1|     1|2020-04-17|    5.3|                   5.3|
|            #blackAF|because of slaver...|     1|     2|2020-04-17|    5.3|                   5.3|
|             Bonding|      Into the Woods|     1|     7|2019-04-24|    8.0|                   8.0|
|             Bonding|Old Friends, New ...|     1|     1|2019-04-24|    8.0|                   8.0|
|             Bonding|            Penguins|     1|     6|2019-04-24|    8.0|                   8.0|
|             Bonding|Stand Me Up, Stan...|     2|     7|2021-01-27|    7.5|                   7.5|
|             Bonding|          Permission|     2|     8|2021-01-27|    7.5|                   7.5|


                                                                                

In [39]:
from IPython.display import display, HTML, Image,YouTubeVideo
from ipywidgets import interact, interact_manual, widgets

In [41]:
display(HTML("<h1>Netflix Cancelled Shows of 2021</h1>"))
shows = nfcan.select("name").distinct().sort("name").toPandas()["name"].values
listwidget =widgets.Select(
    options=shows,
    value='Bonding',
    # rows=10,
    description='Pick A Show:',
    disabled=False
)

@interact(show=listwidget)
def onchange(show):
    info = nfcan.select("name","summary", "image.medium", "status", "rating.average")\
        .where(nfcan.name == show).toPandas().iloc[0]
    display(HTML(f"<h3>{info['name']}</h3>"))
    display(HTML(f"<p>STATUS: <b>{info['status']}</b> RATING: <b>{info['average']}</b>"))
    display(HTML(info['summary']))
    display(Image(url=info['medium']))
    
    display(YouTubeVideo(id="ULCIHP5dc44"))

interactive(children=(Select(description='Pick A Show:', index=1, options=('#blackAF', 'Bonding', 'Country Com…