In [1]:
# activate Spark in our Colab notebook.
import os
# Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
spark_version = 'spark-3.0.2'
# spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com] [1 InRelease 14.2 kB/88.7 kB 16%] [Connec                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:8 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:10 http://ppa.launchpad.net/cran/libgit2/u

In [2]:
#import packages

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType, DateType,IntegerType

# we are going to use this to time our queries.
import time

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [3]:
# Read in data from S3 Bucket
from pyspark import SparkFiles

# Define url names
url_titles = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-netflix/titles_basic.csv"
url_ratings = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-netflix/ratings.csv"
url_principals = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-netflix/filteredPrincipals.csv"
url_filter_names = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-netflix/filterNames.csv"
urls = [url_titles, url_ratings, url_principals, url_principals, url_filter_names]

#Download all files
for url in urls:
  spark.sparkContext.addFile(url)

df_titles_basic= spark.read.csv(SparkFiles.get("titles_basic.csv"), sep=",", header=True)
df_ratings = spark.read.csv(SparkFiles.get("ratings.csv"), sep=",", header=True)
df_principals = spark.read.csv(SparkFiles.get("filteredPrincipals.csv"), sep=",", header=True)
df_filter_names = spark.read.csv(SparkFiles.get("filterNames.csv"), sep=",", header=True)

In [4]:
# Create Temp Views for each table
df_titles_basic.createOrReplaceTempView("titles")
df_ratings.createOrReplaceTempView("ratings")
df_principals = df_principals.dropDuplicates()
df_principals.createOrReplaceTempView("principals")
df_filter_names.createOrReplaceTempView("filterNames")

In [None]:
# All producers and the movies they are known for 
producers = spark.sql("select nconst, primaryname, knownfortitles from filterNames where primaryprofession like '%producer%' group by nconst, primaryname, knownfortitles")
producers.show()
producers.createOrReplaceTempView('producers')

+---------+--------------------+--------------------+
|   nconst|         primaryname|      knownfortitles|
+---------+--------------------+--------------------+
|nm7745824|         Poppy Begum|tt6400730,tt76105...|
|nm7748384|        Woody Daigle|           tt4687782|
|nm1447145|Jennifer M. Fah-V...|tt0068120,tt09611...|
|nm1443310|      Michael Cotter|tt0805666,tt17915...|
|nm1447108|        Dan Mogensen|tt5896744,tt32810...|
|nm4943760|         Tom Gretzer|           tt2294661|
|nm4947076|      Kerstin Freels|tt0144050,tt59741...|
|nm1447447|      Brian Schulman|tt1139628,tt62178...|
|nm1449220|       Roman Avianus|tt0440963,tt03721...|
|nm0461876|  Michael Knötzinger|tt2278621,tt02597...|
|nm7017597|   Pablo G. Schuller|tt10077544,tt7818...|
|nm3128982|         Scott Weiss|           tt1475139|
|nm7743032|      Courtney Doyle|           tt5095514|
|nm4943717|  Courtney Hitchcock|tt2294829,tt10649...|
|nm8475440|      Tatjana Anders|tt5755816,tt10530...|
|nm8480212|        Jordan Fr




**Ensure there are no duplicates**




In [None]:
spark.sql('select count(*) from producers').show()

+--------+
|count(1)|
+--------+
|  574401|
+--------+



In [None]:
spark.sql('select count( distinct nconst) from producers').show()

+----------------------+
|count(DISTINCT nconst)|
+----------------------+
|                574401|
+----------------------+



**Select all producers and the foreign key to every title for which they are known**

In [None]:
producers_final = spark.sql('select producers.nconst, producers.primaryname, titles_final.tconst from producers left join titles_final on producers.knownfortitles like concat("%",titles_final.tconst,"%")')

In [None]:
producers_final.show()
producers_final.createOrReplaceTempView('prod_final')

+---------+--------------------+----------+
|   nconst|         primaryname|    tconst|
+---------+--------------------+----------+
|nm7745824|         Poppy Begum| tt6400730|
|nm7745824|         Poppy Begum| tt7610596|
|nm7745824|         Poppy Begum| tt4116046|
|nm7745824|         Poppy Begum|tt10196182|
|nm7748384|        Woody Daigle| tt4687782|
|nm1447145|Jennifer M. Fah-V...| tt0896084|
|nm1447145|Jennifer M. Fah-V...| tt0961102|
|nm1447145|Jennifer M. Fah-V...| tt0437758|
|nm1443310|      Michael Cotter| tt2177489|
|nm1443310|      Michael Cotter| tt1791528|
|nm1443310|      Michael Cotter| tt0285403|
|nm1443310|      Michael Cotter| tt0805666|
|nm1447108|        Dan Mogensen| tt1042453|
|nm1447108|        Dan Mogensen| tt2126045|
|nm1447108|        Dan Mogensen| tt5896744|
|nm1447108|        Dan Mogensen| tt3281048|
|nm4943760|         Tom Gretzer| tt2294661|
|nm4947076|      Kerstin Freels| tt9262068|
|nm4947076|      Kerstin Freels|tt11701822|
|nm1447447|      Brian Schulman|

In [None]:
spark.sql('select * from prod_final where tconst like "%tt2177489%"').show()

+---------+--------------------+---------+
|   nconst|         primaryname|   tconst|
+---------+--------------------+---------+
|nm1443310|      Michael Cotter|tt2177489|
|nm0073504|       Dan Berendsen|tt2177489|
|nm0229936|      Shannon Dobson|tt2177489|
|nm2936913|        Allison Rush|tt2177489|
|nm2646410|        Mark Charran|tt2177489|
|nm2337089|         Beth Fraser|tt2177489|
|nm3200587|      Eric Zimmerman|tt2177489|
|nm6591647|Tomas Javier Guardia|tt2177489|
|nm0930397|     Davida Williams|tt2177489|
|nm0774236|     Brian Schnuckel|tt2177489|
|nm8271257|          Tony Towns|tt2177489|
|nm4219090|     Melanie Manooki|tt2177489|
|nm3475387|        Derek Theler|tt2177489|
|nm0555684|        Timothy Marx|tt2177489|
|nm4054704|  Jose Moreno Brooks|tt2177489|
|nm4754902|        Braden Davis|tt2177489|
|nm4755426|    Robbie Silverman|tt2177489|
|nm4388045|       Angie Liggett|tt2177489|
|nm2085006|      Heidi Clements|tt2177489|
|nm1052109|        Janae Bakken|tt2177489|
+---------+

In [5]:
# Create table of all actors
# If birth year is known and death year is known, actor is dead, 
# if birth year is know and death year is unknown, then actor is assumed to be alive,
# otherwis actor status is null

actors = spark.sql("select nconst , primaryname, birthyear, deathyear, primaryprofession, CASE  WHEN (deathyear IS NOT NULL) THEN 0 WHEN (birthyear IS NOT NULL) and (deathyear IS NULL) THEN 1 ELSE NULL END as status from filterNames where primaryprofession like '%act%' group by nconst , primaryname, birthyear, deathyear, primaryprofession")


In [6]:
actors.show()

+----------+--------------------+---------+---------+--------------------+------+
|    nconst|         primaryname|birthyear|deathyear|   primaryprofession|status|
+----------+--------------------+---------+---------+--------------------+------+
| nm0462116|          Jeff Kober|     1953|     null|actor,writer,art_...|     1|
| nm0462319|         Herman Koch|     1953|     null|        actor,writer|     1|
| nm7741448|      Thomas Ganidel|     null|     null|               actor|  null|
| nm7744565|Charles-André Lac...|     null|     null|               actor|  null|
| nm7752147|     Gaihlin St-Onge|     null|     null|             actress|  null|
| nm1443327|   Robert Desrosiers|     null|     null|actor,sound_depar...|  null|
| nm1443573|          Alan Stone|     null|     null|               actor|  null|
| nm1445190|         Siu-Yan Cha|     null|     null|             actress|  null|
| nm1445529|       Glenn Hanning|     null|     null|               actor|  null|
| nm1443396|    

In [8]:
# Create temp View Showing alive and dead actor status 
actors.createOrReplaceTempView('actors')
actors_n_age = spark.sql("select *, CASE WHEN status = 0 and birthyear IS NOT NULL THEN deathyear - birthyear WHEN status = 1 THEN 2021 - birthyear ELSE NULL END AS age from actors")


In [10]:
# Create Temp View from queried data
actors_n_age.createOrReplaceTempView('actors_final')

# Query first 5 lines to confirm that data is as expected
spark.sql('select nconst, primaryname, status, age  from actors_final').show()

+----------+--------------------+------+----+
|    nconst|         primaryname|status| age|
+----------+--------------------+------+----+
| nm0462116|          Jeff Kober|     1|68.0|
| nm0462319|         Herman Koch|     1|68.0|
| nm7741448|      Thomas Ganidel|  null|null|
| nm7744565|Charles-André Lac...|  null|null|
| nm7752147|     Gaihlin St-Onge|  null|null|
| nm1443327|   Robert Desrosiers|  null|null|
| nm1443573|          Alan Stone|  null|null|
| nm1445190|         Siu-Yan Cha|  null|null|
| nm1445529|       Glenn Hanning|  null|null|
| nm1443396|     Fabian Hinrichs|     1|47.0|
| nm1445313|     Kate Dorrington|  null|null|
| nm1445861|Christopher Marti...|     0|68.0|
| nm4943457|        Pierson Fode|     1|30.0|
| nm4946463|         Derek Yates|  null|null|
| nm7743441|         Jordan Kent|  null|null|
|nm10586905|    Sheena Catacutan|  null|null|
|nm10589858|        Crystal Cook|  null|null|
| nm3126747|         Aayam Mehta|  null|null|
| nm3130303|            Aj Platt| 

**Ensure there are no duplicates**

In [None]:
spark.sql("select count(nconst) from actors_final").show()

+-------------+
|count(nconst)|
+-------------+
|      2042638|
+-------------+



In [None]:
spark.sql("select count( distinct nconst) from actors_final").show()

+----------------------+
|count(DISTINCT nconst)|
+----------------------+
|               2042638|
+----------------------+



In [None]:
spark.sql('select * from actors_final where birthyear IS NULL and deathyear IS NOT NULL').show()

+---------+--------------------+---------+---------+--------------------+------+----+
|   nconst|         primaryname|birthyear|deathyear|   primaryprofession|STATUS| AGE|
+---------+--------------------+---------+---------+--------------------+------+----+
|nm0492548|        Gerri Lawlor|     null|     2019|actress,soundtrac...|     0|null|
|nm0653543|Giant Gustav Clau...|     null|     2019|               actor|     0|null|
|nm3365636|     Leonor Coronado|     null|     2013|             actress|     0|null|
|nm0102294|       Marilyn Boyle|     null|     2016|             actress|     0|null|
|nm1264194|      Andreu Solsona|     null|     2018|               actor|     0|null|
|nm1338353|       Shafiq Ansari|     null|     2020|writer,assistant_...|     0|null|
|nm0383360|       Colin Higgins|     null|     2012| actor,miscellaneous|     0|null|
|nm0222644|      Malcolm Devine|     null|     2016|actor,writer,dire...|     0|null|
|nm1258281|          Sunil Rege|     null|     2011|  

In [None]:
spark.sql('select * from titles limit 5').show()

+---+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+
|_c0|   tconst|titletype|        primarytitle|       originaltitle|isadult|startyear|endyear|runtimeminutes|           genres|
+---+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+
|  0|tt0439997|    movie|           500 Almas|           500 Almas|      0|     2004|   null|         105.0|      Documentary|
|  1|tt0439999|tvSpecial|           80s Mania|           80s Mania|      0|     2001|   null|          50.0|            Music|
|  3|tt0440003|    movie|        A-1 Headline|         A1 tou tiao|      0|     2004|   null|          95.0| Mystery,Thriller|
|  4|tt0440004|  tvMovie| AD/BC: A Rock Opera| AD/BC: A Rock Opera|      0|     2004|   null|          30.0|   Comedy,Musical|
|  5|tt0440008|  tvMovie|Abbamania: We Say...|Abbamania: We Say...|      0|     2004|   null|          50.0|Doc

In [None]:
spark.sql('select * from ratings limit 5').show()

+---------+-------------+--------+
|   tconst|averagerating|numvotes|
+---------+-------------+--------+
|tt0214461|          7.4|      74|
|tt0214659|          6.0|      21|
|tt0214878|          6.1|    1566|
|tt0215244|          3.0|       5|
|tt0215402|          8.1|       9|
+---------+-------------+--------+



Films Table

In [None]:
# Left join all movies in the titles table with their corresponding ratings from the ratings data.
titles_final = spark.sql('select titles.tconst, titles.originaltitle, ratings.averagerating, titles.titletype, titles.isadult, concat(RIGHT(concat("000",cast(titles.runtimeminutes/60 as int)),3),":",RIGHT(concat("00",cast(titles.runtimeminutes%60 as int)),2)) as runtime from titles left join ratings on titles.tconst = ratings.tconst')
titles_final.show()

+---------+--------------------+-------------+---------+-------+-------+
|   tconst|       originaltitle|averagerating|titletype|isadult|runtime|
+---------+--------------------+-------------+---------+-------+-------+
|tt0439997|           500 Almas|          7.2|    movie|      0| 001:45|
|tt0439999|           80s Mania|         null|tvSpecial|      0| 000:50|
|tt0440003|         A1 tou tiao|          5.9|    movie|      0| 001:35|
|tt0440004| AD/BC: A Rock Opera|          7.4|  tvMovie|      0| 000:30|
|tt0440008|Abbamania: We Say...|         null|  tvMovie|      0| 000:50|
|tt0440016|       Ah ma yau nan|          5.5|    movie|      0| 001:33|
|tt0440022|        Al atardecer|         null|  tvMovie|      0| 001:06|
|tt0440035|      L'amour en pen|         null|  tvMovie|      0| 000:52|
|tt0440067|      Bau lit do see|          5.4|    movie|      0| 001:39|
|tt0440078|  The Band Aid Story|          8.1|  tvMovie|      0| 001:35|
|tt0440084|A Beachcombers Ch...|          7.0|  tvM

**Ensure there are no duplicates.**

In [None]:
titles_final.createOrReplaceTempView('titles_final')
spark.sql('select count(*) from titles_final limit 5').show()

+--------+
|count(1)|
+--------+
|  649067|
+--------+



In [None]:
spark.sql('select count(distinct tconst) from titles_final').show()

+----------------------+
|count(DISTINCT tconst)|
+----------------------+
|                649067|
+----------------------+



In [None]:
spark.sql('select count(distinct tconst) from rated_titles').show()

+----------------------+
|count(DISTINCT tconst)|
+----------------------+
|                649067|
+----------------------+



In [None]:
spark.sql('select runtimeminutes, concat(cast(runtimeminutes/60 as int),":",cast(runtimeminutes%60 as int)) from titles limit 5').show()


+--------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|runtimeminutes|concat(CAST(CAST((CAST(runtimeminutes AS DOUBLE) / CAST(60 AS DOUBLE)) AS INT) AS STRING), :, CAST(CAST((CAST(runtimeminutes AS DOUBLE) % CAST(60 AS DOUBLE)) AS INT) AS STRING))|
+--------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|         105.0|                                                                                                                                                                             1:45|
|          50.0|                                                                                                                                                                             0:50|
|          95.0|         

In [None]:
spark.sql('select * from principals limit 5').show()

+---------+--------+---------+--------+----+--------------------+
|   tconst|ordering|   nconst|category| job|          characters|
+---------+--------+---------+--------+----+--------------------+
|tt9779068|       5|nm0489214|director|null|                null|
|tt0449029|       1|nm0005516|   actor|null|"[""Gregory Fletc...|
|tt0452786|       1|nm0766100|    self|null|        "[""Self""]"|
|tt0940890|       2|nm0322977|   actor|null|     "[""Nebojsa""]"|
|tt5345728|       8|nm7855195|composer|null|                null|
+---------+--------+---------+--------+----+--------------------+



In [None]:
characters = spark.sql('''select characters, tconst, nconst from principals where category in ("actor", "actress", "self") and characters IS NOT NULL''')

In [None]:
# Display the characters and their corresponding movies and actors
characters_final = characters.dropDuplicates()
characters_final.show()

+--------------------+----------+----------+
|          characters|    tconst|    nconst|
+--------------------+----------+----------+
| "[""Pomegranate""]"| tt1725077| nm4086624|
|      "[""Fraile""]"| tt0446914| nm0781103|
|     "[""Penalty""]"| tt8975184| nm3067739|
|"[""Self - Contes...|tt13984052|nm12242252|
| "[""Bimbo Coles""]"| tt0451016| nm1871139|
|        "[""Self""]"| tt0443374| nm2092808|
|        "[""Self""]"|tt12252890| nm1432434|
|       "[""Angel""]"| tt2034049| nm0000327|
|         "[""Zac""]"| tt6072502| nm8872970|
|   "[""Lady Gaga""]"| tt6743882| nm3078932|
|"[""Shalini (Boss...| tt9547758| nm1427076|
|  "[""Candy Kiss""]"|tt10743654| nm6629894|
|"[""Duke Sagrado""]"| tt1781844| nm0320282|
|"[""Martin Solvei...|tt10479256| nm1528519|
|"[""Self - Dancer...| tt2048152| nm2612563|
|"[""Robert Braula...| tt6607896| nm8825458|
| "[""Pvt. Cooper""]"| tt0280609| nm0571727|
|   "[""Young boy""]"| tt8582012| nm7640138|
|        "[""Self""]"| tt1797546| nm0776441|
|        "

In [None]:
spark.sql('select count(*) from uchar').show()

+--------+
|count(1)|
+--------+
| 2695329|
+--------+



In [None]:
spark.sql('select count(distinct characters) from uchar').show()

+--------------------------+
|count(DISTINCT characters)|
+--------------------------+
|                    688274|
+--------------------------+



In [None]:
characters.createor
spark.sql('select * from characters_final')

In [None]:
spark.sql('select * from filterNames limit 5').show()

+---------+---------------+---------+---------+--------------------+--------------------+---------+
|   nconst|    primaryname|birthyear|deathyear|   primaryprofession|      knownfortitles| knownFor|
+---------+---------------+---------+---------+--------------------+--------------------+---------+
|nm0458594|  Roger Kleiber|     null|     null|visual_effects,ed...|tt0405022,tt19579...|tt0497467|
|nm0458601|     Mark Kleid|     null|     null|               actor|tt0105041,tt01120...|tt0277615|
|nm0458616|   Mark Kleiman|     null|     null|editor,producer,m...|tt0196091,tt01256...|tt0972846|
|nm0458617|Michael Kleiman|     1969|     null|camera_department...|tt0913354,tt04311...|tt0431197|
|nm0458618|   Naum Kleiman|     1937|     null|director,actor,wr...|tt3066262,tt64875...|tt3066262|
+---------+---------------+---------+---------+--------------------+--------------------+---------+

