In [None]:
# This notebook provides basic examples of interacting with Spark through the pyspark API
# If it is run with the corresponding Docker compose file then the Web UI at http://localhost:4040
# shows details of the jobs executed on the cluster/local machine.
#
# The notebook uses the latest data from IMDB to explore and run machine learning with Spark.

In [None]:
# Fetch and prepare data
import gzip
from pathlib import Path
import tempfile
import requests


def download(url:str, destination: Path, decompress=True, force:bool=False, timeout:float=60) -> Path:
    """Download ``url`` to ``destination`` unless the file is already present.

    Args:
        url: Address of the resource to fetch.
        destination: Local file the (optionally decompressed) payload is written to.
        decompress: When true, treat the response body as gzip data and store
            the decompressed bytes; otherwise store the body unchanged.
        force: Re-download even if ``destination`` already exists.
        timeout: Value passed to ``requests.get`` as its ``timeout`` (seconds).

    Returns:
        ``destination``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        OSError: If ``decompress`` is true and the body is not valid gzip data.
    """
    if destination.exists() and not force:
        return destination

    resp = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of caching an error page as if it were data.
    resp.raise_for_status()
    # Build the payload before touching the filesystem, so a decompression
    # error cannot leave an empty file behind.
    payload = gzip.decompress(resp.content) if decompress else resp.content

    # Write to a sibling file and rename it into place: an interrupted write
    # never produces a truncated `destination` that later calls would treat
    # as a completed download.
    partial = destination.with_name(destination.name + '.part')
    with open(partial, 'wb') as file_handle:
        file_handle.write(payload)
    partial.replace(destination)

    return destination
   
MOVIE_TITLES_URL = 'https://datasets.imdbws.com/title.basics.tsv.gz'
MOVIE_RATINGS_URL = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
# `tempfile.tempdir` is None until gettempdir() has been called, so reading the
# attribute directly can raise TypeError on a fresh kernel; gettempdir() always
# returns the resolved directory.
DATA_DIR = Path(tempfile.gettempdir()) / 'spark-example-data'

# Create the cache directory (and any missing parents) before downloading into it.
DATA_DIR.mkdir(parents=True, exist_ok=True)
movie_titles_tsv = download(MOVIE_TITLES_URL, DATA_DIR / 'movie-titles.tsv')
movie_ratings_tsv = download(MOVIE_RATINGS_URL, DATA_DIR / 'movie-ratings.tsv')

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session, using the "local" master
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import BooleanType, StructType, StructField, IntegerType, StringType

# Read the ratings TSV into a Spark DataFrame; column types are inferred from the data
df_ratings = spark.read.csv(
    str(movie_ratings_tsv),
    sep="\t",
    header=True,
    inferSchema=True,
)
#df_ratings.printSchema()
# Show basic summary statistics for the rating column
df_ratings.describe(["averageRating"]).show()

# Define the schema for the title.basics file.
# Every field is nullable: the IMDb datasets mark a missing value with the
# literal `\N`, which is declared to the reader through `nullValue` below.
titles_schema = StructType([
    StructField("tconst", StringType(), True),
    StructField("titleType", StringType(), True),
    StructField("primaryTitle", StringType(), True),
    StructField("originalTitle", StringType(), True),
    # Stored as 0/1 in the file, which the CSV reader does not parse as a
    # boolean, so it is read as an integer.
    StructField("isAdult", IntegerType(), True),
    StructField("startYear", IntegerType(), True),
    StructField("endYear", IntegerType(), True),
    StructField("runtimeMinutes", IntegerType(), True),
    StructField("genres", StringType(), True)
])

# Without `nullValue`, every `\N` in an integer column counts as a malformed
# record and DROPMALFORMED silently discards the row -- but only when that
# column is actually needed by the query, so the set of rows would change
# depending on which columns are selected later. Reading `\N` as null and
# filtering explicitly keeps the row set the same for every downstream query.
df_titles = (
    spark.read.csv(
        str(movie_titles_tsv),
        header=True,
        schema=titles_schema,
        sep="\t",
        nullValue="\\N",
        mode="DROPMALFORMED",
    )
    .filter("titleType = 'movie' AND startYear > 2019 AND startYear < 2024")
    # Keep only titles with a known runtime (previously an implicit side
    # effect of DROPMALFORMED on this column).
    .filter("runtimeMinutes IS NOT NULL")
    .drop("endYear", "isAdult", "genres")
)
#df_titles.printSchema()
df_titles.show()

In [None]:
# The DataFrame API exposes joins and column selection directly
df_rated_titles = (
    df_titles
    .join(df_ratings, df_titles.tconst == df_ratings.tconst)
    .drop("tconst")
)
(
    df_rated_titles
    .select(["primaryTitle", "averageRating"])
    .sort("averageRating", ascending=False)
    .show()
)

In [None]:
# Register each DataFrame as a temporary view so it can be queried with raw SQL
for view_name, frame in (('MovieTitles', df_titles), ('MovieRatings', df_ratings)):
    frame.createOrReplaceTempView(view_name)

top_rated_query = """SELECT primaryTitle, averageRating
FROM MovieTitles
INNER JOIN MovieRatings
    ON MovieTitles.tconst = MovieRatings.tconst
ORDER BY averageRating DESC"""
spark.sql(top_rated_query).show()


In [None]:
# Any correlations in the data?
import pandas as pd

# Fixed seed so re-running the notebook draws the same sample and therefore
# produces the same plot (for the same input data and partitioning).
SAMPLE_SEED = 42

# Columns whose Spark dtype is int or double are the ones plotted.
numeric_features = [name for name, dtype in df_rated_titles.dtypes if dtype in ('int', 'double')]
sampled_data = (
    df_rated_titles
    .select(numeric_features)
    .sample(withReplacement=False, fraction=0.8, seed=SAMPLE_SEED)
    .toPandas()
)
axs = pd.plotting.scatter_matrix(sampled_data, figsize=(10, 10))
n = len(sampled_data.columns)
for i in range(n):
    # First column of the grid: horizontal, right-aligned y labels, no ticks.
    v = axs[i, 0]
    v.yaxis.label.set_rotation(0)
    v.yaxis.label.set_ha('right')
    v.set_yticks(())
    # Bottom row of the grid: vertical x labels, no ticks.
    h = axs[n-1, i]
    h.xaxis.label.set_rotation(90)
    h.set_xticks(())

In [None]:
# Determine correlation between independent variables
# Column types come from the DataFrame schema rather than from the first row's
# value: this avoids one Spark job per column, works on an empty frame, and is
# not fooled by a string column whose first value happens to be null.
NUMERIC_DTYPES = ('tinyint', 'smallint', 'int', 'bigint', 'float', 'double')
for i, dtype in df_rated_titles.dtypes:
    if dtype in NUMERIC_DTYPES:
        print( "Correlation to averageRating for ", i, df_rated_titles.stat.corr('averageRating',i))