In [1]:
# Similar to SparkContext, for SparkSQL you need a SparkSession
from pyspark.sql import SparkSession
# The column functions used in expressions (avg, count, udf, ...) must be imported too; select, where and groupBy are DataFrame methods
from pyspark.sql.functions import *

In [2]:
# Obtain the SparkSession: reuse the active one if it exists, otherwise create it
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Load the ratings CSV into a DataFrame, using the first line as column names
ratings_df = spark.read.option("header", True).csv("/FileStore/tables/movielens/ratings.csv")

### DataFrames Operations

In this part you will learn how to use the SQL capabilities of DataFrames programmatically. For the full API documentation, see: https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql

In [5]:
# select() returns a new DataFrame containing only the requested columns
display(ratings_df.select('movieId', 'rating'))

In [6]:
# Print the schema; note that the rating column is reported as a string
ratings_df.printSchema()

In [7]:
# You can change the data type of any column by casting it to the type you need.
# First import the target data type from pyspark.sql.types
from pyspark.sql.types import DoubleType, IntegerType
# MovieLens ratings.csv stores half-star ratings (0.5 ... 5.0), so cast to a floating-point type:
# casting to IntegerType would truncate 3.5 to 3 and skew every average computed later.
# withColumn() replaces the existing 'rating' column with the cast one (note the square bracket column access)
ratings_df = ratings_df.withColumn("rating", ratings_df['rating'].cast(DoubleType()))
# Take a look at the schema now: 'rating' should be reported as double
ratings_df.select(['movieId','rating']).printSchema()

In [8]:
# filter() keeps only the rows that satisfy a condition (same idea as filter on an RDD)
# Here it counts the rows whose rating is missing
ratings_df.filter(ratings_df['rating'].isNull()).count()

In [9]:
# where() is the SQL-flavoured alias of filter(); the condition can also be given as a SQL expression
ratings_df.where("rating IS NULL").count()

### Group By
The GROUP BY statement is used with **aggregate functions (COUNT, MAX, MIN, SUM, AVG)** to group the result-set by one or more columns.

In [11]:
# Per-movie summary: group the ratings by movieId, then compute the mean rating and the number of ratings
display(
    ratings_df
    .groupBy('movieId')
    .agg(
        avg('rating').alias('avg_rating'),
        count('rating').alias('reviews'),
    )
)

In [12]:
# Top 10 movies by average rating, restricted to movies that have been reviewed at least 50 times
ratings_sum_df = ratings_df.groupBy('movieId').agg(avg('rating').alias('avg_rating'), count('rating').alias('reviews'))
# >= (not >) so that a movie with exactly 50 reviews counts as "at least 50"
display(ratings_sum_df.filter(ratings_sum_df.reviews >= 50).sort('avg_rating', ascending=False).limit(10))

### User Defined Functions (UDF)
Similar to custom functions for Map, you can write a user-defined function (UDF) to transform one or more columns.
More about UDF on https://docs.databricks.com/spark/latest/spark-sql/udf-in-python.html

In [14]:
# Using UDF is a three step process. Before anything you will need to import the udf library
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [15]:
# A function simple enough to fit in a lambda can be defined and registered as a UDF in a single call.
# This one answers 'yes' or 'no' to "should I watch it?" depending on whether the average rating exceeds 3.5
watchable_udf = udf(
    f=lambda avg_rating: 'yes' if avg_rating > 3.5 else 'no',
    returnType=StringType(),
)

In [16]:
# Otherwise you can first write your function
# as you can see here we have more flexibility
# I will write the function to also incorporate the total number of reviews
def watchable_udf(avg_rating, reviews):
  """Decide whether a movie is worth watching.

  Args:
    avg_rating: average rating of the movie, or None when it is missing.
    reviews: number of reviews behind that average, or None when it is missing.

  Returns:
    'yes' when the average is above 3.5 and there are at least 50 reviews,
    'maybe' when the average is above 3.5 with fewer than 50 reviews,
    'no' otherwise, and None when either input is missing.
  """
  # Spark passes SQL NULL to a Python UDF as None, and comparing None with a
  # number raises TypeError, so pass the missing value through instead.
  if avg_rating is None or reviews is None:
    return None
  if avg_rating > 3.5:
    # >= 50 closes the gap between "> 50" and "< 50": a well-rated movie with
    # exactly 50 reviews used to fall through to 'no'.
    return 'yes' if reviews >= 50 else 'maybe'
  return 'no'
# Finally, wrap the Python function as a Spark UDF, declaring that it returns a string
watchable_udf = udf(f=watchable_udf, returnType=StringType())

In [17]:
# withColumn() evaluates the UDF for every row and stores the result in a new 'watchable' column
ratings_sum_df = ratings_sum_df.withColumn(
    'watchable',
    watchable_udf(ratings_sum_df['avg_rating'], ratings_sum_df['reviews']),
)

In [18]:
# Show the per-movie summary, now including the 'watchable' column
display(ratings_sum_df)

### Joins
A JOIN clause is used to combine rows from two or more tables, based on a related column between them. Here are a few basic types of joins explained:

* (INNER) JOIN: Returns records that have matching values in both tables
* LEFT (OUTER) JOIN: Returns all records from the left table, and the matched records from the right table
* RIGHT (OUTER) JOIN: Returns all records from the right table, and the matched records from the left table
* FULL (OUTER) JOIN: Returns all records when there is a match in either the left or the right table

Spark supports more than just the basic joins, however. With the latest Spark you get: inner, cross, outer, full, full_outer, left, left_outer, right, right_outer, left_semi, and left_anti joins! Take a look at https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#join for more details.

In [20]:
# Load the movies CSV (first line as column names) to make sense of the movieIds in the previous results
movies_df = spark.read.option("header", True).csv("/FileStore/tables/movielens/movies.csv")
display(movies_df)

In [21]:
# Inner join to attach the movie information (e.g. title) to each movie's rating summary.
# Joining on the column *name* keeps a single movieId column in the result; joining on an
# == expression keeps both copies, and any later reference to movieId fails as ambiguous.
movie_ratings_sum_df = ratings_sum_df.join(movies_df, on='movieId', how='inner')

In [22]:
# Show the joined result, keeping only the columns of interest
display(movie_ratings_sum_df.select('title', 'avg_rating', 'reviews', 'watchable'))

### Challenge: Can you create a table of the highest rated movie per category?