In [1]:
// Load the movies CSV file from HDFS into a DataFrame.
//   header      - the first line of the file holds the column names
//   inferSchema - let Spark derive the column types from the data
//   delimiter   - field separator; the key must be spelled exactly "delimiter"
//                 (or "sep"), because the CSV reader ignores option keys it does
//                 not recognise and would silently fall back to the default ","
val movieDf = spark.read.format("csv")
                    .option("header", true)
                    .option("inferSchema", true)
                    .option("delimiter", ",")
                    .load("hdfs://localhost:9000/ml-latest-small/movies.csv")

// Print the resulting schema and preview the first two rows.
movieDf.printSchema()
movieDf.show(2)

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.174.129:4040
SparkContext available as 'sc' (version = 3.1.3, master = local[*], app id = local-1652365162677)
SparkSession available as 'spark'


root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
|      2|  Jumanji (1995)|Adventure|Childre...|
+-------+----------------+--------------------+
only showing top 2 rows



movieDf: org.apache.spark.sql.DataFrame = [movieId: int, title: string ... 1 more field]


In [2]:
// create schema
import org.apache.spark.sql.types.{StringType, StructType, DoubleType, IntegerType, LongType, StructField}

// Explicit schema with four nullable columns, in this order:
// userId (integer), movieId (integer), rating (double), timestamp (long).
val RatingSchema = {
  val nullable = true
  new StructType()
    .add("userId", IntegerType, nullable)
    .add("movieId", IntegerType, nullable)
    .add("rating", DoubleType, nullable)
    .add("timestamp", LongType, nullable)
}

import org.apache.spark.sql.types.{StringType, StructType, DoubleType, IntegerType, LongType, StructField}
RatingSchema: org.apache.spark.sql.types.StructType = StructType(StructField(userId,IntegerType,true), StructField(movieId,IntegerType,true), StructField(rating,DoubleType,true), StructField(timestamp,LongType,true))


In [3]:
// Load the ratings CSV file from HDFS into a DataFrame, applying the explicit
// RatingSchema instead of asking Spark to infer the column types.
//   header    - the first line of the file holds the column names
//   delimiter - field separator; the key must be spelled exactly "delimiter"
//               (or "sep"), because the CSV reader ignores option keys it does
//               not recognise and would silently fall back to the default ","
val ratingDf = spark.read.format("csv")
                    .option("header", true)
                    .option("delimiter", ",")
                    .schema(RatingSchema)
                    .load("hdfs://localhost:9000/ml-latest-small/ratings.csv")

// Print the applied schema and preview the first two rows.
ratingDf.printSchema()
ratingDf.show(2)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: long (nullable = true)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows



ratingDf: org.apache.spark.sql.DataFrame = [userId: int, movieId: int ... 2 more fields]


In [4]:
// Spark SQL: a temp view (temp table) is created within the current Spark session.
// It exposes the DataFrame under a table name ("movies") so it can be queried with SQL.
// Spark creates no table space or row storage for the view; instead, the same
// DataFrame (and the RDD behind it) is what the SQL queries run against.
movieDf.createOrReplaceTempView("movies")

In [7]:
// The "movies" view can now be queried with SQL.
// Executing a SQL statement through spark.sql returns the result as a DataFrame.
val df = spark.sql("SELECT * FROM movies")
// Inspect the result: schema, first two rows, and the physical execution plan.
df.printSchema()
df.show(2)
df.explain()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
|      2|  Jumanji (1995)|Adventure|Childre...|
+-------+----------------+--------------------+
only showing top 2 rows

== Physical Plan ==
FileScan csv [movieId#16,title#17,genres#18] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[hdfs://localhost:9000/ml-latest-small/movies.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<movieId:int,title:string,genres:string>




df: org.apache.spark.sql.DataFrame = [movieId: int, title: string ... 1 more field]


In [8]:
// Expose the ratings DataFrame to Spark SQL under the view name "ratings".
ratingDf.createOrReplaceTempView("ratings")

In [9]:
// Query the "ratings" view with SQL; the result comes back as a DataFrame.
val df = spark.sql("SELECT * FROM ratings")
// Inspect the result: schema, first two rows, and the physical execution plan.
df.printSchema()
df.show(2)
df.explain()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: long (nullable = true)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows

== Physical Plan ==
FileScan csv [userId#38,movieId#39,rating#40,timestamp#41L] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[hdfs://localhost:9000/ml-latest-small/ratings.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<userId:int,movieId:int,rating:double,timestamp:bigint>




df: org.apache.spark.sql.DataFrame = [userId: int, movieId: int ... 2 more fields]


In [14]:
// DIY Spark SQL exercises:
//   1. select 5 records from movies, with the movie title converted to upper case
//   2. select the distinct ratings from ratings, sorted by rating
// NOTE: the first query has no ORDER BY, so which 5 rows LIMIT returns is not guaranteed.
val df2 = spark.sql("SELECT movieId, upper(title), genres from movies LIMIT 5")
df2.show()
// distinct ratings in ascending order (ORDER BY defaults to ASC)
val df3 = spark.sql("SELECT DISTINCT(rating) FROM ratings ORDER BY rating")
df3.show()

// distinct ratings in descending order
val df4 = spark.sql("SELECT DISTINCT(rating) FROM ratings ORDER BY rating DESC")
df4.show()

+-------+--------------------+--------------------+
|movieId|        upper(title)|              genres|
+-------+--------------------+--------------------+
|      1|    TOY STORY (1995)|Adventure|Animati...|
|      2|      JUMANJI (1995)|Adventure|Childre...|
|      3|GRUMPIER OLD MEN ...|      Comedy|Romance|
|      4|WAITING TO EXHALE...|Comedy|Drama|Romance|
|      5|FATHER OF THE BRI...|              Comedy|
+-------+--------------------+--------------------+

+------+
|rating|
+------+
|   0.5|
|   1.0|
|   1.5|
|   2.0|
|   2.5|
|   3.0|
|   3.5|
|   4.0|
|   4.5|
|   5.0|
+------+

+------+
|rating|
+------+
|   5.0|
|   4.5|
|   4.0|
|   3.5|
|   3.0|
|   2.5|
|   2.0|
|   1.5|
|   1.0|
|   0.5|
+------+



df2: org.apache.spark.sql.DataFrame = [movieId: int, upper(title): string ... 1 more field]
df3: org.apache.spark.sql.DataFrame = [rating: double]
df4: org.apache.spark.sql.DataFrame = [rating: double]
