In [1]:
// Load the movies CSV from HDFS into a DataFrame.
// - header      : the first line of the file supplies the column names
// - inferSchema : column types are inferred from the data
// - sep         : field separator. The key used previously ("delimitter") was
//                 misspelled and therefore was not the reader's separator option;
//                 the load only worked because the default separator is a comma.
val movieDf = spark.read.format("csv")
                    .option("header", true)
                    .option("inferSchema", true)
                    .option("sep", ",")
                    .load("hdfs://localhost:9000/ml-latest-small/movies.csv")

// Print the inferred schema and preview the first two rows.
movieDf.printSchema()
movieDf.show(2)

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.174.129:4041
SparkContext available as 'sc' (version = 3.1.3, master = local[*], app id = local-1652301351056)
SparkSession available as 'spark'


root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
|      2|  Jumanji (1995)|Adventure|Childre...|
+-------+----------------+--------------------+
only showing top 2 rows



movieDf: org.apache.spark.sql.DataFrame = [movieId: int, title: string ... 1 more field]


In [2]:
// create schema
import org.apache.spark.sql.types.{StringType, StructType, DoubleType, IntegerType, LongType, StructField}

// Explicit schema for the ratings data: four nullable columns, added in the
// order userId, movieId, rating, timestamp.
val RatingSchema = new StructType()
                    .add("userId", IntegerType, nullable = true)
                    .add("movieId", IntegerType, nullable = true)
                    .add("rating", DoubleType, nullable = true)
                    .add("timestamp", LongType, nullable = true)

import org.apache.spark.sql.types.{StringType, StructType, DoubleType, IntegerType, LongType, StructField}
RatingSchema: org.apache.spark.sql.types.StructType = StructType(StructField(userId,IntegerType,true), StructField(movieId,IntegerType,true), StructField(rating,DoubleType,true), StructField(timestamp,LongType,true))


In [3]:
// Load the ratings CSV from HDFS using the explicit RatingSchema instead of
// schema inference.
// - header : the first line of the file is a header row
// - sep    : field separator. The key used previously ("delimitter") was
//            misspelled and therefore was not the reader's separator option;
//            the load only worked because the default separator is a comma.
val ratingDf = spark.read.format("csv")
                    .option("header", true)
                    .option("sep", ",")
                    .schema(RatingSchema)
                    .load("hdfs://localhost:9000/ml-latest-small/ratings.csv")

// Print the applied schema and preview the first two rows.
ratingDf.printSchema()
ratingDf.show(2)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: long (nullable = true)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows



ratingDf: org.apache.spark.sql.DataFrame = [userId: int, movieId: int ... 2 more fields]


In [4]:
import org.apache.spark.sql.functions.{col}

// Column access, variant 1: look the column up on the DataFrame itself
// with ratingDf("rating"), then keep the rows whose rating exceeds 1.0.
val df = ratingDf.where(ratingDf("rating") > 1.0)
df.count()

import org.apache.spark.sql.functions.col
df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 2 more fields]
res2: Long = 96655


In [5]:
// Column access, variant 2: build the column reference with the col(...)
// function, then keep the rows whose rating exceeds 1.0.
val df = ratingDf.where(col("rating") > 1.0)
df.count()

df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 2 more fields]
res3: Long = 96655


In [6]:
// Column access, variant 3: the $"..." string interpolator,
// then keep the rows whose rating exceeds 1.0.
val df = ratingDf.where($"rating" > 1.0)
df.count()

df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 2 more fields]
res4: Long = 96655


In [8]:
// Column access, variant 4: a Scala symbol literal ('rating),
// then keep the rows whose rating exceeds 1.0.
val df = ratingDf.where('rating > 1.0)
df.count()

df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 2 more fields]
res5: Long = 96655


In [11]:
import org.apache.spark.sql.functions.{count, avg, desc}

// Count the userId entries for each movieId and order the result so that
// the movies with the most ratings come first.
val popularDf = ratingDf
  .groupBy($"movieId")
  .agg(count($"userId").as("total_ratings"))
  .orderBy($"total_ratings".desc)

// Print the resulting schema and preview the two most-rated movies.
popularDf.printSchema()
popularDf.show(2)

root
 |-- movieId: integer (nullable = true)
 |-- total_ratings: long (nullable = false)

+-------+-------------+
|movieId|total_ratings|
+-------+-------------+
|    356|          329|
|    318|          317|
+-------+-------------+
only showing top 2 rows



import org.apache.spark.sql.functions.{count, avg, desc}
popularDf: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [movieId: int, total_ratings: bigint]
