# Getting Started With Spark using Scala

In [1]:
val data = 1 to 30

// most RDD operations have identical or nearly identical syntax:
val xrangeRDD = sc.parallelize(data, 4)
val subRDD = xrangeRDD.map(x => x-1)
val filteredRDD = subRDD.filter(x => x<10)
filteredRDD.collect()
filteredRDD.count()

data = Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)
xrangeRDD = ParallelCollectionRDD[0] at parallelize at <console>:30
subRDD = MapPartitionsRDD[1] at map at <console>:31
filteredRDD = MapPartitionsRDD[2] at filter at <console>:32


10

In [2]:
print(sc.version)
sc

2.3.0

In [3]:
val test = sc.parallelize(1 to 50000,50)
//cache this data
test.cache

val t1 = System.nanoTime()
// first count will trigger evaluation of count *and* cache
test.count
val dt1 = (System.nanoTime() - t1).toDouble/1.0e9

val t2 = System.nanoTime()
// second count operates on cached data only
test.count
val dt2 = (System.nanoTime() - t2).toDouble/1.0e9

test = ParallelCollectionRDD[3] at parallelize at <console>:27
t1 = 454365386432198
dt1 = 0.305775544
t2 = 454365692225735
dt2 = 0.122984662


0.122984662

In [4]:
import org.apache.spark.sql.SparkSession
val spark = SparkSession
  .builder()
  .appName("Spark SQL basic example")
  .config("spark.some.config.option", "some-value")
  .getOrCreate()

// For implicit conversions like converting RDDs to DataFrames
import spark.implicits._

spark = org.apache.spark.sql.SparkSession@7189fa57


In [5]:
//import spark.implicits._ 
//used for $ notation

val df = spark.read.json("people.json")
df.show
df.printSchema

// Register the DataFrame as a SQL temporary view
df.createTempView("people")

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



df = [age: bigint, name: string]


[age: bigint, name: string]

In [6]:
//neeed to register dataframe to use (hive like) sql
println("Query 1: select statements")
df.select("name").show
spark.sql("SELECT name FROM people").show

Query 1: select statements
+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [7]:
println("Query 2: filter statements")
df.filter(df("age") > 21).show
df.filter($"age" > 21).show
spark.sql("SELECT age, name FROM people WHERE age > 21").show

Query 2: filter statements
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [8]:
println("Query 3: group by statements")
df.groupBy("age").count().show
spark.sql("SELECT age, COUNT(age) as count FROM people GROUP BY age").show

Query 3: group by statements
+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    0|
|  30|    1|
+----+-----+

