# Testing Spark with Scala kernel

In [1]:
// Distribute the integers 1 to 99 (the upper bound 100 is exclusive) across 2 partitions.
val rdd = sc.parallelize(Array.range(1, 100), numSlices = 2)

In [2]:
// Gather every element of the RDD into a local Array so the notebook can display it.
rdd.collect()

Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99)

In [3]:
// Number of elements held by the RDD.
rdd.count()

99

In [4]:
// Double every element, then add all the doubled values together.
val a = rdd.map(_ * 2).reduce((left, right) => left + right)

In [5]:
// Print the value of `a` (the sum of the doubled RDD elements).
println(a)

9900


In [6]:
import org.apache.spark._
import org.apache.spark.graphx._
// To make some of the examples work we will also need RDD
import org.apache.spark.rdd.RDD

In [7]:
// Common base type that lets one graph hold several kinds of vertex payloads.
class VertexProperty
// Payload for a user vertex: just the user's name.
case class UserProperty(name: String) extends VertexProperty
// Payload for a product vertex: its name and its price.
case class ProductProperty(name: String, price: Double) extends VertexProperty
// Illustrates the type such a mixed graph would have; it is only initialised
// to null here as a placeholder.
var graph: Graph[VertexProperty, String] = null

In [8]:
// Vertices: one (vertex id, (user name, position)) entry per user
val users: RDD[(VertexId, (String, String))] =
  sc.parallelize(Seq(
    (3L, ("rxin", "student")),
    (7L, ("jgonzal", "postdoc")),
    (5L, ("franklin", "prof")),
    (2L, ("istoica", "prof"))
  ))
// Edges: Edge(source id, destination id, relationship label)
val relationships: RDD[Edge[String]] =
  sc.parallelize(Seq(
    Edge(3L, 7L, "collab"),
    Edge(5L, 3L, "advisor"),
    Edge(2L, 5L, "colleague"),
    Edge(5L, 7L, "pi")
  ))
// Attribute given to any vertex that is referenced by an edge but absent from `users`
val defaultUser = ("John Doe", "Missing")
// Assemble the property graph from the vertices, the edges and the fallback attribute
val graph = Graph(users, relationships, defaultUser)

In [9]:
// `graph` is the Graph[(String, String), String] constructed in the cell above.
// Number of users whose position is "postdoc".
// NOTE(review): this is not the last expression of the cell, so its value does not
// appear to be displayed (the recorded output shows a single number) - confirm.
graph.vertices.filter { case (_, (_, pos)) => pos == "postdoc" }.count()
// Number of edges whose source id is greater than their destination id.
graph.edges.filter(edge => edge.srcId > edge.dstId).count()

1

In [10]:
// Same src > dst edge count, this time destructuring each Edge with a pattern match.
graph.edges.filter { case Edge(srcId, dstId, _) => srcId > dstId }.count()

1

In [11]:
// Render every triplet as "<source name> is the <relationship> of <destination name>".
val facts: RDD[String] = graph.triplets.map { triplet =>
  s"${triplet.srcAttr._1} is the ${triplet.attr} of ${triplet.dstAttr._1}"
}
// Collect the sentences and print them one per line.
facts.collect().foreach(println)

istoica is the colleague of franklin
rxin is the collab of jgonzal
franklin is the advisor of rxin
franklin is the pi of jgonzal


In [12]:
// Vertices: one (vertex id, (user name, position)) entry per user
val users: RDD[(VertexId, (String, String))] =
  sc.parallelize(Seq(
    (3L, ("rxin", "student")),
    (7L, ("jgonzal", "postdoc")),
    (5L, ("franklin", "prof")),
    (2L, ("istoica", "prof")),
    (4L, ("peter", "student"))
  ))
// Edges: Edge(source id, destination id, relationship label).
// Two of them point at vertex 0, which has no entry in `users`.
val relationships: RDD[Edge[String]] =
  sc.parallelize(Seq(
    Edge(3L, 7L, "collab"),
    Edge(5L, 3L, "advisor"),
    Edge(2L, 5L, "colleague"),
    Edge(5L, 7L, "pi"),
    Edge(4L, 0L, "student"),
    Edge(5L, 0L, "colleague")
  ))
// Attribute given to any vertex that is referenced by an edge but absent from `users`
val defaultUser = ("John Doe", "Missing")
// Assemble the property graph from the vertices, the edges and the fallback attribute
val graph = Graph(users, relationships, defaultUser)
// Notice that there is a user 0 (for which we have no information) connected to users
// 4 (peter) and 5 (franklin).
graph.triplets
  .map(triplet => s"${triplet.srcAttr._1} is the ${triplet.attr} of ${triplet.dstAttr._1}")
  .collect()
  .foreach(println)
// Remove the missing vertices as well as the edges connected to them
val validGraph = graph.subgraph(vpred = (_, attr) => attr._2 != "Missing")
// The valid subgraph will disconnect users 4 and 5 by removing user 0
validGraph.vertices.collect().foreach(println)
validGraph.triplets
  .map(triplet => s"${triplet.srcAttr._1} is the ${triplet.attr} of ${triplet.dstAttr._1}")
  .collect()
  .foreach(println)

istoica is the colleague of franklin
rxin is the collab of jgonzal
peter is the student of John Doe
franklin is the colleague of John Doe
franklin is the advisor of rxin
franklin is the pi of jgonzal
(4,(peter,student))
(3,(rxin,student))
(7,(jgonzal,postdoc))
(5,(franklin,prof))
(2,(istoica,prof))
istoica is the colleague of franklin
rxin is the collab of jgonzal
franklin is the advisor of rxin
franklin is the pi of jgonzal


In [13]:
// Run connected components on the full graph (user 0 included).
// NOTE(review): the resulting vertex attributes are presumably component ids rather than
// the (name, position) pairs, so the "Missing" marker can no longer be tested on ccGraph
// itself - confirm against the GraphX documentation.
val ccGraph = graph.connectedComponents()
// Rebuild the subgraph without the missing vertices and the edges connected to them
val validGraph = graph.subgraph(vpred = (_, attr) => attr._2 != "Missing")
// Restrict the connected-components answer to the valid subgraph
val validCCGraph = ccGraph.mask(validGraph)

In [14]:
// Import random graph generation library
import org.apache.spark.graphx.util.GraphGenerators
// Create a graph with "age" as the vertex property. Here we use a random graph for
// simplicity; each vertex's age is just its vertex id converted to Double.
val graph: Graph[Double, Int] =
  GraphGenerators.logNormalGraph(sc, numVertices = 100).mapVertices((id, _) => id.toDouble)
// For each vertex, compute (number of older followers, total age of those followers)
val olderFollowers: VertexRDD[(Int, Double)] = graph.aggregateMessages[(Int, Double)](
  triplet => { // Map Function
    if (triplet.srcAttr > triplet.dstAttr) {
      // The source is older than the destination: send (counter, age) to the destination.
      // The message is built as an explicit tuple rather than relying on the compiler
      // adapting a two-argument call into a single tuple argument.
      triplet.sendToDst((1, triplet.srcAttr))
    }
  },
  // Reduce Function: add the counters and add the ages
  (a, b) => (a._1 + b._1, a._2 + b._2)
)
// Divide total age by number of older followers to get average age of older followers
val avgAgeOfOlderFollowers: VertexRDD[Double] =
  olderFollowers.mapValues((id, value) =>
    value match { case (count, totalAge) => totalAge / count })
// Display the results as (vertex id, average age) pairs
avgAgeOfOlderFollowers.collect().foreach(println(_))

(19,56.52173913043478)
(39,62.888888888888886)
(34,70.07142857142857)
(4,48.5)
(71,87.44444444444444)
(66,79.0)
(80,84.2)
(65,78.5)
(11,66.17647058823529)
(14,70.5)
(35,58.1764705882353)
(24,61.333333333333336)
(37,72.46153846153847)
(1,48.5)
(74,80.25)
(63,80.625)
(89,90.0)
(17,57.44444444444444)
(18,64.05555555555556)
(12,54.7)
(38,59.111111111111114)
(20,60.72727272727273)
(90,98.0)
(94,98.0)
(41,68.54545454545455)
(21,49.0)
(77,83.5)
(53,73.6923076923077)
(22,60.6)
(25,61.857142857142854)
(46,75.3)
(59,78.84615384615384)
(62,92.14285714285714)
(93,97.0)
(33,64.0)
(23,59.714285714285715)
(40,65.375)
(6,60.3)
(67,73.0)
(69,86.0)
(3,45.4)
(85,86.33333333333333)
(58,72.4)
(60,81.72727272727273)
(86,96.0)
(91,94.0)
(31,66.72222222222223)
(26,64.6923076923077)
(5,53.625)
(2,58.65)
(13,50.92307692307692)
(96,98.0)
(52,79.0)
(81,88.75)
(16,47.5)
(55,79.25)
(82,86.0)
(28,73.91666666666667)
(29,56.529411764705884)
(79,88.75)
(54,76.0)
(30,60.2)
(50,69.54545454545455)
(36,70.125)
(92,99.0)
(6

In [None]:
%AddDeps com.databricks spark-csv_2.10 1.3.0 --transitive

:: loading settings :: url = jar:file:/opt/spark-1.5.2-bin-hadoop2.6/lib/spark-assembly-1.5.2-hadoop2.6.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
:: resolving dependencies :: com.ibm.spark#spark-kernel;working
	confs: [default]
	found com.databricks#spark-csv_2.10;1.3.0 in central
	found org.apache.commons#commons-csv;1.1 in central
	found com.univocity#univocity-parsers;1.5.1 in central
	found org.scoverage#scalac-scoverage-runtime_2.10;1.1.0 in central
	found org.scoverage#scalac-scoverage-plugin_2.10;1.1.0 in central
	found org.apache.spark#spark-core_2.10;1.5.0 in central
	found com.google.guava#guava;14.0.1 in central
	found com.google.code.findbugs#jsr305;1.3.9 in central
	found javax.inject#javax.inject;1 in central
	found org.apache.avro#avro-mapred;1.7.7 in central
	found org.apache.avro#avro-ipc;1.7.7 in central
	found org.apache.avro#avro;1.7.7 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
	found org.codehaus.jackson#jackson-mapper-asl;1

In [None]:
import org.apache.spark.sql.SQLContext
// SQL entry point built on top of the notebook's SparkContext.
val sqlContext = new SQLContext(sc)
// Load bank.csv through the spark-csv data source: the first line is a header,
// fields are separated by ';', and column types are inferred from the data.
val bankScala = {
  val csvReader = sqlContext.read.format("com.databricks.spark.csv")
  val configuredReader = csvReader
    .option("header", "true")
    .option("delimiter", ";")
    .option("inferSchema", "true")
  configuredReader.load("/vagrant/zeppelin_notebooks/bank.csv")
}

In [None]:
// Display rows of the DataFrame loaded from bank.csv in the cell output.
bankScala.show()

In [None]:
// Print the schema (column names and types) of the DataFrame loaded from bank.csv.
bankScala.printSchema()

In [None]:
// Compute summary statistics for the DataFrame and display them.
// `show()` keeps its parentheses because it is side-effecting, matching the
// `show()` / `printSchema()` calls in the neighbouring cells.
bankScala.describe().show()