Add headers for NamedRddSupport and JobServerNamedRdds

ooyala · Dec 16, 2013 · f564dab · f564dab
1 parent 231f771
commit f564dab
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 0 deletions.
diff --git a/jobserver/src/main/scala/spark.jobserver/JobServerNamedRdds.scala b/jobserver/src/main/scala/spark.jobserver/JobServerNamedRdds.scala
@@ -6,6 +6,12 @@ import akka.util.Timeout
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 
+/**
+ * An implementation of [[NamedRddSupport]] API for the Job Server.
+ * Note that this contains code that executes on the same thread as the job.
+ * Another part of this system is the rddManager, which is an actor which manages the concurrent
+ * update of shared RDD state.
+ */
 class JobServerNamedRdds(val rddManager: ActorRef) extends NamedRdds {
   import RddManagerActorMessages._
 

diff --git a/jobserver/src/main/scala/spark.jobserver/NamedRddSupport.scala b/jobserver/src/main/scala/spark.jobserver/NamedRddSupport.scala
@@ -6,6 +6,16 @@ import java.util.concurrent.atomic.AtomicReference
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 
+/**
+ * NamedRdds - a trait that gives you safe, concurrent creation and access to named RDDs
+ * (the native SparkContext interface only has access to RDDs by numbers).
+ * It facilitates easy sharing of RDDs amongst jobs sharing the same SparkContext.
+ * If two jobs simultaneously tries to create an RDD with the same name,
+ * only one will win and the other will retrieve the same one.
+ *
+ * Note that to take advantage of NamedRddSupport, a job must mix this in and use the APIs here instead of
+ * the native RDD `cache()`, otherwise we will not know about the names.
+ */
 trait NamedRdds {
   // Default timeout is 60 seconds. Hopefully that is enough to let most RDD generator functions finish.
   val defaultTimeout = akka.util.Timeout(akka.util.Duration(60, java.util.concurrent.TimeUnit.SECONDS))