Merge branch 'mesos'

2 parents df9ae8a + 548856a · commit 97e242067b9d75abd88543c759d8fc0aebd9eb8c · @haitaoyao committed Jan 24, 2013
Showing with 17,206 additions and 2,297 deletions.
  1. +2 −0 .gitignore
  2. +11 −0 bagel/pom.xml
  3. +2 −2 bagel/src/test/resources/log4j.properties
  4. +16 −1 core/pom.xml
  5. +32 −9 core/src/main/scala/spark/Accumulators.scala
  6. +0 −118 core/src/main/scala/spark/BoundedMemoryCache.scala
  7. +65 −0 core/src/main/scala/spark/CacheManager.scala
  8. +0 −238 core/src/main/scala/spark/CacheTracker.scala
  9. +0 −18 core/src/main/scala/spark/DaemonThreadFactory.scala
  10. +3 −5 core/src/main/scala/spark/HttpFileServer.scala
  11. +8 −1 core/src/main/scala/spark/HttpServer.scala
  12. +72 −138 core/src/main/scala/spark/KryoSerializer.scala
  13. +1 −2 core/src/main/scala/spark/Logging.scala
  14. +42 −18 core/src/main/scala/spark/MapOutputTracker.scala
  15. +55 −31 core/src/main/scala/spark/PairRDDFunctions.scala
  16. +15 −9 core/src/main/scala/spark/ParallelCollection.scala
  17. +4 −0 core/src/main/scala/spark/Partitioner.scala
  18. +159 −38 core/src/main/scala/spark/RDD.scala
  19. +105 −0 core/src/main/scala/spark/RDDCheckpointData.scala
  20. +7 −1 core/src/main/scala/spark/SequenceFileRDDFunctions.scala
  21. +10 −3 core/src/main/scala/spark/SizeEstimator.scala
  22. +100 −57 core/src/main/scala/spark/SparkContext.scala
  23. +23 −16 core/src/main/scala/spark/SparkEnv.scala
  24. +25 −0 core/src/main/scala/spark/SparkFiles.java
  25. +1 −2 core/src/main/scala/spark/TaskContext.scala
  26. +66 −59 core/src/main/scala/spark/Utils.scala
  27. +10 −0 core/src/main/scala/spark/api/java/JavaPairRDD.scala
  28. +33 −0 core/src/main/scala/spark/api/java/JavaRDDLike.scala
  29. +97 −8 core/src/main/scala/spark/api/java/JavaSparkContext.scala
  30. +11 −0 core/src/main/scala/spark/api/java/StorageLevels.java
  31. +48 −0 core/src/main/scala/spark/api/python/PythonPartitioner.scala
  32. +293 −0 core/src/main/scala/spark/api/python/PythonRDD.scala
  33. +1 −1 core/src/main/scala/spark/broadcast/Broadcast.scala
  34. +25 −1 core/src/main/scala/spark/broadcast/HttpBroadcast.scala
  35. +2 −2 core/src/main/scala/spark/deploy/DeployMessage.scala
  36. +2 −1 core/src/main/scala/spark/deploy/JobDescription.scala
  37. +78 −0 core/src/main/scala/spark/deploy/JsonProtocol.scala
  38. +1 −1 core/src/main/scala/spark/deploy/client/TestClient.scala
  39. +9 −6 core/src/main/scala/spark/deploy/master/Master.scala
  40. +42 −16 core/src/main/scala/spark/deploy/master/MasterWebUI.scala
  41. +5 −1 core/src/main/scala/spark/deploy/master/WorkerInfo.scala
  42. +7 −0 core/src/main/scala/spark/deploy/master/WorkerState.scala
  43. +0 −5 core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala
  44. +2 −2 core/src/main/scala/spark/deploy/worker/Worker.scala
  45. +19 −3 core/src/main/scala/spark/deploy/worker/WorkerArguments.scala
  46. +15 −4 core/src/main/scala/spark/deploy/worker/WorkerWebUI.scala
  47. +18 −16 core/src/main/scala/spark/executor/Executor.scala
  48. +0 −3 core/src/main/scala/spark/executor/StandaloneExecutorBackend.scala
  49. +5 −2 core/src/main/scala/spark/network/Connection.scala
  50. +8 −9 core/src/main/scala/spark/network/ConnectionManager.scala
  51. +16 −8 core/src/main/scala/spark/network/ConnectionManagerTest.scala
  52. +9 −11 core/src/main/scala/spark/rdd/BlockRDD.scala
  53. +36 −11 core/src/main/scala/spark/rdd/CartesianRDD.scala
  54. +128 −0 core/src/main/scala/spark/rdd/CheckpointRDD.scala
  55. +48 −22 core/src/main/scala/spark/rdd/CoGroupedRDD.scala
  56. +35 −12 core/src/main/scala/spark/rdd/CoalescedRDD.scala
  57. +11 −6 core/src/main/scala/spark/rdd/FilteredRDD.scala
  58. +5 −5 core/src/main/scala/spark/rdd/FlatMappedRDD.scala
  59. +7 −7 core/src/main/scala/spark/rdd/GlommedRDD.scala
  60. +8 −7 core/src/main/scala/spark/rdd/HadoopRDD.scala
  61. +8 −6 core/src/main/scala/spark/rdd/MapPartitionsRDD.scala
  62. +8 −6 core/src/main/scala/spark/rdd/MapPartitionsWithSplitRDD.scala
  63. +6 −5 core/src/main/scala/spark/rdd/MappedRDD.scala
  64. +8 −11 core/src/main/scala/spark/rdd/NewHadoopRDD.scala
  65. +9 −9 core/src/main/scala/spark/rdd/PipedRDD.scala
  66. +15 −14 core/src/main/scala/spark/rdd/SampledRDD.scala
  67. +14 −14 core/src/main/scala/spark/rdd/ShuffledRDD.scala
  68. +28 −17 core/src/main/scala/spark/rdd/UnionRDD.scala
  69. +36 −24 core/src/main/scala/spark/rdd/ZippedRDD.scala
  70. +67 −23 core/src/main/scala/spark/scheduler/DAGScheduler.scala
  71. +1 −1 core/src/main/scala/spark/scheduler/MapStatus.scala
  72. +94 −8 core/src/main/scala/spark/scheduler/ResultTask.scala
  73. +15 −9 core/src/main/scala/spark/scheduler/ShuffleMapTask.scala
  74. +18 −13 core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
  75. +2 −1 core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala
  76. +5 −1 core/src/main/scala/spark/scheduler/cluster/TaskSetManager.scala
  77. +25 −19 core/src/main/scala/spark/scheduler/local/LocalScheduler.scala
  78. +6 −10 core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala
  79. +3 −7 core/src/main/scala/spark/scheduler/mesos/MesosSchedulerBackend.scala
  80. +117 −114 core/src/main/scala/spark/storage/BlockManager.scala
  81. +70 −0 core/src/main/scala/spark/storage/BlockManagerId.scala
  82. +109 −618 core/src/main/scala/spark/storage/BlockManagerMaster.scala
  83. +401 −0 core/src/main/scala/spark/storage/BlockManagerMasterActor.scala
  84. +100 −0 core/src/main/scala/spark/storage/BlockManagerMessages.scala
  85. +16 −0 core/src/main/scala/spark/storage/BlockManagerSlaveActor.scala
  86. +1 −1 core/src/main/scala/spark/storage/BlockMessage.scala
  87. +6 −1 core/src/main/scala/spark/storage/BlockStore.scala
  88. +4 −1 core/src/main/scala/spark/storage/DiskStore.scala
  89. +3 −3 core/src/main/scala/spark/storage/MemoryStore.scala
  90. +60 −20 core/src/main/scala/spark/storage/StorageLevel.scala
  91. +9 −4 core/src/main/scala/spark/storage/ThreadingTest.scala
  92. +1 −0 core/src/main/scala/spark/util/AkkaUtils.scala
  93. +14 −0 core/src/main/scala/spark/util/IdGenerator.scala
  94. +44 −0 core/src/main/scala/spark/util/MetadataCleaner.scala
  95. +62 −0 core/src/main/scala/spark/util/RateLimitedOutputStream.scala
  96. +93 −0 core/src/main/scala/spark/util/TimeStampedHashMap.scala
  97. +69 −0 core/src/main/scala/spark/util/TimeStampedHashSet.scala
  98. +1 −0 core/src/main/twirl/spark/deploy/master/worker_row.scala.html
  99. +1 −0 core/src/main/twirl/spark/deploy/master/worker_table.scala.html
  100. +2 −2 core/src/test/resources/log4j.properties
  101. +0 −58 core/src/test/scala/spark/BoundedMemoryCacheSuite.scala
  102. +0 −131 core/src/test/scala/spark/CacheTrackerSuite.scala
  103. +357 −0 core/src/test/scala/spark/CheckpointSuite.scala
  104. +2 −0 core/src/test/scala/spark/ClosureCleanerSuite.scala
  105. +69 −0 core/src/test/scala/spark/DistributedSuite.scala
  106. +31 −0 core/src/test/scala/spark/DriverSuite.scala
  107. +8 −5 core/src/test/scala/spark/FileServerSuite.scala
  108. +98 −0 core/src/test/scala/spark/JavaAPISuite.java
  109. +46 −10 core/src/test/scala/spark/MapOutputTrackerSuite.scala
  110. +26 −0 core/src/test/scala/spark/PartitioningSuite.scala
  111. +51 −9 core/src/test/scala/spark/RDDSuite.scala
  112. +7 −0 core/src/test/scala/spark/ShuffleSuite.scala
  113. +26 −22 core/src/test/scala/spark/SizeEstimatorSuite.scala
  114. +42 −0 core/src/test/scala/spark/scheduler/TaskContextSuite.scala
  115. +123 −45 core/src/test/scala/spark/storage/BlockManagerSuite.scala
  116. +23 −0 core/src/test/scala/spark/util/RateLimitedOutputStreamSuite.scala
  117. +5 −3 docs/README.md
  118. +10 −1 docs/_layouts/global.html
  119. +19 −2 docs/_plugins/copy_api_dirs.rb
  120. +4 −2 docs/api.md
  121. +27 −0 docs/configuration.md
  122. +3 −1 docs/ec2-scripts.md
  123. +12 −5 docs/index.md
  124. +2 −1 docs/java-programming-guide.md
  125. +110 −0 docs/python-programming-guide.md
  126. +49 −1 docs/quick-start.md
  127. +2 −1 docs/scala-programming-guide.md
  128. +21 −22 docs/spark-standalone.md
  129. +313 −0 docs/streaming-programming-guide.md
  130. +16 −14 docs/tuning.md
  131. +28 −0 examples/pom.xml
  132. +1 −1 examples/src/main/scala/spark/examples/LocalLR.scala
  133. +20 −39 examples/src/main/scala/spark/examples/SparkALS.scala
  134. +43 −0 examples/src/main/scala/spark/streaming/examples/FlumeEventCount.scala
  135. +36 −0 examples/src/main/scala/spark/streaming/examples/HdfsWordCount.scala
  136. +50 −0 examples/src/main/scala/spark/streaming/examples/JavaFlumeEventCount.java
  137. +62 −0 examples/src/main/scala/spark/streaming/examples/JavaNetworkWordCount.java
  138. +62 −0 examples/src/main/scala/spark/streaming/examples/JavaQueueStream.java
  139. +69 −0 examples/src/main/scala/spark/streaming/examples/KafkaWordCount.scala
  140. +36 −0 examples/src/main/scala/spark/streaming/examples/NetworkWordCount.scala
  141. +39 −0 examples/src/main/scala/spark/streaming/examples/QueueStream.scala
  142. +46 −0 examples/src/main/scala/spark/streaming/examples/RawNetworkGrep.scala
  143. +85 −0 examples/src/main/scala/spark/streaming/examples/clickstream/PageViewGenerator.scala
  144. +84 −0 examples/src/main/scala/spark/streaming/examples/clickstream/PageViewStream.scala
  145. +60 −0 examples/src/main/scala/spark/streaming/examples/twitter/TwitterBasic.scala
  146. +71 −0 examples/src/main/scala/spark/streaming/examples/twitter/TwitterInputDStream.scala
  147. +43 −2 pom.xml
  148. +24 −8 project/SparkBuild.scala
  149. +39 −0 pyspark
  150. +2 −0 python/.gitignore
  151. +19 −0 python/epydoc.conf
  152. +71 −0 python/examples/als.py
  153. +54 −0 python/examples/kmeans.py
  154. +57 −0 python/examples/logistic_regression.py
  155. +21 −0 python/examples/pi.py
  156. +50 −0 python/examples/transitive_closure.py
  157. +19 −0 python/examples/wordcount.py
  158. +27 −0 python/lib/PY4J_LICENSE.txt
  159. +1 −0 python/lib/PY4J_VERSION.txt
  160. BIN python/lib/py4j0.7.egg
  161. BIN python/lib/py4j0.7.jar
  162. +27 −0 python/pyspark/__init__.py
  163. +187 −0 python/pyspark/accumulators.py
  164. +48 −0 python/pyspark/broadcast.py
  165. +974 −0 python/pyspark/cloudpickle.py
  166. +258 −0 python/pyspark/context.py
  167. +38 −0 python/pyspark/files.py
  168. +38 −0 python/pyspark/java_gateway.py
  169. +92 −0 python/pyspark/join.py
  170. +761 −0 python/pyspark/rdd.py
  171. +83 −0 python/pyspark/serializers.py
  172. +17 −0 python/pyspark/shell.py
  173. +112 −0 python/pyspark/tests.py
  174. +52 −0 python/pyspark/worker.py
  175. +35 −0 python/run-tests
  176. +1 −0 python/test_support/hello.txt
  177. +7 −0 python/test_support/userlibrary.py
  178. +14 −2 repl-bin/pom.xml
  179. +2 −2 repl-bin/src/deb/control/control
  180. +35 −0 repl/pom.xml
  181. +2 −2 repl/src/test/resources/log4j.properties
  182. +18 −9 run
  183. +3 −1 run2.cmd
  184. BIN streaming/lib/org/apache/kafka/kafka/0.7.2-spark/kafka-0.7.2-spark.jar
  185. +1 −0 streaming/lib/org/apache/kafka/kafka/0.7.2-spark/kafka-0.7.2-spark.jar.md5
  186. +1 −0 streaming/lib/org/apache/kafka/kafka/0.7.2-spark/kafka-0.7.2-spark.jar.sha1
  187. +9 −0 streaming/lib/org/apache/kafka/kafka/0.7.2-spark/kafka-0.7.2-spark.pom
  188. +1 −0 streaming/lib/org/apache/kafka/kafka/0.7.2-spark/kafka-0.7.2-spark.pom.md5
  189. +1 −0 streaming/lib/org/apache/kafka/kafka/0.7.2-spark/kafka-0.7.2-spark.pom.sha1
  190. +12 −0 streaming/lib/org/apache/kafka/kafka/maven-metadata-local.xml
  191. +1 −0 streaming/lib/org/apache/kafka/kafka/maven-metadata-local.xml.md5
  192. +1 −0 streaming/lib/org/apache/kafka/kafka/maven-metadata-local.xml.sha1
  193. +155 −0 streaming/pom.xml
  194. +118 −0 streaming/src/main/scala/spark/streaming/Checkpoint.scala
  195. +657 −0 streaming/src/main/scala/spark/streaming/DStream.scala
  196. +134 −0 streaming/src/main/scala/spark/streaming/DStreamGraph.scala
  197. +62 −0 streaming/src/main/scala/spark/streaming/Duration.scala
  198. +41 −0 streaming/src/main/scala/spark/streaming/Interval.scala
  199. +24 −0 streaming/src/main/scala/spark/streaming/Job.scala
  200. +33 −0 streaming/src/main/scala/spark/streaming/JobManager.scala
  201. +151 −0 streaming/src/main/scala/spark/streaming/NetworkInputTracker.scala
  202. +562 −0 streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala
  203. +77 −0 streaming/src/main/scala/spark/streaming/Scheduler.scala
  204. +411 −0 streaming/src/main/scala/spark/streaming/StreamingContext.scala
  205. +42 −0 streaming/src/main/scala/spark/streaming/Time.scala
  206. +91 −0 streaming/src/main/scala/spark/streaming/api/java/JavaDStream.scala
  207. +183 −0 streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala
  208. +638 −0 streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala
  209. +346 −0 streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala
  210. +40 −0 streaming/src/main/scala/spark/streaming/dstream/CoGroupedDStream.scala
  211. +19 −0 streaming/src/main/scala/spark/streaming/dstream/ConstantInputDStream.scala
  212. +102 −0 streaming/src/main/scala/spark/streaming/dstream/FileInputDStream.scala
  213. +21 −0 streaming/src/main/scala/spark/streaming/dstream/FilteredDStream.scala
  214. +20 −0 streaming/src/main/scala/spark/streaming/dstream/FlatMapValuedDStream.scala
  215. +20 −0 streaming/src/main/scala/spark/streaming/dstream/FlatMappedDStream.scala
  216. +137 −0 streaming/src/main/scala/spark/streaming/dstream/FlumeInputDStream.scala
  217. +28 −0 streaming/src/main/scala/spark/streaming/dstream/ForEachDStream.scala
  218. +17 −0 streaming/src/main/scala/spark/streaming/dstream/GlommedDStream.scala
  219. +19 −0 streaming/src/main/scala/spark/streaming/dstream/InputDStream.scala
  220. +200 −0 streaming/src/main/scala/spark/streaming/dstream/KafkaInputDStream.scala
  221. +21 −0 streaming/src/main/scala/spark/streaming/dstream/MapPartitionedDStream.scala
  222. +21 −0 streaming/src/main/scala/spark/streaming/dstream/MapValuedDStream.scala
  223. +20 −0 streaming/src/main/scala/spark/streaming/dstream/MappedDStream.scala
  224. +254 −0 streaming/src/main/scala/spark/streaming/dstream/NetworkInputDStream.scala
  225. +41 −0 streaming/src/main/scala/spark/streaming/dstream/QueueInputDStream.scala
  226. +91 −0 streaming/src/main/scala/spark/streaming/dstream/RawInputDStream.scala
  227. +149 −0 streaming/src/main/scala/spark/streaming/dstream/ReducedWindowedDStream.scala
  228. +27 −0 streaming/src/main/scala/spark/streaming/dstream/ShuffledDStream.scala
  229. +103 −0 streaming/src/main/scala/spark/streaming/dstream/SocketInputDStream.scala
  230. +84 −0 streaming/src/main/scala/spark/streaming/dstream/StateDStream.scala
  231. +19 −0 streaming/src/main/scala/spark/streaming/dstream/TransformedDStream.scala
  232. +40 −0 streaming/src/main/scala/spark/streaming/dstream/UnionDStream.scala
  233. +40 −0 streaming/src/main/scala/spark/streaming/dstream/WindowedDStream.scala
  234. +84 −0 streaming/src/main/scala/spark/streaming/util/Clock.scala
  235. +98 −0 streaming/src/main/scala/spark/streaming/util/RawTextHelper.scala
  236. +60 −0 streaming/src/main/scala/spark/streaming/util/RawTextSender.scala
  237. +75 −0 streaming/src/main/scala/spark/streaming/util/RecurringTimer.scala
  238. +1,029 −0 streaming/src/test/java/spark/streaming/JavaAPISuite.java
  239. +65 −0 streaming/src/test/java/spark/streaming/JavaTestUtils.scala
  240. +11 −0 streaming/src/test/resources/log4j.properties
  241. +218 −0 streaming/src/test/scala/spark/streaming/BasicOperationsSuite.scala
  242. +210 −0 streaming/src/test/scala/spark/streaming/CheckpointSuite.scala
  243. +191 −0 streaming/src/test/scala/spark/streaming/FailureSuite.scala
  244. +355 −0 streaming/src/test/scala/spark/streaming/InputStreamsSuite.scala
  245. +291 −0 streaming/src/test/scala/spark/streaming/TestSuiteBase.scala
  246. +300 −0 streaming/src/test/scala/spark/streaming/WindowOperationsSuite.scala
.gitignore
@@ -12,6 +12,7 @@ third_party/libmesos.so
third_party/libmesos.dylib
conf/java-opts
conf/spark-env.sh
+conf/streaming-env.sh
conf/log4j.properties
docs/_site
docs/api
@@ -31,6 +32,7 @@ project/plugins/src_managed/
logs/
log/
spark-tests.log
+streaming-tests.log
dependency-reduced-pom.xml
.ensime
.ensime_lucene
bagel/pom.xml
@@ -45,6 +45,11 @@
<profiles>
<profile>
<id>hadoop1</id>
+ <activation>
+ <property>
+ <name>!hadoopVersion</name>
+ </property>
+ </activation>
<dependencies>
<dependency>
<groupId>org.spark-project</groupId>
@@ -72,6 +77,12 @@
</profile>
<profile>
<id>hadoop2</id>
+ <activation>
+ <property>
+ <name>hadoopVersion</name>
+ <value>2</value>
+ </property>
+ </activation>
<dependencies>
<dependency>
<groupId>org.spark-project</groupId>
bagel/src/test/resources/log4j.properties
@@ -1,8 +1,8 @@
-# Set everything to be logged to the console
+# Set everything to be logged to the file bagel/target/unit-tests.log
log4j.rootCategory=INFO, file
log4j.appender.file=org.apache.log4j.FileAppender
log4j.appender.file.append=false
-log4j.appender.file.file=spark-tests.log
+log4j.appender.file.file=bagel/target/unit-tests.log
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n
core/pom.xml
@@ -71,6 +71,10 @@
<groupId>cc.spray</groupId>
<artifactId>spray-server</artifactId>
</dependency>
+ <dependency>
+ <groupId>cc.spray</groupId>
+ <artifactId>spray-json_${scala.version}</artifactId>
+ </dependency>
<dependency>
<groupId>org.tomdz.twirl</groupId>
<artifactId>twirl-api</artifactId>
@@ -159,6 +163,11 @@
<profiles>
<profile>
<id>hadoop1</id>
+ <activation>
+ <property>
+ <name>!hadoopVersion</name>
+ </property>
+ </activation>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
@@ -211,6 +220,12 @@
</profile>
<profile>
<id>hadoop2</id>
+ <activation>
+ <property>
+ <name>hadoopVersion</name>
+ <value>2</value>
+ </property>
+ </activation>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
@@ -267,4 +282,4 @@
</build>
</profile>
</profiles>
-</project>
+</project>
core/src/main/scala/spark/Accumulators.scala
@@ -25,8 +25,7 @@ class Accumulable[R, T] (
extends Serializable {
val id = Accumulators.newId
- @transient
- private var value_ = initialValue // Current value on master
+ @transient private var value_ = initialValue // Current value on master
val zero = param.zero(initialValue) // Zero value to be passed to workers
var deserialized = false
@@ -38,20 +37,37 @@ class Accumulable[R, T] (
*/
def += (term: T) { value_ = param.addAccumulator(value_, term) }
+ /**
+ * Add more data to this accumulator / accumulable
+ * @param term the data to add
+ */
+ def add(term: T) { value_ = param.addAccumulator(value_, term) }
+
/**
* Merge two accumulable objects together
- *
+ *
* Normally, a user will not want to use this version, but will instead call `+=`.
- * @param term the other Accumulable that will get merged with this
+ * @param term the other `R` that will get merged with this
*/
def ++= (term: R) { value_ = param.addInPlace(value_, term)}
+ /**
+ * Merge two accumulable objects together
+ *
+ * Normally, a user will not want to use this version, but will instead call `add`.
+ * @param term the other `R` that will get merged with this
+ */
+ def merge(term: R) { value_ = param.addInPlace(value_, term)}
+
/**
* Access the accumulator's current value; only allowed on master.
*/
- def value = {
- if (!deserialized) value_
- else throw new UnsupportedOperationException("Can't read accumulator value in task")
+ def value: R = {
+ if (!deserialized) {
+ value_
+ } else {
+ throw new UnsupportedOperationException("Can't read accumulator value in task")
+ }
}
/**
@@ -68,10 +84,17 @@ class Accumulable[R, T] (
/**
* Set the accumulator's value; only allowed on master.
*/
- def value_= (r: R) {
- if (!deserialized) value_ = r
+ def value_= (newValue: R) {
+ if (!deserialized) value_ = newValue
else throw new UnsupportedOperationException("Can't assign accumulator value in task")
}
+
+ /**
+ * Set the accumulator's value; only allowed on master
+ */
+ def setValue(newValue: R) {
+ this.value = newValue
+ }
// Called by Java when deserializing an object
private def readObject(in: ObjectInputStream) {
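
The hunk above adds method-style equivalents (`add`, `merge`, `setValue`) for the symbolic `+=`, `++=`, and `value_=` operators, which callers outside Scala cannot invoke. A minimal usage sketch, assuming the usual `SparkContext.accumulator` factory and a local master; none of this code is part of the diff:

```scala
import spark.SparkContext

object AccumulatorSketch {
  def main(args: Array[String]) {
    val sc  = new SparkContext("local", "accumulator-sketch")
    val sum = sc.accumulator(0)                 // Accumulator[Int] with zero value 0
    sc.parallelize(1 to 100).foreach { x =>
      sum += x                                  // operator form, for Scala callers
      // sum.add(x) is the equivalent method form added here for callers that
      // cannot use symbolic method names (e.g. the Java API)
    }
    println(sum.value)                          // reading value is only allowed on the master
    sc.stop()
  }
}
```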
core/src/main/scala/spark/BoundedMemoryCache.scala
@@ -1,118 +0,0 @@
-package spark
-
-import java.util.LinkedHashMap
-
-/**
- * An implementation of Cache that estimates the sizes of its entries and attempts to limit its
- * total memory usage to a fraction of the JVM heap. Objects' sizes are estimated using
- * SizeEstimator, which has limitations; most notably, we will overestimate total memory used if
- * some cache entries have pointers to a shared object. Nonetheless, this Cache should work well
- * when most of the space is used by arrays of primitives or of simple classes.
- */
-private[spark] class BoundedMemoryCache(maxBytes: Long) extends Cache with Logging {
- logInfo("BoundedMemoryCache.maxBytes = " + maxBytes)
-
- def this() {
- this(BoundedMemoryCache.getMaxBytes)
- }
-
- private var currentBytes = 0L
- private val map = new LinkedHashMap[(Any, Int), Entry](32, 0.75f, true)
-
- override def get(datasetId: Any, partition: Int): Any = {
- synchronized {
- val entry = map.get((datasetId, partition))
- if (entry != null) {
- entry.value
- } else {
- null
- }
- }
- }
-
- override def put(datasetId: Any, partition: Int, value: Any): CachePutResponse = {
- val key = (datasetId, partition)
- logInfo("Asked to add key " + key)
- val size = estimateValueSize(key, value)
- synchronized {
- if (size > getCapacity) {
- return CachePutFailure()
- } else if (ensureFreeSpace(datasetId, size)) {
- logInfo("Adding key " + key)
- map.put(key, new Entry(value, size))
- currentBytes += size
- logInfo("Number of entries is now " + map.size)
- return CachePutSuccess(size)
- } else {
- logInfo("Didn't add key " + key + " because we would have evicted part of same dataset")
- return CachePutFailure()
- }
- }
- }
-
- override def getCapacity: Long = maxBytes
-
- /**
- * Estimate sizeOf 'value'
- */
- private def estimateValueSize(key: (Any, Int), value: Any) = {
- val startTime = System.currentTimeMillis
- val size = SizeEstimator.estimate(value.asInstanceOf[AnyRef])
- val timeTaken = System.currentTimeMillis - startTime
- logInfo("Estimated size for key %s is %d".format(key, size))
- logInfo("Size estimation for key %s took %d ms".format(key, timeTaken))
- size
- }
-
- /**
- * Remove least recently used entries from the map until at least space bytes are free, in order
- * to make space for a partition from the given dataset ID. If this cannot be done without
- * evicting other data from the same dataset, returns false; otherwise, returns true. Assumes
- * that a lock is held on the BoundedMemoryCache.
- */
- private def ensureFreeSpace(datasetId: Any, space: Long): Boolean = {
- logInfo("ensureFreeSpace(%s, %d) called with curBytes=%d, maxBytes=%d".format(
- datasetId, space, currentBytes, maxBytes))
- val iter = map.entrySet.iterator // Will give entries in LRU order
- while (maxBytes - currentBytes < space && iter.hasNext) {
- val mapEntry = iter.next()
- val (entryDatasetId, entryPartition) = mapEntry.getKey
- if (entryDatasetId == datasetId) {
- // Cannot make space without removing part of the same dataset, or a more recently used one
- return false
- }
- reportEntryDropped(entryDatasetId, entryPartition, mapEntry.getValue)
- currentBytes -= mapEntry.getValue.size
- iter.remove()
- }
- return true
- }
-
- protected def reportEntryDropped(datasetId: Any, partition: Int, entry: Entry) {
- logInfo("Dropping key (%s, %d) of size %d to make space".format(datasetId, partition, entry.size))
- // TODO: remove BoundedMemoryCache
-
- val (keySpaceId, innerDatasetId) = datasetId.asInstanceOf[(Any, Any)]
- innerDatasetId match {
- case rddId: Int =>
- SparkEnv.get.cacheTracker.dropEntry(rddId, partition)
- case broadcastUUID: java.util.UUID =>
- // TODO: Maybe something should be done if the broadcasted variable falls out of cache
- case _ =>
- }
- }
-}
-
-// An entry in our map; stores a cached object and its size in bytes
-private[spark] case class Entry(value: Any, size: Long)
-
-private[spark] object BoundedMemoryCache {
- /**
- * Get maximum cache capacity from system configuration
- */
- def getMaxBytes: Long = {
- val memoryFractionToUse = System.getProperty("spark.boundedMemoryCache.memoryFraction", "0.66").toDouble
- (Runtime.getRuntime.maxMemory * memoryFractionToUse).toLong
- }
-}
-
core/src/main/scala/spark/CacheManager.scala
@@ -0,0 +1,65 @@
+package spark
+
+import scala.collection.mutable.{ArrayBuffer, HashSet}
+import spark.storage.{BlockManager, StorageLevel}
+
+
+/** Spark class responsible for passing RDDs split contents to the BlockManager and making
+ sure a node doesn't load two copies of an RDD at once.
+ */
+private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
+ private val loading = new HashSet[String]
+
+ /** Gets or computes an RDD split. Used by RDD.iterator() when a RDD is cached. */
+ def getOrCompute[T](rdd: RDD[T], split: Split, context: TaskContext, storageLevel: StorageLevel)
+ : Iterator[T] = {
+ val key = "rdd_%d_%d".format(rdd.id, split.index)
+ logInfo("Cache key is " + key)
+ blockManager.get(key) match {
+ case Some(cachedValues) =>
+ // Split is in cache, so just return its values
+ logInfo("Found partition in cache!")
+ return cachedValues.asInstanceOf[Iterator[T]]
+
+ case None =>
+ // Mark the split as loading (unless someone else marks it first)
+ loading.synchronized {
+ if (loading.contains(key)) {
+ logInfo("Loading contains " + key + ", waiting...")
+ while (loading.contains(key)) {
+ try {loading.wait()} catch {case _ =>}
+ }
+ logInfo("Loading no longer contains " + key + ", so returning cached result")
+ // See whether someone else has successfully loaded it. The main way this would fail
+ // is for the RDD-level cache eviction policy if someone else has loaded the same RDD
+ // partition but we didn't want to make space for it. However, that case is unlikely
+ // because it's unlikely that two threads would work on the same RDD partition. One
+ // downside of the current code is that threads wait serially if this does happen.
+ blockManager.get(key) match {
+ case Some(values) =>
+ return values.asInstanceOf[Iterator[T]]
+ case None =>
+ logInfo("Whoever was loading " + key + " failed; we'll try it ourselves")
+ loading.add(key)
+ }
+ } else {
+ loading.add(key)
+ }
+ }
+ try {
+ // If we got here, we have to load the split
+ val elements = new ArrayBuffer[Any]
+ logInfo("Computing partition " + split)
+ elements ++= rdd.compute(split, context)
+ // Try to put this block in the blockManager
+ blockManager.put(key, elements, storageLevel, true)
+ return elements.iterator.asInstanceOf[Iterator[T]]
+ } finally {
+ loading.synchronized {
+ loading.remove(key)
+ loading.notifyAll()
+ }
+ }
+ }
+ }
+}
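
For orientation, a hedged sketch of how a caller might route a cached RDD's split through `getOrCompute`; the wiring via `SparkEnv.get.cacheManager` is an assumption about how the manager is exposed and is not shown in this diff:

```scala
import spark.{RDD, SparkEnv, Split, TaskContext}
import spark.storage.StorageLevel

object CacheManagerSketch {
  // Assumed call path: a split of an RDD with a storage level set is delegated to
  // CacheManager, which either serves it from the BlockManager or computes and stores it.
  def cachedIterator[T](rdd: RDD[T], split: Split, context: TaskContext,
                        level: StorageLevel): Iterator[T] = {
    if (level != StorageLevel.NONE) {
      SparkEnv.get.cacheManager.getOrCompute(rdd, split, context, level)
    } else {
      rdd.compute(split, context)   // uncached: compute the split directly
    }
  }
}
```

The `loading` set in `getOrCompute` is what keeps two threads on the same node from computing the same split concurrently: the second thread blocks on `loading.wait()` and then re-checks the BlockManager before falling back to computing the split itself.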