[SPARK-25299] ShuffleLocation/FetchFailed integrations with scheduler #548
MapShuffleLocations.java (package org.apache.spark.api.shuffle):

@@ -17,8 +17,10 @@
 package org.apache.spark.api.shuffle;

 import org.apache.spark.annotation.Experimental;
+import org.apache.spark.api.java.Optional;

 import java.io.Serializable;
+import java.util.List;

 /**
  * Represents metadata about where shuffle blocks were written in a single map task.

@@ -35,5 +37,11 @@ public interface MapShuffleLocations extends Serializable {
   /**
    * Get the location for a given shuffle block written by this map task.
    */
-  ShuffleLocation getLocationForBlock(int reduceId);
+  List<ShuffleLocation> getLocationsForBlock(int reduceId);
+
+  /**
+   * Deletes a host or a host/port combination from this MapShuffleLocations.
+   * Returns true if the removal of this ShuffleLocation results in missing partitions.
+   */
+  boolean removeShuffleLocation(String host, Optional<Integer> port);
 }

Review comment (on the removeShuffleLocation javadoc): Think this can be worded more clearly; for something like this it's better to be verbose, although we do expect only those who really know what they're doing to be implementing these APIs.
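
For readers implementing this API, here is a minimal sketch of what an implementation could look like under the new signatures. It is not part of this PR: the class name and the replication model are invented, and it assumes ShuffleLocation exposes host() and port() accessors (only host() and execId() are visible elsewhere in this diff).

import java.util.{ArrayList, List => JList}

import org.apache.spark.api.java.Optional
import org.apache.spark.api.shuffle.{MapShuffleLocations, ShuffleLocation}

// Hypothetical: all reduce blocks of this map task were replicated to the same
// set of locations, so every block shares one live-replica list.
class ReplicatedMapShuffleLocations(initial: JList[ShuffleLocation])
  extends MapShuffleLocations {

  private val live = new ArrayList[ShuffleLocation](initial)

  override def getLocationsForBlock(reduceId: Int): JList[ShuffleLocation] = live

  override def removeShuffleLocation(host: String, port: Optional[Integer]): Boolean = {
    // Drop every replica on the given host; when a port is supplied, drop only
    // replicas matching that host/port pair. port() is an assumed accessor.
    val it = live.iterator()
    while (it.hasNext) {
      val loc = it.next()
      val hostMatches = loc.host() == host
      val portMatches = !port.isPresent || port.get().intValue() == loc.port()
      if (hostMatches && portMatches) {
        it.remove()
      }
    }
    // Per the javadoc above: true means some partition now has no location
    // left, so the scheduler must treat this map output as missing.
    live.isEmpty
  }
}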
DAGScheduler.scala (package org.apache.spark.scheduler):

@@ -43,6 +43,7 @@ import org.apache.spark.network.util.JavaUtils
 import org.apache.spark.partial.{ApproximateActionListener, ApproximateEvaluator, PartialResult}
 import org.apache.spark.rdd.{DeterministicLevel, RDD, RDDCheckpointData}
 import org.apache.spark.rpc.RpcTimeout
+import org.apache.spark.shuffle.sort.io.DefaultShuffleDataIO
 import org.apache.spark.storage._
 import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat
 import org.apache.spark.util._
@@ -1478,7 +1479,7 @@ private[spark] class DAGScheduler(
       }
     }

-    case FetchFailed(bmAddress, shuffleId, mapId, _, failureMessage) =>
+    case FetchFailed(shuffleLocations, shuffleId, mapId, _, failureMessage) =>
       val failedStage = stageIdToStage(task.stageId)
       val mapStage = shuffleIdToMapStage(shuffleId)
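
Upstream, the first field of FetchFailed is a BlockManagerId named bmAddress; the new pattern suggests it now carries the failed shuffle locations instead. A rough sketch of the changed shape, assuming a Seq[ShuffleLocation] (the exact field type is not visible in this hunk):

import org.apache.spark.api.shuffle.ShuffleLocation

// Sketch of the modified TaskEndReason member; field names and types other
// than shuffleLocations mirror the existing upstream case class, and
// Seq[ShuffleLocation] is an assumption.
case class FetchFailed(
    shuffleLocations: Seq[ShuffleLocation], // was: bmAddress: BlockManagerId
    shuffleId: Int,
    mapId: Int,
    reduceId: Int,
    message: String)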
@@ -1511,7 +1512,7 @@ private[spark] class DAGScheduler(
         mapOutputTracker.unregisterAllMapOutput(shuffleId)
       } else if (mapId != -1) {
         // Mark the map whose fetch failed as broken in the map stage
-        mapOutputTracker.unregisterMapOutput(shuffleId, mapId, bmAddress)
+        mapOutputTracker.unregisterMapOutput(shuffleId, mapId, shuffleLocations)
       }

       if (failedStage.rdd.isBarrier()) {
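
The MapOutputTracker side of this call is not included in the diff. Purely as a sketch of how the tracker might consume the new argument via the removeShuffleLocation contract from the Java API above (all names and structures below are assumptions):

import scala.collection.mutable

import org.apache.spark.api.java.Optional
import org.apache.spark.api.shuffle.{MapShuffleLocations, ShuffleLocation}

// Hypothetical tracker-side sketch, not from this PR: assumes the tracker
// keeps one MapShuffleLocations per map task of the failed shuffle.
class TrackerSketch(mapLocations: mutable.Map[Int, MapShuffleLocations]) {

  def unregisterMapOutput(
      shuffleId: Int, // selects which shuffle's table to consult; elided here
      mapId: Int,
      locations: Seq[ShuffleLocation]): Unit = synchronized {
    mapLocations.get(mapId).foreach { mapLocs =>
      var anyMissing = false
      locations.foreach { loc =>
        // Remove by host only for simplicity; a port-aware caller could pass
        // Optional.of(...) when the location carries a port.
        anyMissing |= mapLocs.removeShuffleLocation(loc.host(), Optional.empty[Integer]())
      }
      if (anyMissing) {
        // Some block lost its last replica: forget the whole map output so
        // the scheduler recomputes this map task.
        mapLocations.remove(mapId)
      }
    }
  }
}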
@@ -1626,22 +1627,39 @@ private[spark] class DAGScheduler(
         }

         // TODO: mark the executor as failed only if there were lots of fetch failures on it
-        if (bmAddress != null) {
-          val hostToUnregisterOutputs = if (env.blockManager.externalShuffleServiceEnabled &&
-            unRegisterOutputOnHostOnFetchFailure) {
-            // We had a fetch failure with the external shuffle service, so we
-            // assume all shuffle data on the node is bad.
-            Some(bmAddress.host)
-          } else {
-            // Unregister shuffle data just for one executor (we don't have any
-            // reason to believe shuffle data has been lost for the entire host).
-            None
-          }
-          removeExecutorAndUnregisterOutputs(
-            execId = bmAddress.executorId,
-            fileLost = true,
-            hostToUnregisterOutputs = hostToUnregisterOutputs,
-            maybeEpoch = Some(task.epoch))
+        if (shuffleLocations != null) {
+          val toRemoveHost =
+            if (env.conf.get(config.SHUFFLE_IO_PLUGIN_CLASS) ==
+                classOf[DefaultShuffleDataIO].getName) {
+              env.blockManager.externalShuffleServiceEnabled &&
+                unRegisterOutputOnHostOnFetchFailure
+            } else {
+              true // always remove for remote shuffle storage
+            }
+
+          shuffleLocations.foreach(location => {
+            var epochAllowsRemoval = false
+            // If the location belonged to an executor, remove all outputs on the executor
+            val maybeExecId = location.execId()
+            val currentEpoch = Some(task.epoch).getOrElse(mapOutputTracker.getEpoch)
+            if (maybeExecId.isPresent) {
+              val execId = maybeExecId.get()
+              if (!failedEpoch.contains(execId) || failedEpoch(execId) < currentEpoch) {
+                failedEpoch(execId) = currentEpoch
+                epochAllowsRemoval = true
+                blockManagerMaster.removeExecutor(execId)
+                mapOutputTracker.removeOutputsOnExecutor(execId)
+              }
+            } else {
+              // If the location doesn't belong to an executor, the epoch doesn't matter
+              epochAllowsRemoval = true
+            }
+
+            if (toRemoveHost && epochAllowsRemoval) {
+              mapOutputTracker.removeOutputsOnHost(location.host())
+            }
+          })
           clearCacheLocs()
         }
       }

Review comment (on the classOf[DefaultShuffleDataIO].getName comparison): if we're going to be doing equality tests on these classes, we should make sure they are […]
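
The failedEpoch check above deduplicates repeated failures of the same executor within one failure wave. Note that Some(task.epoch).getOrElse(mapOutputTracker.getEpoch) always evaluates to task.epoch, so the getOrElse fallback never fires. A standalone illustration of the guard (names assumed, not from this PR):

import scala.collection.mutable

// Repeated fetch failures for one executor within the same epoch are
// deduplicated, so its outputs are unregistered at most once per wave.
object EpochGuardDemo {
  private val failedEpoch = mutable.HashMap[String, Long]()

  /** True only the first time execId is seen failing at this epoch or later. */
  def shouldRemoveOutputs(execId: String, taskEpoch: Long): Boolean = {
    if (!failedEpoch.contains(execId) || failedEpoch(execId) < taskEpoch) {
      failedEpoch(execId) = taskEpoch
      true
    } else {
      false
    }
  }

  def main(args: Array[String]): Unit = {
    println(shouldRemoveOutputs("exec-1", 5)) // true: first failure at epoch 5
    println(shouldRemoveOutputs("exec-1", 5)) // false: duplicate within the wave
    println(shouldRemoveOutputs("exec-1", 6)) // true: a newer failure wave
  }
}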
Review comment (general): I mentioned this on the doc, but I'm skeptical about supporting different locations for each (map, reduce) block, instead of just replicating the entire output of one map task to the same places. I don't think I properly understood that part even before this change ... I'll need to look through this more carefully to figure out what the effect of that would be, in particular how much bookkeeping is required on the driver.
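
To make the bookkeeping concern concrete, the two models imply different driver-side state shapes (illustrative types, not from this PR):

import org.apache.spark.api.shuffle.ShuffleLocation

object LocationBookkeeping {
  // Per-map replication: one shared replica list per map task; every reduce
  // block of that task resolves to the same locations. O(maps x replicas).
  type PerMapLocations = Map[Int, Seq[ShuffleLocation]] // mapId -> replicas

  // Per-(map, reduce) locations, which getLocationsForBlock(reduceId) permits:
  // an independent list per block, O(maps x reduces x replicas) in the worst
  // case, hence the concern about driver-side bookkeeping.
  type PerBlockLocations = Map[(Int, Int), Seq[ShuffleLocation]] // (mapId, reduceId) -> locations
}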