Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-25299] ShuffleLocation/FetchFailed integrations with scheduler #548

Closed
wants to merge 27 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
package org.apache.spark.api.shuffle;

import org.apache.spark.annotation.Experimental;
import org.apache.spark.api.java.Optional;

import java.io.Serializable;
import java.util.List;

/**
* Represents metadata about where shuffle blocks were written in a single map task.
Expand All @@ -35,5 +37,30 @@ public interface MapShuffleLocations extends Serializable {
/**
* Get the location for a given shuffle block written by this map task.
*/
ShuffleLocation getLocationForBlock(int reduceId);
List<ShuffleLocation> getLocationsForBlock(int reduceId);
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mentioned this on the doc, but I'm skeptical about supporting different locations for each (map, reduce) block, instead of just replicating the entire output of one map task to the same places. I don't think I properly understood that part even before this change ... I'll need to look through this more carefully to figure out what the effect of that would be, in particular how much bookkeeping is required on the driver.


/**
 * Mark a location for a block in this map output as unreachable, and thus partitions can no
 * longer be fetched from that location.
 * <p>
 * This is called by the scheduler when it detects that a block could not be fetched from the
 * file server located at this host and port. An empty port matches every location on the
 * given host, regardless of port.
 * <p>
 * This should return true if there exists a data loss from the removal of this shuffle
 * location. Otherwise, if all partitions can still be fetched from alternative locations,
 * this should return false.
 */
boolean invalidateShuffleLocation(String host, Optional<Integer> port);

/**
 * Mark all locations within this MapShuffleLocations with this execId as unreachable.
 * <p>
 * This is called by the scheduler when it detects that an executor cannot be reached to
 * fetch file data.
 * <p>
 * This should return true if there exists a data loss from the removal of shuffle locations
 * with this execId. Otherwise, if all partitions can still be fetched from alternative
 * locations, this should return false.
 */
boolean invalidateShuffleLocation(String executorId);
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@

package org.apache.spark.api.shuffle;

import org.apache.spark.api.java.Optional;

import java.util.Objects;

/**
Expand All @@ -31,10 +29,10 @@ public class ShuffleBlockInfo {
private final int mapId;
private final int reduceId;
private final long length;
private final Optional<ShuffleLocation> shuffleLocation;
private final ShuffleLocation[] shuffleLocation;

public ShuffleBlockInfo(int shuffleId, int mapId, int reduceId, long length,
Optional<ShuffleLocation> shuffleLocation) {
ShuffleLocation[] shuffleLocation) {
this.shuffleId = shuffleId;
this.mapId = mapId;
this.reduceId = reduceId;
Expand All @@ -58,7 +56,7 @@ public long getLength() {
return length;
}

public Optional<ShuffleLocation> getShuffleLocation() {
public ShuffleLocation[] getShuffleLocation() {
return shuffleLocation;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,12 @@ public interface ShuffleDriverComponents {
void cleanupApplication() throws IOException;

void removeShuffleData(int shuffleId, boolean blocking) throws IOException;

/**
 * Whether to unregister other map statuses on the same hosts or executors
 * when a shuffle task returns a {@link org.apache.spark.FetchFailed}.
 * <p>
 * Conservative default: a single fetch failure does not invalidate other map outputs
 * unless an implementation opts in by overriding this method.
 */
default boolean unregisterOtherMapStatusesOnFetchFailure() {
    return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,33 @@

package org.apache.spark.api.shuffle;

import org.apache.spark.api.java.Optional;

/**
* Marker interface representing a location of a shuffle block. Implementations of shuffle readers
* and writers are expected to cast this down to an implementation-specific representation.
*/
public interface ShuffleLocation {}
public abstract class ShuffleLocation {

  /** The host on which the shuffle block is located. */
  public abstract String host();

  /** The port of the file server serving the shuffle block. */
  public abstract int port();

  /**
   * The executor on which the ShuffleLocation is located. Returns {@link Optional#empty()} if
   * the location is not associated with an executor.
   */
  public Optional<String> execId() {
    return Optional.empty();
  }

  @Override
  public String toString() {
    // Renders "ShuffleLocation host:port", appending the executor id when one is known.
    String rendered = "ShuffleLocation " + host() + ":" + port();
    if (!execId().isPresent()) {
      return rendered;
    }
    return rendered + " (execId: " + execId().get() + ")";
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,21 @@

package org.apache.spark.shuffle.sort;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;

import com.google.common.collect.Lists;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.shuffle.MapShuffleLocations;
import org.apache.spark.api.shuffle.ShuffleLocation;
import org.apache.spark.storage.BlockManagerId;

import java.util.List;
import java.util.Objects;

public class DefaultMapShuffleLocations implements MapShuffleLocations, ShuffleLocation {
public class DefaultMapShuffleLocations extends ShuffleLocation implements MapShuffleLocations {

/**
* We borrow the cache size from the BlockManagerId's cache - around 1MB, which should be
Expand All @@ -45,18 +49,34 @@ public DefaultMapShuffleLocations load(BlockManagerId blockManagerId) {
});

private final BlockManagerId location;
@JsonIgnore
private final List<ShuffleLocation> locationsArray;

public DefaultMapShuffleLocations(BlockManagerId blockManagerId) {
this.location = blockManagerId;
this.locationsArray = Lists.newArrayList(this);
}

/**
 * Returns the canonical DefaultMapShuffleLocations for the given BlockManagerId,
 * memoized through the loading cache so repeated lookups share one instance.
 */
public static DefaultMapShuffleLocations get(BlockManagerId blockManagerId) {
    return DEFAULT_SHUFFLE_LOCATIONS_CACHE.getUnchecked(blockManagerId);
}

@Override
public ShuffleLocation getLocationForBlock(int reduceId) {
return this;
public List<ShuffleLocation> getLocationsForBlock(int reduceId) {
return locationsArray;
}

@Override
public boolean invalidateShuffleLocation(String host, Optional<Integer> port) {
  // This single-replica location is lost iff the host matches and, when a port
  // was supplied, the port matches as well.
  if (!this.host().equals(host)) {
    return false;
  }
  return !port.isPresent() || this.port() == port.get().intValue();
}

@Override
public boolean invalidateShuffleLocation(String executorId) {
    // Single-replica location: data is lost exactly when the failed executor owns it.
    return location.executorId().equals(executorId);
}

public BlockManagerId getBlockManagerId() {
Expand All @@ -73,4 +93,19 @@ public boolean equals(Object other) {
public int hashCode() {
    // Identity is fully determined by the wrapped BlockManagerId (consistent with equals).
    return Objects.hashCode(location);
}

@Override
public String host() {
    // Delegates to the wrapped BlockManagerId.
    return location.host();
}

@Override
public int port() {
    // Delegates to the wrapped BlockManagerId.
    return location.port();
}

@Override
public Optional<String> execId() {
    // A BlockManagerId always carries its owning executor's id, so this is never empty.
    return Optional.of(location.executorId());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ public void removeShuffleData(int shuffleId, boolean blocking) throws IOExceptio
blockManagerMaster.removeShuffle(shuffleId, blocking);
}

/**
 * This implementation keeps shuffle data with the executor's block manager (see
 * removeShuffleData above), so a fetch failure suggests sibling map outputs on the
 * same host/executor are likely unreachable too — ask the scheduler to unregister them.
 */
@Override
public boolean unregisterOtherMapStatusesOnFetchFailure() {
    return true;
}

private void checkInitialized() {
if (blockManagerMaster == null) {
throw new IllegalStateException("Driver components must be initialized before using");
Expand Down
66 changes: 47 additions & 19 deletions core/src/main/scala/org/apache/spark/MapOutputTracker.scala
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import scala.concurrent.duration.Duration
import scala.reflect.ClassTag
import scala.util.control.NonFatal

import org.apache.spark.api.java.Optional
import org.apache.spark.api.shuffle.{MapShuffleLocations, ShuffleLocation}
import org.apache.spark.broadcast.{Broadcast, BroadcastManager}
import org.apache.spark.internal.Logging
Expand Down Expand Up @@ -102,11 +103,23 @@ private class ShuffleStatus(numPartitions: Int) {
* This is a no-op if there is no registered map output or if the registered output is from a
* different block manager.
*/
def removeMapOutput(mapId: Int, bmAddress: BlockManagerId): Unit = synchronized {
if (mapStatuses(mapId) != null && mapStatuses(mapId).location == bmAddress) {
_numAvailableOutputs -= 1
mapStatuses(mapId) = null
invalidateSerializedMapOutputStatusCache()
def removeMapOutput(mapId: Int, shuffleLocations: Seq[ShuffleLocation]): Unit = synchronized {
  if (mapStatuses(mapId) != null) {
    // An empty location list unconditionally discards the map output.
    var shouldDelete = shuffleLocations.isEmpty
    // Deliberately not exists(): invalidateShuffleLocation must run for EVERY location so
    // each one is marked unreachable, and the output is dropped if ANY invalidation reports
    // data loss. (Previously the flag was overwritten each iteration, so only the last
    // location's result was honored.)
    shuffleLocations.foreach { location =>
      val lostData = mapStatuses(mapId)
        .mapShuffleLocations
        .invalidateShuffleLocation(location.host(), Optional.of(location.port()))
      shouldDelete = shouldDelete || lostData
    }
    if (shouldDelete) {
      _numAvailableOutputs -= 1
      mapStatuses(mapId) = null
      invalidateSerializedMapOutputStatusCache()
    }
  }
}

Expand All @@ -115,7 +128,14 @@ private class ShuffleStatus(numPartitions: Int) {
* outputs which are served by an external shuffle server (if one exists).
*/
def removeOutputsOnHost(host: String): Unit = {
removeOutputsByFilter(x => x.host == host)
for (mapId <- 0 until mapStatuses.length) {
if (mapStatuses(mapId) != null &&
mapStatuses(mapId).mapShuffleLocations.invalidateShuffleLocation(host, Optional.empty())) {
_numAvailableOutputs -= 1
mapStatuses(mapId) = null
invalidateSerializedMapOutputStatusCache()
}
}
}

/**
Expand All @@ -124,7 +144,14 @@ private class ShuffleStatus(numPartitions: Int) {
* still registered with that execId.
*/
def removeOutputsOnExecutor(execId: String): Unit = synchronized {
removeOutputsByFilter(x => x.executorId == execId)
for (mapId <- 0 until mapStatuses.length) {
if (mapStatuses(mapId) != null &&
mapStatuses(mapId).mapShuffleLocations.invalidateShuffleLocation(execId)) {
_numAvailableOutputs -= 1
mapStatuses(mapId) = null
invalidateSerializedMapOutputStatusCache()
}
}
}

/**
Expand Down Expand Up @@ -283,7 +310,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging

// For testing
def getMapSizesByShuffleLocation(shuffleId: Int, reduceId: Int)
: Iterator[(Option[ShuffleLocation], Seq[(BlockId, Long)])] = {
: Iterator[(Seq[ShuffleLocation], Seq[(BlockId, Long)])] = {
getMapSizesByShuffleLocation(shuffleId, reduceId, reduceId + 1)
}

Expand All @@ -297,7 +324,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging
* describing the shuffle blocks that are stored at that block manager.
*/
def getMapSizesByShuffleLocation(shuffleId: Int, startPartition: Int, endPartition: Int)
: Iterator[(Option[ShuffleLocation], Seq[(BlockId, Long)])]
: Iterator[(Seq[ShuffleLocation], Seq[(BlockId, Long)])]

/**
* Deletes map output status information for the specified shuffle stage.
Expand Down Expand Up @@ -424,10 +451,10 @@ private[spark] class MapOutputTrackerMaster(
}

/** Unregister map output information of the given shuffle, mapper and block manager */
def unregisterMapOutput(shuffleId: Int, mapId: Int, bmAddress: BlockManagerId) {
def unregisterMapOutput(shuffleId: Int, mapId: Int, shuffleLocations: Seq[ShuffleLocation]) {
shuffleStatuses.get(shuffleId) match {
case Some(shuffleStatus) =>
shuffleStatus.removeMapOutput(mapId, bmAddress)
shuffleStatus.removeMapOutput(mapId, shuffleLocations)
incrementEpoch()
case None =>
throw new SparkException("unregisterMapOutput called for nonexistent shuffle ID")
Expand Down Expand Up @@ -647,7 +674,7 @@ private[spark] class MapOutputTrackerMaster(
// Get blocks sizes by executor Id. Note that zero-sized blocks are excluded in the result.
// This method is only called in local-mode.
def getMapSizesByShuffleLocation(shuffleId: Int, startPartition: Int, endPartition: Int)
: Iterator[(Option[ShuffleLocation], Seq[(BlockId, Long)])] = {
: Iterator[(Seq[ShuffleLocation], Seq[(BlockId, Long)])] = {
logDebug(s"Fetching outputs for shuffle $shuffleId, partitions $startPartition-$endPartition")
shuffleStatuses.get(shuffleId) match {
case Some (shuffleStatus) =>
Expand Down Expand Up @@ -684,7 +711,7 @@ private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTr

// Get blocks sizes by executor Id. Note that zero-sized blocks are excluded in the result.
override def getMapSizesByShuffleLocation(shuffleId: Int, startPartition: Int, endPartition: Int)
: Iterator[(Option[ShuffleLocation], Seq[(BlockId, Long)])] = {
: Iterator[(Seq[ShuffleLocation], Seq[(BlockId, Long)])] = {
logDebug(s"Fetching outputs for shuffle $shuffleId, partitions $startPartition-$endPartition")
val statuses = getStatuses(shuffleId)
try {
Expand Down Expand Up @@ -873,9 +900,9 @@ private[spark] object MapOutputTracker extends Logging {
shuffleId: Int,
startPartition: Int,
endPartition: Int,
statuses: Array[MapStatus]): Iterator[(Option[ShuffleLocation], Seq[(BlockId, Long)])] = {
statuses: Array[MapStatus]): Iterator[(Seq[ShuffleLocation], Seq[(BlockId, Long)])] = {
assert (statuses != null)
val splitsByAddress = new HashMap[Option[ShuffleLocation], ListBuffer[(BlockId, Long)]]
val splitsByAddress = new HashMap[Seq[ShuffleLocation], ListBuffer[(BlockId, Long)]]
for ((status, mapId) <- statuses.iterator.zipWithIndex) {
if (status == null) {
val errorMessage = s"Missing an output location for shuffle $shuffleId"
Expand All @@ -885,12 +912,13 @@ private[spark] object MapOutputTracker extends Logging {
for (part <- startPartition until endPartition) {
val size = status.getSizeForBlock(part)
if (size != 0) {
if (status.mapShuffleLocations == null) {
splitsByAddress.getOrElseUpdate(Option.empty, ListBuffer()) +=
if (status.mapShuffleLocations == null
|| status.mapShuffleLocations.getLocationsForBlock(part).isEmpty) {
splitsByAddress.getOrElseUpdate(Seq.empty, ListBuffer()) +=
((ShuffleBlockId(shuffleId, mapId, part), size))
} else {
val shuffleLoc = status.mapShuffleLocations.getLocationForBlock(part)
splitsByAddress.getOrElseUpdate(Option.apply(shuffleLoc), ListBuffer()) +=
val shuffleLocations = status.mapShuffleLocations.getLocationsForBlock(part)
splitsByAddress.getOrElseUpdate(shuffleLocations.asScala, ListBuffer()) +=
((ShuffleBlockId(shuffleId, mapId, part), size))
}
}
Expand Down
3 changes: 3 additions & 0 deletions core/src/main/scala/org/apache/spark/SparkContext.scala
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,9 @@ class SparkContext(config: SparkConf) extends SafeLogging {
_dagScheduler = ds
}

private[spark] def shuffleDriverComponents: ShuffleDriverComponents =
_shuffleDriverComponents

/**
* A unique identifier for the Spark application.
* Its format depends on the scheduler implementation.
Expand Down
5 changes: 3 additions & 2 deletions core/src/main/scala/org/apache/spark/TaskEndReason.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package org.apache.spark
import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.shuffle.ShuffleLocation
import org.apache.spark.internal.Logging
import org.apache.spark.scheduler.AccumulableInfo
import org.apache.spark.storage.BlockManagerId
Expand Down Expand Up @@ -81,14 +82,14 @@ case object Resubmitted extends TaskFailedReason {
*/
@DeveloperApi
case class FetchFailed(
bmAddress: BlockManagerId, // Note that bmAddress can be null
shuffleLocation: Seq[ShuffleLocation], // Note that shuffleLocation cannot be null
shuffleId: Int,
mapId: Int,
reduceId: Int,
message: String)
extends TaskFailedReason {
override def toErrorString: String = {
val bmAddressString = if (bmAddress == null) "null" else bmAddress.toString
val bmAddressString = if (shuffleLocation == null) "null" else shuffleLocation.toString
s"FetchFailed($bmAddressString, shuffleId=$shuffleId, mapId=$mapId, reduceId=$reduceId, " +
s"message=\n$message\n)"
}
Expand Down
Loading