Permalink
Browse files

Send core request as ppn to qsub for workspace pilot

  • Loading branch information...
1 parent 1339cf6 commit b45067845af4935dd272fec389ce87bed49c95df @oldpatricka oldpatricka committed Feb 22, 2011
@@ -79,12 +79,14 @@ pbs.submit.path=qsub
pbs.delete.path=qdel
-# Processors per node, right now this should be set to be the maximum processors
-# on each cluster node. If it set too high, pilot job submissions will fail.
-# If it is set too low, the pilot may end up not being the only LRM job on the
-# node at a time and that is unpredictable/unsupported right now.
-
-pbs.ppn=2
+# Processors per node. If this is set to 0, your pilot job will request
+# as many processors as are requested for a VM. For example, if a user requests
+# a 2 core VM, ppn will be set to 2.
+#
+# On some installations, you may wish to hardcode this to a specific value
+# to ensure that each pilot job reserves a whole node for a VM. In this case,
+# choose a non-zero value.
+pbs.ppn=0
# If the pilot job should be submitted to a special queue/server, configure
@@ -91,7 +91,7 @@ public ArrayList constructQsub(String destination,
throw new WorkspaceException(err);
}
- if (ppn < 1) {
+ if (ppn < 0) {
final String err = "invalid processors per node " +
"request: " + Integer.toString(ppn);
throw new WorkspaceException(err);
@@ -850,6 +850,7 @@ protected Reservation scheduleImpl(VirtualMachine vm,
}
final int memory = dep.getIndividualPhysicalMemory();
+ final int cores = dep.getIndividualCPUCount();
final int duration = dep.getMinDuration();
// list of associations should be in the DB, perpetuation of
@@ -860,7 +861,7 @@ protected Reservation scheduleImpl(VirtualMachine vm,
assocs = assocStr.split(",");
}
- return this.scheduler.schedule(memory, duration, assocs, numNodes,
+ return this.scheduler.schedule(memory, cores, duration, assocs, numNodes,
groupid, coschedid, vm.isPreemptable(), callerID);
}
@@ -38,6 +38,7 @@
* @see #proceedCoschedule for handling separate requests together
*
* @param memory MB needed
* @param cores CPU cores needed
* @param duration seconds needed
* @param neededAssociations networks needed
* @param numNodes number needed
@@ -49,6 +50,7 @@
* @throws SchedulingException internal problem
*/
public Reservation schedule(int memory,
+ int cores,
int duration,
String[] neededAssociations,
int numNodes,
@@ -224,6 +224,7 @@ public long getSweeperDelay() {
}
public Reservation schedule(int memory,
+ int cores,
int duration,
String[] neededAssociations,
int numNodes,
@@ -263,7 +264,7 @@ public Reservation schedule(int memory,
this.creationPending.pending(ids);
final NodeRequest req =
- new NodeRequest(ids, memory, duration, assocs, groupid, creatorDN);
+ new NodeRequest(ids, memory, cores, duration, assocs, groupid, creatorDN);
try {
@@ -19,6 +19,7 @@
public class NodeRequest {
private int memory; // MBs
+ private int cores;
private int duration; // seconds
private int[] ids = null;
@@ -41,12 +42,14 @@ public NodeRequest(int memory,
public NodeRequest(int[] ids,
int memory,
+ int cores,
int duration,
String[] neededAssociations,
String groupid,
String creatorDN) {
this(memory, duration);
+ this.cores = cores;
this.ids = ids;
this.neededAssociations = neededAssociations;
this.groupid = groupid;
@@ -80,6 +83,18 @@ public int getNumNodes() {
return this.ids.length;
}
+ public int getCores() {
+ // Java initializes int fields to 0, so a cores value that was never set defaults to 1
+ if (this.cores == 0) {
+ return 1;
+ }
+ return this.cores;
+ }
+
+ public void setCores(int cores) {
+ this.cores = cores;
+ }
+
public int getMemory() {
return this.memory;
}
@@ -369,8 +369,8 @@ public synchronized void validate() throws Exception {
"Is the configuration present?");
}
- if (this.ppn < 1) {
- throw new Exception("processors per node (ppn) is less than one, " +
+ if (this.ppn < 0) {
+ throw new Exception("processors per node (ppn) is less than zero, " +
"invalid. Is the configuration present?");
}
@@ -492,6 +492,7 @@ public Reservation reserveSpace(NodeRequest request, boolean preemptable)
this.reserveSpace(request.getIds(),
request.getMemory(),
+ request.getCores(),
request.getDuration(),
request.getGroupid(),
request.getCreatorDN());
@@ -520,6 +521,7 @@ public Reservation reserveCoscheduledSpace(NodeRequest[] requests,
// capacity vs. mapping and we will get more sophisticated here later)
int highestMemory = 0;
+ int highestCores = 0;
int highestDuration = 0;
final ArrayList idInts = new ArrayList(64);
@@ -533,6 +535,12 @@ public Reservation reserveCoscheduledSpace(NodeRequest[] requests,
highestMemory = thisMemory;
}
+ final int thisCores = requests[i].getCores();
+
+ if (highestCores < thisCores) {
+ highestCores = thisCores;
+ }
+
final int thisDuration = requests[i].getDuration();
if (highestDuration < thisDuration) {
@@ -563,7 +571,7 @@ public Reservation reserveCoscheduledSpace(NodeRequest[] requests,
// Assume that the creator's DN is the same for each node
final String creatorDN = requests[0].getCreatorDN();
- this.reserveSpace(all_ids, highestMemory, highestDuration, coschedid, creatorDN);
+ this.reserveSpace(all_ids, highestMemory, highestCores, highestDuration, coschedid, creatorDN);
return new Reservation(all_ids, null, all_durations);
}
@@ -579,6 +587,7 @@ public Reservation reserveCoscheduledSpace(NodeRequest[] requests,
* than one VM is mapped to the same node, the returned node
* assignment array will include duplicates.
* @param memory megabytes needed
* @param cores CPU cores needed
* @param duration seconds needed
* @param uuid group ID, can not be null if vmids is length > 1
* @param creatorDN the DN of the user who requested creation of the VM
@@ -587,6 +596,7 @@ public Reservation reserveCoscheduledSpace(NodeRequest[] requests,
*/
private void reserveSpace(final int[] vmids,
final int memory,
+ final int cores,
final int duration,
final String uuid,
final String creatorDN)
@@ -628,13 +638,14 @@ private void reserveSpace(final int[] vmids,
}
}
- this.reserveSpaceImpl(memory, duration, slotid, vmids, creatorDN);
+ this.reserveSpaceImpl(memory, cores, duration, slotid, vmids, creatorDN);
// pilot reports hostname when it starts running, not returning an
// exception to signal successful best effort pending slot
}
private void reserveSpaceImpl(final int memory,
+ final int cores,
final int duration,
final String uuid,
final int[] vmids,
@@ -646,20 +657,32 @@ private void reserveSpaceImpl(final int memory,
final int dur = duration + this.padding;
final long wallTime = duration + this.padding;
+
+ // If the pbs.ppn option in pilot.conf is 0, we should send
+ // the number of CPU cores used by the VM as the ppn string,
+ // otherwise, use the defined ppn value
+ int ppnRequested;
+ if (this.ppn == 0) {
+ ppnRequested = cores;
+ }
+ else {
+ ppnRequested = this.ppn;
+ }
+
// we know it's torque for now, no casing
final ArrayList torquecmd;
try {
torquecmd = this.torque.constructQsub(this.destination,
memory,
vmids.length,
- this.ppn,
+ ppnRequested,
wallTime,
this.extraProperties,
outputFile,
false,
false,
creatorDN);
-
+
} catch (WorkspaceException e) {
final String msg = "Problem with Torque argument construction";
if (logger.isDebugEnabled()) {
@@ -79,12 +79,14 @@ pbs.submit.path=qsub
pbs.delete.path=qdel
-# Processors per node, right now this should be set to be the maximum processors
-# on each cluster node. If it set too high, pilot job submissions will fail.
-# If it is set too low, the pilot may end up not being the only LRM job on the
-# node at a time and that is unpredictable/unsupported right now.
-
-pbs.ppn=2
+# Processors per node. If this is set to 0, your pilot job will request
+# as many processors as are requested for a VM. For example, if a user requests
+# a 2 core VM, ppn will be set to 2.
+#
+# On some installations, you may wish to hardcode this to a specific value
+# to ensure that each pilot job reserves a whole node for a VM. In this case,
+# choose a non-zero value.
+pbs.ppn=0
# If the pilot job should be submitted to a special queue/server, configure
@@ -79,12 +79,14 @@ pbs.submit.path=qsub
pbs.delete.path=qdel
-# Processors per node, right now this should be set to be the maximum processors
-# on each cluster node. If it set too high, pilot job submissions will fail.
-# If it is set too low, the pilot may end up not being the only LRM job on the
-# node at a time and that is unpredictable/unsupported right now.
-
-pbs.ppn=2
+# Processors per node. If this is set to 0, your pilot job will request
+# as many processors as are requested for a VM. For example, if a user requests
+# a 2 core VM, ppn will be set to 2.
+#
+# On some installations, you may wish to hardcode this to a specific value
+# to ensure that each pilot job reserves a whole node for a VM. In this case,
+# choose a non-zero value.
+pbs.ppn=0
# If the pilot job should be submitted to a special queue/server, configure

0 comments on commit b450678

Please sign in to comment.