Skip to content

Commit

Permalink
server API, better slurm job control
Browse files Browse the repository at this point in the history
  • Loading branch information
louisponet committed Dec 17, 2018
1 parent 69aa25c commit 0c30e3b
Show file tree
Hide file tree
Showing 6 changed files with 98 additions and 54 deletions.
6 changes: 6 additions & 0 deletions src/API.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,9 @@ export searchinput, searchinputs, setcutoffs!, setname!

#Interacting with the Structure inside DFJob
export atom, atoms, setatoms!, setpseudos!, projections, setprojections!

include("serverAPI.jl")
export qstat, watch_qstat

#Slurm interactions
export slurm_history_jobdir, slurm_jobid, slurm_isrunning
7 changes: 2 additions & 5 deletions src/DFControl.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,18 @@ module DFControl
include("input.jl")
include("utils.jl")
export yesterday, lastweek, lastmonth

include("job.jl")

export DFJob, Exec, DFInput
include("server.jl")
include("API.jl")

include("constants.jl")

include("fileio.jl")
include("plotting.jl")

include("server.jl")
export qstat
export slurm_history_jobdir

include("defaults.jl")
export setdefault_pseudodir
export setdefault_server
Expand Down
3 changes: 3 additions & 0 deletions src/job.jl
Original file line number Diff line number Diff line change
Expand Up @@ -202,3 +202,6 @@ end

"Finds the input corresponding to the name and returns the full output path."
outpath(job::DFJob, n::String) = outpath(input(job,n))

"""
    runslocal_assert(job::DFJob)

Guard used by functions that only make sense for jobs executed on the local
machine. Raises an `AssertionError` when `runslocal(job)` is false.
"""
function runslocal_assert(job::DFJob)
    @assert runslocal(job) "This only works if the job runs on `localhost`."
end
18 changes: 0 additions & 18 deletions src/jobAPI.jl
Original file line number Diff line number Diff line change
Expand Up @@ -89,24 +89,6 @@ function setheaderword!(job::DFJob, word::String, new_word::String; print=true)
end
return job
end

"""
    isrunning(job::DFJob)

Return `true` when the queue reports the job as queued (`"Q"`) or running (`"R"`),
and `false` otherwise (including when the queue returns no information at all).

The job id is read from `job.metadata[:slurmid]`; an error is raised when that
entry is missing. The query is `qstat -f <id>`, executed locally when
`runslocal(job)` holds and through `sshreadstring` on `job.server` otherwise.
"""
function isrunning(job::DFJob)
    haskey(job.metadata, :slurmid) || error("No slurmid found for job $(job.name)")
    cmd = `qstat -f $(job.metadata[:slurmid])`
    str = runslocal(job) ? read(cmd, String) : sshreadstring(job.server, cmd)

    fields = split(str)
    idx = findfirst(s -> s == "job_state", fields)
    # The state is read two tokens after the key, i.e. the output is assumed to
    # look like `job_state = R`. Missing key or truncated output means "not running".
    if idx === nothing || idx + 2 > length(fields)
        return false
    end
    return fields[idx + 2] in ("Q", "R")
end

function progressreport(job::DFJob; kwargs...)
dat = outputdata(job; kwargs...)
plotdat = SymAnyDict(:fermi=>0.0)
Expand Down
40 changes: 9 additions & 31 deletions src/server.jl
Original file line number Diff line number Diff line change
Expand Up @@ -115,25 +115,17 @@ end

# Executes `cmd` on `server` through `ssh -t` and returns whatever `run` returns.
function sshcmd(server, cmd)
    return run(`ssh -t $server $cmd`)
end

# Executes `cmd` on `server` through `ssh -t` and returns the captured output as a `String`.
function sshreadstring(server, cmd)
    return read(`ssh -t $server $cmd`, String)
end
"""
qstat(server)
If sbatch is running on server it will return the output of the `qstat` command.
"""
qstat(server) = server=="localhost" ? run(`qstat`) : sshcmd(server, "qstat")
qstat() = qstat(getdefault_server())

"""
watch_qstat(server)
If sbatch is running on server it will return the output of the `watch qstat` command.
"""
watch_qstat(server) = server=="localhost" ? run(`watch qstat`) : sshcmd(server, "watch qstat")
watch_qstat() = watch_qstat(getdefault_server())

"Deletes the job from the local or server queue."
qdel(server::String, id::Int) = sshcmd(server, "qdel $id")
qdel(id::Int) = run(`qdel $id`)
qdel(job::DFJob) = runslocal(job) ? qdel(job.metadata[:slurmid]) : qdel(job.server, job.metadata[:slurmid])
function qdel(job::DFJob)
if runslocal(job)
qdel(slurm_jobid(job))
else
qdel(job.server, slurm_jobid(job))
end
end

function qsub(job::DFJob)
outstr = ""
Expand Down Expand Up @@ -205,19 +197,5 @@ function push(job::DFJob)
scp("job.tt")
end

"""
slurm_history_jobdir(stardate=yesterday())
Returns the unique job directories of the jobs that ran since the `startdate`.
Startdate should be printed in following format: yyyy-mm-dd.
"""
function slurm_history_jobdir(startdate=yesterday()) #format of startdate = yyyy-mm-dd
history = strip.(reverse(split(read(`sacct --starttime $startdate --format=Workdir%100`, String), "\n")))
output = String[]
for h in history
if h output && ispath(h)
push!(output, h)
end
end
return reverse(output)
end
# Reads all lines produced by `cmd`, reverses them so that the last line comes
# first, drops the two lines that end up last (the header lines of the listing)
# and strips surrounding whitespace from what remains.
function slurm_process_command(cmd)
    reversed = reverse(readlines(cmd))
    return map(strip, reversed[1:end-2])
end
78 changes: 78 additions & 0 deletions src/serverAPI.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""
qstat(server)
If sbatch is running on server it will return the output of the `qstat` command.
"""
qstat(server) = server=="localhost" ? run(`qstat`) : sshcmd(server, "qstat")
qstat() = qstat(getdefault_server())

"""
watch_qstat(server)
If sbatch is running on server it will return the output of the `watch qstat` command.
"""
watch_qstat(server) = server=="localhost" ? run(`watch qstat`) : sshcmd(server, "watch qstat")
watch_qstat() = watch_qstat(getdefault_server())

#--------------- Slurm Interactions -------------------#
"""
slurm_history_jobdir(stardate=yesterday())
Returns the unique job directories of the jobs that ran since the `startdate`.
Startdate should be printed in following format: yyyy-mm-dd.
"""
function slurm_history_jobdir(startdate=yesterday()) #format of startdate = yyyy-mm-dd
runslocal_assert(job)
history = slurm_process_command(`sacct --starttime $startdate --format=Workdir%100`)
output = String[]
for h in history
if h output && ispath(h)
push!(output, h)
end
end
return reverse(output)
end

"""
slurm_jobid(job::DFJob, startdate=yesterday())
Looks through the jobs since the `startdate` and returns the job ID if found.
Returns -1 if the jobID was not found in the list of jobs since `startdate`.
"""
function slurm_jobid(job::DFJob, startdate=yesterday())
runslocal_assert(job)
if haskey(job.metadata, :slurmid)
return job.metadata[:slurmid]
end
id_dir = split.(slurm_process_command(`sacct --starttime $startdate --format=JobID,Workdir%100`))
id_ = -1
for (id, dir) in id_dir
if dir == job.local_dir
id_ = parse(Int, id)
break
end
end
if id_ == -1
@info "Job in directory $(job.local_dir) was not found in the slurm jobs since
$startdate"
end
return id
end

"""
slurm_isrunning(job::DFJob)
Returns whether the job is running.
"""
function slurm_isrunning(job::DFJob)
runslocal_assert(job)
id = slurm_jobid(job)
if id != -1
if slurm_process_command(`sacct -j $id --format=state`)[1] == "RUNNING"
return true
else
return false
end
else
return false
end
end

0 comments on commit 0c30e3b

Please sign in to comment.