Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add memory free and per process memory usage to nvidia_smi #5796

Merged
merged 6 commits on Apr 5, 2019
Merged
Diff settings

Always

Just for now

@@ -26,21 +26,23 @@
GPU_UTIL = 'gpu_utilization'
MEM_UTIL = 'mem_utilization'
ENCODER_UTIL = 'encoder_utilization'
MEM_ALLOCATED = 'mem_allocated'
MEM_USAGE = 'mem_usage'
TEMPERATURE = 'temperature'
CLOCKS = 'clocks'
POWER = 'power'
PROCESSES_MEM = 'processes_mem'

ORDER = [
PCI_BANDWIDTH,
FAN_SPEED,
GPU_UTIL,
MEM_UTIL,
ENCODER_UTIL,
MEM_ALLOCATED,
MEM_USAGE,
TEMPERATURE,
CLOCKS,
POWER,
PROCESSES_MEM,
]


@@ -80,10 +82,11 @@ def gpu_charts(gpu):
['decoder_util', 'decoder'],
]
},
MEM_ALLOCATED: {
'options': [None, 'Memory Allocated', 'MiB', fam, 'nvidia_smi.memory_allocated', 'line'],
MEM_USAGE: {
'options': [None, 'Memory Usage', 'MiB', fam, 'nvidia_smi.memory_allocated', 'stacked'],
'lines': [
['fb_memory_usage', 'used'],
['fb_memory_free', 'free'],
['fb_memory_used', 'used'],
]
},
TEMPERATURE: {
@@ -107,6 +110,10 @@ def gpu_charts(gpu):
['power_draw', 'power', 1, 100],
]
},
PROCESSES_MEM: {
'options': [None, 'Memory Used by Each Process', 'MiB', fam, 'nvidia_smi.processes_mem', 'stacked'],
'lines': []
},
}

idx = gpu.num
@@ -260,9 +267,13 @@ def decoder_util(self):
return self.root.find('utilization').find('decoder_util').text.split()[0]

@handle_attr_error
def fb_memory_used(self):
    # Used framebuffer memory, numeric part only ("1234 MiB" -> "1234").
    # NOTE(review): scraped diff carried both the old name (fb_memory_usage)
    # and this renamed def; callers ('fb_memory_used' chart line and data())
    # use the new name, so the renamed method is the surviving one.
    return self.root.find('fb_memory_usage').find('used').text.split()[0]

@handle_attr_error
def fb_memory_free(self):
    # Free framebuffer memory, numeric part only ("1234 MiB" -> "1234").
    # Missing XML nodes raise AttributeError, absorbed by the decorator.
    mem_node = self.root.find('fb_memory_usage')
    return mem_node.find('free').text.split()[0]

@handle_attr_error
def temperature(self):
    # GPU core temperature, numeric part only (e.g. "47 C" -> "47").
    # Missing XML nodes raise AttributeError, absorbed by the decorator.
    temp_node = self.root.find('temperature')
    return temp_node.find('gpu_temp').text.split()[0]
@@ -288,6 +299,18 @@ def mem_clock(self):
def power_draw(self):
return float(self.root.find('power_readings').find('power_draw').text.split()[0]) * 100

@handle_attr_error
def processes(self):
    # One dict per process running on this GPU: pid, name and used GPU
    # memory in MiB (XML value like "123 MiB" -> 123).
    return [
        {
            'pid': node.find('pid').text,
            'process_name': node.find('process_name').text,
            'used_memory': int(node.find('used_memory').text.split()[0]),
        }
        for node in self.root.find('processes').findall('process_info')
    ]

def data(self):
data = {
'rx_util': self.rx_util(),
@@ -297,20 +320,22 @@ def data(self):
'memory_util': self.memory_util(),
'encoder_util': self.encoder_util(),
'decoder_util': self.decoder_util(),
'fb_memory_usage': self.fb_memory_usage(),
'fb_memory_used': self.fb_memory_used(),
'fb_memory_free': self.fb_memory_free(),
'gpu_temp': self.temperature(),
'graphics_clock': self.graphics_clock(),
'video_clock': self.video_clock(),
'sm_clock': self.sm_clock(),
'mem_clock': self.mem_clock(),
'power_draw': self.power_draw(),
}
processes = self.processes() or []
data.update({'process_mem_{0}'.format(p['pid']): p['used_memory'] for p in processes})

return dict(
('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items() if v is not None and v != BAD_VALUE
)


class Service(SimpleService):
def __init__(self, configuration=None, name=None):
super(Service, self).__init__(configuration=configuration, name=name)
@@ -320,22 +345,44 @@ def __init__(self, configuration=None, name=None):
self.poller = NvidiaSMIPoller(poll)

def get_data(self):
    """Collect one round of metrics from the nvidia-smi poller.

    Returns a flat dict of 'gpu<N>_<metric>' values, or None when the
    poller is down, produced no output, or the XML failed to parse.
    """
    if not self.poller.is_started():
        self.poller.start()

    if not self.poller.is_alive():
        self.debug('poller is off')
        return None

    last_data = self.poller.data()
    if not last_data:
        return None

    parsed = self.parse_xml(last_data)
    if parsed is None:
        return None

    data = dict()
    for idx, root in enumerate(parsed.findall('gpu')):
        # One GPU instance serves both the metric collection and the
        # dynamic per-process chart update (the stale duplicate
        # data.update(GPU(idx, root).data()) call is removed — it built
        # a second GPU object and applied its metrics twice).
        gpu = GPU(idx, root)
        data.update(gpu.data())
        self.update_processes_mem_chart(gpu)

    return data or None

def update_processes_mem_chart(self, gpu):
    """Sync the per-process memory chart with the GPU's running processes.

    Adds a dimension for every new pid and obsoletes dimensions whose
    process has exited (kept visible so history remains on the chart).

    :param gpu: GPU wrapper exposing num and processes().
    """
    ps = gpu.processes()
    if not ps:
        return
    chart = self.charts['gpu{0}_{1}'.format(gpu.num, PROCESSES_MEM)]
    active_dim_ids = set()
    for p in ps:
        dim_id = 'gpu{0}_process_mem_{1}'.format(gpu.num, p['pid'])
        active_dim_ids.add(dim_id)
        if dim_id not in chart:
            chart.add_dimension([dim_id, '{0} {1}'.format(p['pid'], p['process_name'])])
    # Collect stale ids first: del_dimension() removes entries from the
    # same list the chart iterates over, so deleting inside the loop can
    # silently skip dimensions.
    stale_ids = [dim.id for dim in chart if dim.id not in active_dim_ids]
    for dim_id in stale_ids:
        chart.del_dimension(dim_id, hide=False)

def check(self):
if not self.poller.has_smi():
self.error("couldn't find '{0}' binary".format(NVIDIA_SMI))
@@ -355,15 +402,14 @@ def check(self):
return False

self.create_charts(gpus)
self.poller.start()

return True

def parse_xml(self, data):
    """Parse nvidia-smi XML output.

    :param data: raw XML string produced by `nvidia-smi -x -q`.
    :return: root Element on success, None on parse failure.
    """
    try:
        return et.fromstring(data)
    except et.ParseError as error:
        # Log the offending payload too, so truncated or garbled poller
        # output can be diagnosed from the log alone. (The scraped diff
        # kept both the old bare self.error(error) call and this one;
        # only the informative post-change call survives.)
        self.error('xml parse failed: "{0}", error: {1}'.format(data, error))

    return None

@@ -200,12 +200,13 @@ def add_dimension(self, dimension):
self.dimensions.append(dim)
return dim

def del_dimension(self, dimension_id, hide=True):
    """Mark a dimension obsolete and remove it from the chart.

    :param dimension_id: id of the dimension to drop (no-op if absent).
    :param hide: when True (the default, preserving the old behavior)
        also flag the dimension hidden so the frontend stops showing its
        historical values; pass False to keep history visible.
    """
    if dimension_id not in self:
        return
    idx = self.dimensions.index(dimension_id)
    dimension = self.dimensions[idx]
    if hide:
        dimension.params['hidden'] = 'hidden'
    dimension.params['obsolete'] = 'obsolete'
    # Push the updated chart definition before dropping the dimension
    # from our bookkeeping list.
    self.create()
    self.dimensions.remove(dimension)
ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.