In [9]:
import prometheus_client 
from prometheus_client import start_http_server, Summary, Histogram, Info
from prometheus_client.parser import text_string_to_metric_families 
from prometheus_client.core import GaugeMetricFamily, CounterMetricFamily, REGISTRY
from prometheus_client import GC_COLLECTOR, PLATFORM_COLLECTOR, PROCESS_COLLECTOR
import random
import time
import docker_gpu_exporter

In [10]:
# Drop the collectors that prometheus_client registers by default so the
# exposition only contains the metrics defined in this notebook.
for _default_collector in (GC_COLLECTOR, PLATFORM_COLLECTOR, PROCESS_COLLECTOR):
    REGISTRY.unregister(_default_collector)

In [5]:
class CustomCollector(object):
    """Prometheus custom collector exposing per-container GPU metrics.

    The collector parses a plain-text report made of blank-line separated
    sections, each carrying ``PID:``, ``CONTAINER_NAME:``, ``GPU util:`` and
    ``GPU usage:`` lines, and turns every (container, GPU) pair into one
    sample per metric family.
    """

    # Labels a report section must carry to be turned into records.
    _REQUIRED_FIELDS = ("PID", "CONTAINER_NAME", "GPU util", "GPU usage")

    def __init__(self):
        # Template record; parse_bash_results() copies it for every
        # (container, GPU) pair and fills in the parsed values.
        self.results_dict = {"docker_container_running_gpu_pid": 0,
                "docker_container_name": "",
                "docker_container_used_gpu_id": 0,
                "docker_container_utilization_gpu_percent": 0,
                "docker_container_gpu_memory_used_MiB": 0,
                "docker_container_total_gpu_used": 0,
               }
        # Hard-coded sample report. collect() parses this text; nothing is
        # executed here. NOTE(review): presumably a stand-in for the output
        # of run_bash_script() -- confirm before deploying.
        self.runnning_process = """PID: 423900
                            CONTAINER_NAME: ocr_containter_g03
                            GPU util: 0 423900 96 44 3 423900 92 44
                            GPU usage: 35383 MiB 35071 MiB


                            PID: 1572603
                            CONTAINER_NAME: kafka_spaces_asr_diarization_container
                            GPU util: 0 1572603 - - 1 1572603 - -
                            GPU usage: 2307 MiB 7963 MiB


                            PID: 377944
                            CONTAINER_NAME: asr_sp_arsa_dev_faris_v3
                            GPU util: 1 377944 - -
                            GPU usage: 8001 MiB


                            PID: 2567679
                            CONTAINER_NAME: nemo_faris3
                            GPU util: 1 2567679 0 0
                            GPU usage: 2771 MiB


                            PID: 641061
                            CONTAINER_NAME: mfdr_containter_g3
                            GPU util: 3 641061 - -
                            GPU usage: 2193 MiB



                            """

    def run_bash_script(self):
        """Return whatever ``docker_gpu_exporter.get_running_process()`` returns.

        NOTE(review): another cell of this notebook calls
        ``get_runnning_process`` (three n's); confirm which spelling the
        module actually exports.
        """
        return docker_gpu_exporter.get_running_process()

    def split_list(self, list_a, chunk_size):
        """Split ``list_a`` into consecutive chunks of at most ``chunk_size`` items.

        Args:
            list_a: Any sliceable sequence.
            chunk_size: Positive chunk length.

        Returns:
            A list of slices of ``list_a``; the last one may be shorter.
        """
        return [list_a[start:start + chunk_size]
                for start in range(0, len(list_a), chunk_size)]

    @staticmethod
    def _iter_sections(report_text):
        """Yield the non-blank, stripped lines of each blank-line separated section."""
        section = []
        for raw_line in report_text.splitlines():
            line = raw_line.strip()
            if line:
                section.append(line)
            elif section:
                yield section
                section = []
        if section:
            yield section

    def parse_bash_results(self, runnning_process):
        """Parse the text report into one record per (container, GPU) pair.

        Args:
            runnning_process: Report text made of blank-line separated
                sections with ``PID:``, ``CONTAINER_NAME:``, ``GPU util:``
                and ``GPU usage:`` lines. ``GPU util`` holds groups of four
                whitespace separated tokens per GPU and ``GPU usage`` holds
                ``<value> MiB`` pairs, one per GPU.

        Returns:
            A list of dicts shaped like ``self.results_dict`` (values are
            strings) covering every container in the report. The list is
            empty when the report has no usable section. Sections missing
            one of the four fields are skipped.
        """
        # The accumulator must live outside the per-container loop; resetting
        # it per container made only the last container's records survive.
        multi_gpu_result_list = []

        for section in self._iter_sections(runnning_process):
            fields = {}
            for line in section:
                label, _, value = line.partition(":")
                # First occurrence of a label wins.
                fields.setdefault(label.strip(), value.strip())
            if any(name not in fields for name in self._REQUIRED_FIELDS):
                continue

            container_gpu_pid = fields["PID"]
            container_name = fields["CONTAINER_NAME"]

            # One 4-token group per GPU; incomplete trailing groups are ignored.
            util_groups = [group
                           for group in self.split_list(fields["GPU util"].split(), 4)
                           if len(group) == 4]
            # One [value, unit] pair per GPU.
            usage_groups = self.split_list(fields["GPU usage"].split(), 2)
            docker_container_total_gpu_used = len(util_groups)

            for util_group, usage_group in zip(util_groups, usage_groups):
                gpu_id = util_group[0]
                # NOTE(review): the 4th token is reported as the utilisation
                # percentage; confirm the column order of the report.
                gpu_util = util_group[3]
                gpu_usage = usage_group[0]

                print("container name: ", container_name)
                metrics_resutls = self.results_dict.copy()
                metrics_resutls["docker_container_running_gpu_pid"] = container_gpu_pid
                metrics_resutls["docker_container_name"] = container_name
                metrics_resutls["docker_container_used_gpu_id"] = gpu_id
                # "-" in the report means no reading; expose it as 0.
                metrics_resutls["docker_container_utilization_gpu_percent"] = "0" if gpu_util == "-" else gpu_util
                metrics_resutls["docker_container_gpu_memory_used_MiB"] = gpu_usage
                metrics_resutls["docker_container_total_gpu_used"] = str(docker_container_total_gpu_used)
                multi_gpu_result_list.append(metrics_resutls)

        return multi_gpu_result_list

    def collect(self):
        """Yield the metric families for the current report.

        Each family is created once and receives one sample per parsed
        record, labelled with ``container_name`` and ``gpu``. Building a new
        family per record would repeat the same metric name within a scrape.
        """
        labels = ["container_name", "gpu"]

        results_dict_list = self.parse_bash_results(self.runnning_process)

        gauge_pid = GaugeMetricFamily('docker_container_running_gpu_pid', 'What pid is the gpu container', labels=labels)
        gauge_name = GaugeMetricFamily('docker_container_name', 'Container name', labels=labels)
        gauge_gpu_id = GaugeMetricFamily('docker_container_used_gpu_id', 'Container used gpu', labels=labels)
        gauge_util = GaugeMetricFamily('docker_container_utilization_gpu_percent', 'Help text', labels=labels)
        gauge_usage = GaugeMetricFamily('docker_container_gpu_memory_used_MiB', 'Help text', labels=labels)
        # NOTE(review): a GPU count can decrease, so a gauge would presumably
        # fit better; kept as a counter so the exposed series name is unchanged.
        counter_gpu = CounterMetricFamily('docker_container_total_gpu_used', 'Help text', labels=labels)

        for result_dict in results_dict_list:
            label_values = [str(result_dict["docker_container_name"]),
                            str(result_dict["docker_container_used_gpu_id"])]

            # Parsed values are strings; hand numeric samples to the client.
            gauge_pid.add_metric(label_values, value=float(result_dict['docker_container_running_gpu_pid']))
            # Info-style series: the container name lives in the label.
            gauge_name.add_metric(label_values, value=1)
            # The GPU id goes to its own family, not to gauge_name.
            gauge_gpu_id.add_metric(label_values, value=float(result_dict['docker_container_used_gpu_id']))
            gauge_util.add_metric(label_values, value=float(result_dict['docker_container_utilization_gpu_percent']))
            gauge_usage.add_metric(label_values, value=float(result_dict['docker_container_gpu_memory_used_MiB']))
            counter_gpu.add_metric(label_values, value=float(result_dict['docker_container_total_gpu_used']))

        yield gauge_pid
        yield gauge_name
        yield gauge_gpu_id
        yield gauge_util
        yield gauge_usage
        yield counter_gpu
# Register a collector instance with the default registry. The recorded
# output of this cell shows the "container name:" prints from
# parse_bash_results(), i.e. registering already runs one collect() pass.
REGISTRY.register(CustomCollector())

container name:  ocr_containter_g03
container name:  ocr_containter_g03
container name:  kafka_spaces_asr_diarization_container
container name:  kafka_spaces_asr_diarization_container
container name:  asr_sp_arsa_dev_faris_v3
container name:  nemo_faris3
container name:  mfdr_containter_g3


In [None]:
# Round-trip check: render the default registry in the text exposition format
# and parse it back. text_string_to_metric_families() requires the text to
# parse as its argument; calling it with no argument raises TypeError.
for family in text_string_to_metric_families(
        prometheus_client.generate_latest(REGISTRY).decode("utf-8")):
    for sample in family.samples:
        # A sample unpacks as (name, labels, value, ...); print the first three.
        print("{0}{1} {2}".format(*sample))

In [None]:
if __name__ == "__main__":
    # Port on which the metrics HTTP endpoint is started.
    port = 10046
    start_http_server(port)
    # No collector is registered here; this relies on a registration done in
    # another cell.
    while True:
        # Keep the cell running; this loop never exits on its own.
        time.sleep(10)


In [None]:
# NOTE(review): `g` is not defined in any visible cell of this notebook, so
# this presumably raises NameError on a fresh kernel -- confirm or remove.
g.name

In [None]:
# Bare display of the record template (same keys and defaults as
# CustomCollector.results_dict); the value is not bound to a name.
{"docker_container_running_gpu_pid": 0,
                "docker_container_name": "",
                "docker_container_used_gpu_id": 0,
                "docker_container_utilization_gpu_percent": 0,
                "docker_container_gpu_memory_used_MiB": 0,
                "docker_container_total_gpu_used": 0,
               }

In [None]:
# NOTE(review): the class-definition cell already registers a CustomCollector.
# Registering a second instance that exposes the same metric names presumably
# raises ValueError (duplicated timeseries) -- confirm, or drop this cell.
REGISTRY.register(CustomCollector())

In [None]:
# Create a metric to track time spent and requests made.
# generate_latest() returns the rendered exposition as bytes, which has no
# .time() decorator; the timing decorator needs a Summary instance.
REQUEST_TIME = Summary('Nawaf', 'test')


# Decorate function with metric.
@REQUEST_TIME.time()
def process_request(t):
    """Sleep for ``t`` seconds so the decorator has a duration to record."""
    time.sleep(t)


if __name__ == '__main__':
    # Start up the server to expose the metrics.
    start_http_server(10055)
    # Generate a single request (the original loop broke after one pass).
    process_request(random.random())

In [None]:
# Show the first metric family yielded by the default registry's collect().
next(REGISTRY.collect())

In [None]:
# Same check as the previous cell: REGISTRY.collect() is called afresh, so
# this again shows the first family it yields.
next(REGISTRY.collect())

In [None]:
# Instantiate a Summary without keeping a reference to it.
# NOTE(review): the metric is named 'my_gauge' but its type is Summary.
Summary('my_gauge', 'Help text')

In [None]:
# Histogram demo: record one observation of 4.7.
h = Histogram('request_latency_seconds', 'Description of histogram')
# The dict is passed as observe()'s second positional argument.
# NOTE(review): presumably an exemplar -- confirm against the client docs.
h.observe(4.7, {'trace_id': 'abc123'})

In [None]:
def server_status_metric(cpu_usage):
    """Yield a ``SERVER_STATUS`` gauge family carrying one CPU usage sample.

    The original snippet used ``yield`` at cell level (a syntax error outside
    a function) and read an undefined ``cpu_usage``, so it is wrapped in a
    generator that takes the value as a parameter.

    Args:
        cpu_usage: Numeric value stored for the ``value="cpu_usage"`` sample.

    Yields:
        The populated ``GaugeMetricFamily``.
    """
    # labels must be a list of label names; a bare string would be iterated
    # character by character.
    value1 = GaugeMetricFamily("SERVER_STATUS", 'Help text', labels=['value'])
    value1.add_metric(["cpu_usage"], cpu_usage)
    yield value1

In [None]:
if __name__ == '__main__':
    # Port where the metrics are exposed.
    start_http_server(10055)
    REGISTRY.register(CustomCollector())
    while True:
        # Idle in 10-second steps to keep the cell running (the previous
        # comment said 30s, which did not match the sleep value).
        time.sleep(10)

In [None]:
import docker_gpu_exporter

In [None]:
# Fetch the report from the exporter module. This rebinds `h`, which an
# earlier cell used for a Histogram.
# NOTE(review): CustomCollector.run_bash_script() calls `get_running_process`
# (two n's); confirm which spelling docker_gpu_exporter actually defines.
h = docker_gpu_exporter.get_runnning_process()

In [None]:
# Sample report used by the scratch parsing cells: one section per container,
# separated by blank lines, each with PID / CONTAINER_NAME / GPU util /
# GPU usage lines.
runnning_process = """PID: 423900
CONTAINER_NAME: ocr_containter_g03
GPU util: 0 423900 96 44 3 423900 92 44
GPU usage: 35383 MiB 35071 MiB


PID: 1572603
CONTAINER_NAME: kafka_spaces_asr_diarization_container
GPU util: 0 1572603 - - 1 1572603 - -
GPU usage: 2307 MiB 7963 MiB


PID: 377944
CONTAINER_NAME: asr_sp_arsa_dev_faris_v3
GPU util: 1 377944 - -
GPU usage: 8001 MiB


PID: 2567679
CONTAINER_NAME: nemo_faris3
GPU util: 1 2567679 0 0
GPU usage: 2771 MiB


PID: 641061
CONTAINER_NAME: mfdr_containter_g3
GPU util: 3 641061 - -
GPU usage: 2193 MiB



"""

In [None]:
def split_list(list_a, chunk_size):
    """Cut ``list_a`` into consecutive slices of at most ``chunk_size`` items.

    The final slice is shorter when ``len(list_a)`` is not a multiple of
    ``chunk_size``; an empty input gives an empty list.
    """
    starts = range(0, len(list_a), chunk_size)
    return [list_a[start:start + chunk_size] for start in starts]

In [None]:
# Template record for one (container, GPU) pair; the scratch parsing loop
# copies it and overwrites every field.
results_dict = {
    "docker_container_running_gpu_pid": 0,
    "docker_container_name": "",
    "docker_container_used_gpu_id": 0,
    "docker_container_utilization_gpu_percent": 0,
    "docker_container_gpu_memory_used_MiB": 0,
    "docker_container_total_gpu_used": 0,
}

In [None]:
# Scratch walk-through of the parsing logic on the sample report. Only the
# first non-empty section is processed (note the trailing ``break``), leaving
# the intermediate variables available for inspection in later cells.
for container in runnning_process.split("\n\n"):
    # Skip empty or whitespace-only chunks produced by runs of blank lines.
    if not container.strip():
        continue
    container_gpu_pid = container.split('PID: ')[1].split("\n")[0]
    container_name = container.split('CONTAINER_NAME: ')[1].split("\n")[0]
    container_gpu_util = container.split('GPU util: ')[1].split("\n")[0].split(' ')
    container_gpu_usage = container.split('GPU usage: ')[1].split("\n")[0].split(' ')

    # More than one 4-token group means the container runs on several GPUs.
    multi_gpu = len(container_gpu_util) > 4
    if multi_gpu:
        container_gpu_util = split_list(container_gpu_util, 4)
        container_gpu_usage = split_list(container_gpu_usage, 2)

        container_gpu_ids = list(list(zip(*container_gpu_util))[0])
        container_util_per_gpu = list(list(zip(*container_gpu_util))[3])
        container_usage_per_gpu = list(list(zip(*container_gpu_usage))[0])
        docker_container_total_gpu_used = len(container_gpu_util)
    else:
        # Wrap the single values in lists so the record loop below handles
        # one GPU the same way as several (zipping bare strings would pair
        # up their characters).
        container_gpu_ids = [container_gpu_util[0]]
        container_util_per_gpu = [container_gpu_util[3]]
        container_usage_per_gpu = [container_gpu_usage[0]]
        docker_container_total_gpu_used = len(container_gpu_util)//4

    for gpu_id, gpu_util, gpu_usage in zip(container_gpu_ids, container_util_per_gpu, container_usage_per_gpu):
        metrics_resutls = results_dict.copy()
        # Store the process id here, not the GPU id.
        metrics_resutls["docker_container_running_gpu_pid"] = container_gpu_pid
        metrics_resutls["docker_container_name"] = container_name

        metrics_resutls["docker_container_used_gpu_id"] = gpu_id
        metrics_resutls["docker_container_utilization_gpu_percent"] = gpu_util
        metrics_resutls["docker_container_gpu_memory_used_MiB"] = gpu_usage
        metrics_resutls["docker_container_total_gpu_used"] = docker_container_total_gpu_used
    break

In [None]:
# Inspect the last record built by the scratch parsing loop.
metrics_resutls

In [None]:
# Inspect the PID text extracted for the section the scratch loop processed.
container_gpu_pid

In [None]:
# Inspect the container name extracted for the processed section.
container_name

In [None]:
# Inspect the "GPU usage" tokens of the processed section (chunked in pairs
# when the section lists more than one GPU).
container_gpu_usage

In [None]:
# Inspect the "GPU util" tokens of the processed section (chunked in groups
# of four when the section lists more than one GPU).
container_gpu_util

In [None]:
# Inspect the 4th "GPU util" token taken per GPU by the scratch loop.
container_util_per_gpu

In [None]:
# Inspect the 1st "GPU util" token taken per GPU by the scratch loop.
container_gpu_ids

In [None]:
# Repeat display of the "GPU usage" tokens (same variable as an earlier cell).
container_gpu_usage

In [None]:
# Inspect the first "GPU usage" token taken per GPU by the scratch loop.
container_usage_per_gpu

In [None]:
# Inspect the GPU count computed by the scratch loop for the processed section.
docker_container_total_gpu_used

In [None]:
def one_smaple_result_dict(my_list = [1, 2, 3, 4, 5]):
    """Generator that hands back the items of ``my_list`` one at a time, in order.

    The default list is only read, never modified.
    """
    yield from my_list

In [None]:
# A fresh generator is created on each call, so this shows its first item.
next(one_smaple_result_dict())

In [None]:
# Exhaust a fresh generator, printing each item it yields.
for i in one_smaple_result_dict():
    print(i)

In [52]:
# Shell out to the docker CLI; the recorded output shows bash could not find
# the command in this kernel's environment.
!docker

/bin/bash: docker: command not found
