Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add autoscale check for GPU #573

Merged
merged 15 commits into from
Oct 19, 2022
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
21 changes: 21 additions & 0 deletions backend/src/plugins/kube.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ export default fp(async (fastify: FastifyInstance) => {
fastify.log.error(e, 'Failed to retrieve current namespace');
}

let saToken;
try {
saToken = await getSAToken();
} catch (e) {
fastify.log.error(e, 'Failed to retrieve Service Account token');
}

let clusterID;
try {
const clusterVersion = await customObjectsApi.getClusterCustomObject(
Expand Down Expand Up @@ -67,6 +74,7 @@ export default fp(async (fastify: FastifyInstance) => {
clusterID,
clusterBranding,
rbac,
saToken,
});

// Initialize the watching of resources
Expand All @@ -93,3 +101,16 @@ const getCurrentNamespace = async () => {
}
});
};

/**
 * Reads the Service Account token from the file mounted at
 * `/var/run/secrets/kubernetes.io/serviceaccount/token`.
 *
 * The file is only read when `currentContext` is `'inClusterContext'`. For any
 * other context the function resolves with an empty string, so that callers
 * awaiting it are never left waiting on a promise that cannot settle.
 *
 * @returns The token file contents as a string, or `''` when not running in-cluster.
 * @throws Rejects with the `fs.readFile` error when the token file cannot be read.
 */
const getSAToken = async (): Promise<string> => {
  if (currentContext !== 'inClusterContext') {
    // No token file is read for this context; settle immediately instead of
    // leaving the promise pending forever.
    return '';
  }
  return new Promise<string>((resolve, reject) => {
    fs.readFile('/var/run/secrets/kubernetes.io/serviceaccount/token', (err, data) => {
      if (err) {
        reject(err);
        // Stop here so the failed read is not also passed on to resolve().
        return;
      }
      resolve(String(data));
    });
  });
};
117 changes: 94 additions & 23 deletions backend/src/routes/api/gpu/gpuUtils.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,26 @@
import { KubeFastifyInstance, PrometheusResponse } from '../../../types';
import {
MachineAutoscalerList,
GPUInfo,
KubeFastifyInstance,
PrometheusResponse,
MachineSet,
gpuScale,
} from '../../../types';
import { V1PodList } from '@kubernetes/client-node';
import https from 'https';
import * as fs from 'fs';

export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise<[boolean, number]> => {
/**
 * Storage to prevent heavy calls from being performed for EVERY user.
 *
 * Module-level cache of the last GPU state computed by `getGPUNumber`:
 * - `lastValue` is the most recently computed `GPUInfo`;
 * - `lastFetch` is the `Date.now()` timestamp of that computation (initially 0).
 * `getGPUNumber` returns `lastValue` while `lastFetch` is no more than 30 seconds old.
 */
const storage: { lastFetch: number; lastValue: GPUInfo } = {
lastValue: { available: 0, configured: false, autoscalers: [] },
lastFetch: 0,
};

export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise<GPUInfo> => {
andrewballantyne marked this conversation as resolved.
Show resolved Hide resolved
if (storage.lastFetch >= Date.now() - 30_000) {
fastify.log.info(`Returning cached gpu value (${JSON.stringify(storage)})`);
return storage.lastValue;
}
fastify.log.info(`Computing GPU state`);
let maxGpuNumber = 0;
let areGpusConfigured = false;
const gpuPodList = await fastify.kube.coreV1Api
cfchase marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -15,38 +32,45 @@ export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise<[boole
fastify.log.error(`Exception when calling DCGM exporter pods: ${e}`);
return { items: [] } as V1PodList;
});
const scalingLimit = await getGPUScaling(fastify);
if (gpuPodList.items.length != 0) {
areGpusConfigured = true;
const token = await new Promise<string>((resolve, reject) => {
fs.readFile('/var/run/secrets/kubernetes.io/serviceaccount/token', (err, data) => {
try {
resolve(String(data));
} catch {
reject('');
fastify.log.error(err);
}
});
});
const gpuDataResponses = [];
for (let i = 0; i < gpuPodList.items.length; i++) {
const data = await getGPUData(gpuPodList.items[i].status.podIP, token);
if (data.code === 200) {
const gpuNumber = data.response;
if (gpuNumber > maxGpuNumber) {
maxGpuNumber = gpuNumber;
gpuDataResponses.push(getGPUData(gpuPodList.items[i].status.podIP, fastify.kube.saToken));
}

await Promise.all(gpuDataResponses).then((gpuDataList) => {
for (let i = 0; i < gpuDataList.length; i++) {
if (gpuDataList[i].code === 200) {
const gpuNumber = gpuDataList[i].response;
if (gpuNumber > maxGpuNumber) {
maxGpuNumber = gpuNumber;
}
} else {
fastify.log.warn(`Error getting GPUData ${gpuDataList[i].response}`);
}
} else {
fastify.log.warn(`Error getting GPUData ${data.response}`);
}
}
});
} else if (scalingLimit.length != 0) {
areGpusConfigured = true;
}
return [areGpusConfigured, maxGpuNumber];

const data: GPUInfo = {
configured: areGpusConfigured,
available: maxGpuNumber,
autoscalers: scalingLimit,
};
storage.lastFetch = Date.now();
storage.lastValue = data;
return data;
};

export const getGPUData = async (
podIP: string,
token: string,
): Promise<{ code: number; response: number | any }> => {
return await new Promise((resolve, reject) => {
return new Promise((resolve, reject) => {
const options = {
hostname: 'thanos-querier.openshift-monitoring.svc.cluster.local',
port: 9091,
Expand Down Expand Up @@ -79,3 +103,50 @@ export const getGPUData = async (
httpsRequest.end();
});
};

/**
 * Collects the GPU scale-up headroom offered by the cluster's MachineAutoscalers.
 *
 * Lists every `machineautoscalers` object in the `openshift-machine-api`
 * namespace, fetches the MachineSet each one targets, and keeps those whose
 * MachineSet carries a positive `machine.openshift.io/GPU` annotation.
 *
 * Failures are treated as "no autoscaling information" rather than errors:
 * - if the autoscaler list cannot be retrieved, a warning is logged and an
 *   empty list is returned;
 * - if a single MachineSet cannot be retrieved, a warning is logged and that
 *   autoscaler is skipped.
 *
 * @param fastify Fastify instance providing `kube.customObjectsApi` and `log`.
 * @returns One entry per GPU-capable autoscaler: `gpuNumber` is the annotation
 *   value and `availableScale` is `maxReplicas` minus the MachineSet's
 *   available replicas (0 when the status does not report them).
 */
const getGPUScaling = async (fastify: KubeFastifyInstance): Promise<gpuScale[]> => {
  // Pull a readable message out of whatever the client rejected with, without
  // assuming any particular error shape (a bad assumption here would throw
  // inside the error handler itself).
  const describeError = (e: unknown): string => {
    const err = e as
      | {
          response?: { body?: { message?: string }; data?: { message?: string } };
          message?: string;
        }
      | null
      | undefined;
    return (
      err?.response?.body?.message ?? err?.response?.data?.message ?? err?.message ?? String(e)
    );
  };

  let autoscalerList: MachineAutoscalerList;
  try {
    const response = await fastify.kube.customObjectsApi.listNamespacedCustomObject(
      'autoscaling.openshift.io',
      'v1beta1',
      'openshift-machine-api',
      'machineautoscalers',
    );
    autoscalerList = response.body as MachineAutoscalerList;
  } catch (e) {
    // Mirror the DCGM pod lookup: an unavailable API means "nothing found",
    // it must not fail the whole GPU status computation.
    fastify.log.warn(`Failed to list MachineAutoscalers. ${describeError(e)}`);
    return [];
  }

  const autoscalers = autoscalerList?.items ?? [];

  // Fetch all targeted MachineSets in parallel; a failed lookup yields null so
  // the result array stays index-aligned with `autoscalers`.
  const machineSets = await Promise.all(
    autoscalers.map(async (autoscaler): Promise<MachineSet | null> => {
      // scaleTargetRef also gives info about kind and apiversion if needed in the future
      const machineSetName = autoscaler.spec.scaleTargetRef.name;
      try {
        const response = await fastify.kube.customObjectsApi.getNamespacedCustomObject(
          'machine.openshift.io',
          'v1beta1',
          'openshift-machine-api',
          'machinesets',
          machineSetName,
        );
        return response.body as MachineSet;
      } catch (e) {
        fastify.log.warn(
          `Autoscaler ${autoscaler.metadata?.name} did not contain MachineSet info. ${describeError(
            e,
          )}`,
        );
        return null;
      }
    }),
  );

  const scalingList: gpuScale[] = [];
  machineSets.forEach((machineSet, i) => {
    if (!machineSet) {
      // Lookup failed (already logged above); nothing to report for this autoscaler.
      return;
    }
    // A missing annotation becomes NaN, which fails the `> 0` check below.
    const gpuAmount = Number(machineSet.metadata?.annotations?.['machine.openshift.io/GPU']);
    if (gpuAmount > 0) {
      scalingList.push({
        availableScale: autoscalers[i].spec.maxReplicas - (machineSet.status?.availableReplicas ?? 0),
        gpuNumber: gpuAmount,
      });
    }
  });
  return scalingList;
};
4 changes: 3 additions & 1 deletion backend/src/routes/api/status/statusUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ export const status = async (
request: FastifyRequest,
): Promise<{ kube: KubeStatus }> => {
const kubeContext = fastify.kube.currentContext;
const { currentContext, namespace, currentUser, clusterID, clusterBranding } = fastify.kube;
const { currentContext, namespace, currentUser, clusterID, clusterBranding, saToken } =
fastify.kube;

const userName = await getUserName(fastify, request);
const isAdmin = await isUserAdmin(fastify, userName, namespace);
Expand All @@ -33,6 +34,7 @@ export const status = async (
clusterBranding,
isAdmin,
isAllowed,
saToken,
},
};
}
Expand Down
48 changes: 48 additions & 0 deletions backend/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ export type KubeStatus = {
clusterBranding: string;
isAdmin: boolean;
isAllowed: boolean;
saToken: string;
};

export type KubeDecorator = KubeStatus & {
Expand Down Expand Up @@ -660,3 +661,50 @@ type GroupCustomObjectItemMetadata = {
export type RecursivePartial<T> = {
[P in keyof T]?: RecursivePartial<T[P]>;
};

/**
 * A min/max pair tagged with a GPU `type` string.
 * NOTE(review): nothing in the visible code references this type — confirm it is
 * still needed; the `type` literals presumably are Kubernetes resource names.
 */
export type GPUScaleType = {
type: 'nvidia.com/gpu' | 'amd.com/gpu';
min: number;
max: number;
};

/**
 * Typed view of a `machineautoscalers` custom object (API group
 * `autoscaling.openshift.io`) as read by the GPU utilities.
 */
export type MachineAutoscaler = {
spec: {
// Upper replica bound; used to compute how many more replicas could be added.
maxReplicas: number;
minReplicas: number;
// Reference to the scaled resource; `name` is used to look up the MachineSet.
scaleTargetRef: {
// NOTE(review): Kubernetes object references usually spell this `apiVersion` — confirm the casing.
apiversion: string;
kind: string;
name: string;
};
};
} & K8sResourceCommon;

/**
 * Typed view of a `machinesets` custom object (API group `machine.openshift.io`).
 * The GPU utilities read the `machine.openshift.io/GPU` metadata annotation and
 * `status.availableReplicas` from it.
 */
export type MachineSet = {
status: {
// NOTE(review): readers fall back to 0 when this is falsy, so it may be absent at runtime — confirm.
availableReplicas: number;
fullyLabeledReplicas: number;
observedGeneration: number;
readyReplicas: number;
replicas: number;
};
} & K8sResourceCommon;

/** List response body for `machineautoscalers`; `items` holds the individual autoscalers. */
export type MachineAutoscalerList = {
items: MachineAutoscaler[];
} & K8sResourceCommon;

/**
 * List wrapper whose `items` are MachineSets.
 * NOTE(review): not referenced by the visible code (MachineSets are fetched one by one) — confirm usage.
 */
export type MachineSetList = {
items: MachineSet[];
} & K8sResourceCommon;

/** Scale-up headroom of one GPU-capable MachineAutoscaler. */
export type gpuScale = {
// Autoscaler `maxReplicas` minus the MachineSet's available replicas.
availableScale: number;
// Numeric value of the MachineSet's `machine.openshift.io/GPU` annotation.
gpuNumber: number;
};

/** GPU state returned by `getGPUNumber`. */
export type GPUInfo = {
// True when DCGM exporter pods were found or at least one GPU autoscaler exists.
configured: boolean;
// Highest GPU count among the successful per-pod GPU data responses (0 if none).
available: number;
// Scale-up headroom per GPU-capable autoscaler.
autoscalers: gpuScale[];
};
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,23 @@ const GPUSelectField: React.FC<GPUSelectFieldProps> = ({ value, setValue }) => {
let lastCall = 0;
let cancelled = false;
const fetchGPU = () => {
setFetching(true);
lastCall = Date.now();
return getGPU().then(([areGpusAvailable, size]) => {
return getGPU().then((gpuInfo) => {
if (cancelled) return;
setAreGpusAvailable(areGpusAvailable);
setGpuSize(size || 0);
setGpuSize(gpuInfo.available || 0);
setAreGpusAvailable(gpuInfo.configured);
setFetching(false);
let availableScaleableGPU = 0;
if (gpuInfo.autoscalers) {
availableScaleableGPU = gpuInfo.autoscalers.reduce(
(highestValue, { availableScale, gpuNumber }) =>
availableScale > 0 ? Math.max(highestValue, gpuNumber) : highestValue,
0,
);
}
if (gpuInfo.available < availableScaleableGPU) {
setGpuSize(availableScaleableGPU);
}
});
};

Expand Down
3 changes: 2 additions & 1 deletion frontend/src/services/gpuService.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import axios from 'axios';
import { GPUInfo } from 'types';

export const getGPU = (): Promise<[boolean, number]> => {
export const getGPU = (): Promise<GPUInfo> => {
const url = '/api/gpu';
return axios
.get(url)
Expand Down
11 changes: 11 additions & 0 deletions frontend/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -633,3 +633,14 @@ export enum EventStatus {
}

export type UsernameMap<V> = { [username: string]: V };

/** One autoscaler entry of the `/api/gpu` response. */
export type gpuScale = {
// Entries only count towards the selectable GPU size when this is greater than 0.
availableScale: number;
// GPU count associated with this autoscaler entry.
gpuNumber: number;
};

/** Response shape of `getGPU()` (`/api/gpu`). */
export type GPUInfo = {
// Drives the "GPUs available" state of the GPU select field.
configured: boolean;
// GPU size currently available; the field raises it to the largest scalable `gpuNumber` when that is higher.
available: number;
// Autoscaler entries; the consumer guards against this being missing.
autoscalers: gpuScale[];
};
9 changes: 9 additions & 0 deletions manifests/base/cluster-role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@ apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: odh-dashboard
rules:
- verbs:
- get
- list
apiGroups:
- machine.openshift.io
- autoscaling.openshift.io
resources:
- machineautoscalers
- machinesets
- verbs:
- get
- watch
Expand Down