Skip to content

Commit

Permalink
Merge branch 'users/amey/arxiv_update' of github.com:microsoft/vidur into users/amey/arxiv_update
Browse files Browse the repository at this point in the history
  • Loading branch information
nitinkedia7 committed May 13, 2024
2 parents e7f122e + d7ee4cf commit cce63d4
Showing 1 changed file with 40 additions and 26 deletions.
66 changes: 40 additions & 26 deletions simulator/profiling/cpu_overhead/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import datetime
import gc
import os
from typing import List, Any
from itertools import product

import pandas as pd
Expand Down Expand Up @@ -59,7 +60,37 @@ def parse_args():
return args


def profile_mo
def profile_model(
model_name: str,
batch_sizes_to_profile: List[int],
tensor_parallel_degrees: List[int],
output_dir: str,
pbar: Any,
) -> dict:
results = []

for tensor_parallel_degree in tensor_parallel_degrees:
for batch_index, batch_size in enumerate(batch_sizes_to_profile):
try:
runner = create_runner(
model_name, batch_size, tensor_parallel_degree, output_dir
)
results.append(ray.get(runner.run.remote()))
del runner
# trigger garbage collection
gc.collect()
except Exception as e:
print(
f"Failed to run {model_name}_{batch_size}_{tensor_parallel_degree} due to {e}"
)
# update progress bar
pbar.update(len(batch_sizes_to_profile) - batch_index)
break

pbar.update(1)

df = pd.DataFrame(results)
df.to_csv(f"{output_dir}/{model_name}/cpu_overhead.csv")


def create_runner(
Expand All @@ -83,7 +114,6 @@ def create_runner(
def main():
args = parse_args()

results = []

batch_sizes_to_profile = get_cpu_overhead_batch_sizes_to_profile(
args.max_batch_size
Expand All @@ -95,30 +125,14 @@ def main():

pbar = tqdm(total=len(list(input_combos)))

for model_name, tensor_parallel_degree in product(
args.models, args.num_tensor_parallel_workers
):
for batch_index, batch_size in enumerate(batch_sizes_to_profile):
try:
runner = create_runner(
model_name, batch_size, tensor_parallel_degree, args.output_dir
)
results.append(ray.get(runner.run.remote()))
del runner
# trigger garbage collection
gc.collect()
except Exception as e:
print(
f"Failed to run {model_name}_{batch_size}_{tensor_parallel_degree} due to {e}"
)
# update progress bar
pbar.update(len(batch_sizes_to_profile) - batch_index)
break

pbar.update(1)

df = pd.DataFrame(results)
df.to_csv(f"{args.output_dir}/cpu_overhead.csv")
for model_name in args.models:
profile_model(
model_name,
batch_sizes_to_profile,
args.num_tensor_parallel_workers,
args.output_dir,
pbar,
)


if __name__ == "__main__":
Expand Down

0 comments on commit cce63d4

Please sign in to comment.