diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py
index 179461406fdd..84be20129d6b 100644
--- a/tests/unit/inference/test_inference.py
+++ b/tests/unit/inference/test_inference.py
@@ -281,7 +281,7 @@ def test(
         local_rank = int(os.getenv("LOCAL_RANK", "0"))
 
         # Load the model on CPU first to avoid OOM for large models @fp32
-        pipe = pipeline(task, model=model, device=-1, framework="pt")
+        pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt")
         if dtype == torch.half:
             pipe.model.half()
 
@@ -362,7 +362,7 @@ def test(
 
         # We have to load these large models on CPU with pipeline because not
         # enough GPU memory
-        pipe = pipeline(task, model=model, device=-1, framework="pt")
+        pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt")
         bs_output = pipe(query, **inf_kwargs)
 
         pipe.model = deepspeed.init_inference(pipe.model,
@@ -425,7 +425,7 @@ def test(
 
         # We have to load these large models on CPU with pipeline because not
         # enough GPU memory
-        pipe = pipeline(task, model=model, device=-1, framework="pt")
+        pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt")
         bs_output = pipe(query, **inf_kwargs)
 
         pipe.model = deepspeed.init_inference(pipe.model,
@@ -476,7 +476,7 @@ def test(
 
         # We have to load these large models on CPU with pipeline because not
        # enough GPU memory
-        pipe = pipeline(task, model=model, device=-1, framework="pt")
+        pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt")
         bs_output = pipe(query, **inf_kwargs)
 
         pipe.model = deepspeed.init_inference(pipe.model,
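
Note (not part of the diff): each hunk replaces the legacy integer device id -1 with an explicit torch.device("cpu") when building the Hugging Face pipeline, making the "load on CPU first, then hand the model to DeepSpeed" intent explicit. Below is a minimal standalone sketch of that pattern; the model name and the init_inference keyword arguments are illustrative assumptions, not values taken from this diff.

import torch
import deepspeed
from transformers import pipeline

# Build the pipeline on CPU so full-precision weights never touch GPU memory.
pipe = pipeline("text-generation",
                model="gpt2",                      # hypothetical small model
                device=torch.device("cpu"),        # explicit CPU device instead of -1
                framework="pt")

# Optionally downcast on CPU, then let DeepSpeed place the model on the GPU
# and inject its inference kernels.
pipe.model.half()
pipe.model = deepspeed.init_inference(pipe.model,
                                      mp_size=1,                       # assumed single-GPU setup
                                      dtype=torch.half,
                                      replace_with_kernel_inject=True)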