diff --git a/README.md b/README.md index 6334282..b8ea758 100644 --- a/README.md +++ b/README.md @@ -182,7 +182,7 @@ Using the fine-tuned model to generate new text can be done using the `generate_ The arguments for the `generate_text.py` script are as follows: ``` -usage: generate_text.py [-h] --model_config MODEL_CONFIG [--benchmark_mode] [--benchmark_seq_length BENCHMARK_SEQ_LENGTH] +usage: generate_text.py [-h] --model_config MODEL_CONFIG [--benchmark_mode] [--benchmark_seq_length BENCHMARK_SEQ_LENGTH] [--device cpu] optional arguments: -h, --help show this help message and exit @@ -191,6 +191,8 @@ optional arguments: --benchmark_mode use intel pytorch extension to optimize model. --benchmark_seq_length BENCHMARK_SEQ_LENGTH length of generation if benchmark mode is used. + --device cpu + Choose run on cpu or xpu, default is cpu. ``` **Configuration Parameters** @@ -242,14 +244,17 @@ Within the yaml configuration file, the following optional arguments can be spec **YAML file** | **Environment Name** | **Configuration** | | :---: | :---: | :---: | -`env/intel/text-intel-torch.yml` | `text-intel-torch` | Python=3.9.7, PyTorch v1.13, Intel® Extension for PyTorch v1.13, Intel® Neural Compressor v2.0.0 | +`env/intel/text-intel-torch-cpu.yml` | `text-intel-torch` | Python=3.9.7, PyTorch v1.13, Intel® Extension for PyTorch v1.13, Intel® Neural Compressor v2.0.0 | +`env/intel/text-intel-torch-xpu.yml` | `text-intel-torch` | Python=3.9.7, PyTorch v1.13, Intel® Extension for PyTorch v1.13+xpu, Intel® Neural Compressor v2.0.0 | ### Optimized Solution Setup -Follow the below conda installation commands to setup the Intel® oneAPI optimized PyTorch environment for model training and text generation. +Follow the below conda installation commands to setup the Intel® oneAPI optimized PyTorch environment for model training and text generation. Please choose `env/intel/text-intel-torch-xpu.yml` if you have Intel GPU available. 
```sh -conda env create -f env/intel/text-intel-torch.yml +conda env create -f env/intel/text-intel-torch-cpu.yml # For CPU +or +conda env create -f env/intel/text-intel-torch-xpu.yml # For XPU ``` *Activate stock conda environment* @@ -257,12 +262,17 @@ Use the following command to activate the environment that was created: ```sh conda activate text-intel-torch ``` +Please perform this additional installation step only if you are using an Intel GPU; CPU users can skip this step: +```sh +pip install -r env/intel/update_to_torch_xpu.txt +``` -This script utilizes the dependencies found in the `env/intel/text-intel-torch.yml` file to create an environment as follows: +This script utilizes the dependencies found in the `env/intel/text-intel-torch-cpu.yml` or `env/intel/text-intel-torch-xpu.yml` file to create an environment as follows: | **YAML file** | **Environment Name** | **Configuration** | | :-------------------: | :------------------: | :-----------------------------: | -| `env/intel/text-intel-torch.yml` | `text-intel-torch` | Python=3.9.x with PyTorch v1.13, Intel® Extension For PyTorch v 1.13.0, Intel® Neural Compressor v2.0.0 | +| `env/intel/text-intel-torch-cpu.yml` | `text-intel-torch` | Python=3.9.x with PyTorch v1.13, Intel® Extension For PyTorch v 1.13.0, Intel® Neural Compressor v2.0.0 | +| `env/intel/text-intel-torch-xpu.yml, update_to_torch_xpu.txt` | `text-intel-torch` | Python=3.9.x with PyTorch v1.13, Intel® Extension For PyTorch v 1.13.0+xpu, Intel® Neural Compressor v2.0.0 | ### Intel® oneAPI Optimized Implementation @@ -282,6 +292,33 @@ The command to fine-tune the model with Intel® optimizations enabled is: ipexrun --use_logical_core --enable_tcmalloc src/finetune_model.py --model_config configs/config_base.yml --data_path data/abcnews-date-text.csv --save_path saved_models/gpt2-medium-finetuned-intel --intel ``` +**For Intel GPU training**, device 'xpu' must be added to 'python/site-packages/transformers/training_args.py' as shown in the 
below diff. +It can be changed manually as follows: +```python +diff --git a/training_args.py b/training_args.py +index 1a90710..9421ccd 100644 +--- a/training_args.py ++++ b/training_args.py +@@ -1466,6 +1466,8 @@ class TrainingArguments: + torch.distributed.init_process_group( + backend=self.xpu_backend, rank=rank, world_size=size, timeout=self.ddp_timeout_delta + ) ++ elif torch.xpu.is_available(): ++ device = torch.device("xpu") + elif is_torch_tpu_available(): + device = xm.xla_device() + self._n_gpu = 0 +``` +Or to apply the patch automatically, please run the following Python script only once. +```bash +$ cd src +$ python ./apply_xpu_patch.py +``` +and then run the fine-tuning step as follows: +```sh +ipexrun --use_logical_core --enable_tcmalloc src/finetune_model.py --model_config configs/config_base.yml --data_path data/abcnews-date-text.csv --save_path saved_models/gpt2-medium-finetuned-intel --intel +``` + **Expected Output**
The output-trained model will be saved as a `huggingface` pre-trained model saved to the 'saved_models/gpt2-medium-finetuned-intel'. Training time in seconds would be generated at the end of the training module. @@ -308,6 +345,11 @@ As above, this trained model can be used to generate text using the provided `ge python src/generate_text.py --model_config configs/config_finetuned_intel.yml ``` +To run inference on an Intel GPU, please run the following: +```sh +python src/generate_text.py --model_config configs/config_finetuned_intel.yml --device xpu +``` + **Expected Output**
The converted ONNX fine-tuned model will be saved to `saved_models/gpt2-medium-finetuned-intel-onnx`. @@ -348,6 +390,11 @@ Once the quantized model is created, we can use the `generate_text.py` script on python src/generate_text.py --model_config configs/config_finetuned_inc.yml ``` +To run inference on an Intel GPU, please run the following +```sh +python src/generate_text.py --model_config configs/config_finetuned_inc.yml --device xpu +``` + ## Performance Observations In the following, we report some results comparing Intel® technologies vs the stock alternative on the task of generating new texts of various lengths. diff --git a/env/intel/text-intel-torch.yml b/env/intel/text-intel-torch-cpu.yml similarity index 100% rename from env/intel/text-intel-torch.yml rename to env/intel/text-intel-torch-cpu.yml diff --git a/env/intel/text-intel-torch-xpu.yml b/env/intel/text-intel-torch-xpu.yml new file mode 100644 index 0000000..ce5ad9e --- /dev/null +++ b/env/intel/text-intel-torch-xpu.yml @@ -0,0 +1,15 @@ +# Refer to https://intel.github.io/intel-extension-for-pytorch/xpu/1.13.10+xpu/tutorials/getting_started.html +# for intel_extension_for_pytorch installation. 
+name: text-intel-torch +channels: + - conda-forge +dependencies: + - python=3.9.7 + - pip=21.2.4 + - pip: + - neural-compressor==2.0 + - pandas==1.5.3 + - numpy==1.23.5 + - transformers==4.26.1 + - datasets==2.10.1 + - optimum[onnxruntime]==1.6.4 diff --git a/env/intel/update_to_torch_xpu.txt b/env/intel/update_to_torch_xpu.txt new file mode 100644 index 0000000..459ee5b --- /dev/null +++ b/env/intel/update_to_torch_xpu.txt @@ -0,0 +1,3 @@ +-i https://developer.intel.com/ipex-whl-stable-xpu +torch==1.13.0a0 +intel_extension_for_pytorch==1.13.10+xpu diff --git a/src/apply_xpu_patch.py b/src/apply_xpu_patch.py new file mode 100644 index 0000000..4cd0478 --- /dev/null +++ b/src/apply_xpu_patch.py @@ -0,0 +1,12 @@ +import transformers +import os +import subprocess + +module_path = transformers.__path__[0] +# get path for training_args.py +target_file_path = os.path.join(module_path, "training_args.py") + +# apply patch to training_args.py file in the transformers package +subprocess.run(["patch", target_file_path, "transformers_xpu.patch"]) + +print("patch applied successfully") diff --git a/src/generate_text.py b/src/generate_text.py index c0bd13e..64c3ae3 100644 --- a/src/generate_text.py +++ b/src/generate_text.py @@ -26,7 +26,7 @@ AutoModelForCausalLM, set_seed ) - +import intel_extension_for_pytorch as ipex def generate_text( tokenizer: PreTrainedTokenizer, @@ -36,7 +36,8 @@ def generate_text( min_length: int = 0, max_length: int = 10, algorithm: str = 'greedy', - stop_token: str = '.') -> Tuple[List[int],float]: + stop_token: str = '.', + device: str = 'cpu') -> Tuple[List[int],float]: """Generate text using the provided model and algorithm. 
Args: @@ -60,18 +61,26 @@ def generate_text( Tuple(List[int], float): generated sequence of token_ids, total time for of model inference calls """ - all_token_ids = input_ids.clone() all_attention_masks = attention_mask.clone() eos_token_id = tokenizer([stop_token], return_tensors='np')[ 'input_ids'][0][0] has_eos = torch.zeros(1, dtype=torch.bool) + ones_mask = torch.ones([1, 1]) + + if device == 'xpu': + print("#### xpu") + all_token_ids = input_ids.to("xpu") + all_attention_masks = attention_mask.to("xpu") + eos_token_id = torch.tensor([eos_token_id], device=torch.device("xpu")) + has_eos = has_eos.to("xpu") + ones_mask = ones_mask.to("xpu") + model = model.to("xpu") total_time = 0 for step in range(max_length): - - if isinstance(model, torch.nn.Module): + if isinstance(model, torch.nn.Module) and device == 'cpu': start = time.time() next_token_logits = torch.nn.functional.softmax( model( @@ -79,6 +88,14 @@ def generate_text( attention_mask=all_attention_masks)[:, -1, :], dim=1) end = time.time() total_time += end - start + elif isinstance(model, torch.nn.Module) and device == 'xpu': + start = time.time() + next_token_logits = torch.nn.functional.softmax( + model( + input_ids=all_token_ids, + attention_mask=all_attention_masks)[0][:, -1, :], dim=1) + end = time.time() + total_time += end - start elif isinstance(model, ort.InferenceSession): ort_input = { "input_ids": np.array(all_token_ids), @@ -101,7 +118,7 @@ def generate_text( all_token_ids = torch.cat( [all_token_ids, tokens_to_add.unsqueeze(-1)], dim=-1) all_attention_masks = torch.cat( - [all_attention_masks, torch.ones([1, 1])], dim=-1).type_as(all_attention_masks) + [all_attention_masks, ones_mask], dim=-1).type_as(all_attention_masks) if step > min_length and next_tokens == eos_token_id: break @@ -115,8 +132,8 @@ def generate( max_length: int = 10, prompt_file: str = None, benchmark_mode: bool = False, - n_runs: int = 100): - + n_runs: int = 100, + device: str = 'cpu'): # read prompts from file into a 
list for batch processing tokenized_input = [] if not benchmark_mode and prompt_file is not None: @@ -143,7 +160,8 @@ def generate( tokenized_prompt.attention_mask, min_length=max_length, max_length=max_length, - algorithm='greedy') + algorithm='greedy', + device=device) if i > 10: times.append(total_time) print(f"Average Generation time: {np.mean(times)}s") @@ -160,7 +178,8 @@ def generate( tokenized_prompt.attention_mask, min_length=min_length, max_length=max_length, - algorithm='sample') + algorithm='sample', + device=device) tokenized_output.append(res) out_json = [] @@ -216,7 +235,8 @@ def main(flags): min_length=min_length, max_length=max_length, prompt_file=prompt_file, - benchmark_mode=flags.benchmark_mode + benchmark_mode=flags.benchmark_mode, + device=flags.device ) @@ -241,6 +261,11 @@ def main(flags): type=int, default=10 ) + parser.add_argument('--device', + type=str, + required=False, + default='cpu', + help='Choose run on cpu or xpu, default is cpu.') FLAGS = parser.parse_args() main(FLAGS) diff --git a/src/transformers_xpu.patch b/src/transformers_xpu.patch new file mode 100644 index 0000000..57d5a30 --- /dev/null +++ b/src/transformers_xpu.patch @@ -0,0 +1,20 @@ +--- a.py 2023-04-04 11:50:27.000000000 +0000 ++++ b.py 2023-04-04 11:52:00.000000000 +0000 +@@ -53,6 +53,7 @@ + requires_backends, + ) + ++import intel_extension_for_pytorch + + if is_torch_available(): + import torch +@@ -1466,6 +1467,8 @@ + torch.distributed.init_process_group( + backend=self.xpu_backend, rank=rank, world_size=size, timeout=self.ddp_timeout_delta + ) ++ elif torch.xpu.is_available(): ++ device = torch.device("xpu") + elif is_torch_tpu_available(): + device = xm.xla_device() + self._n_gpu = 0 +~