From 29360acc033011606b1e89bd8b49f0ed73745df1 Mon Sep 17 00:00:00 2001
From: andrewssobral
Date: Sun, 5 Mar 2023 11:04:25 +0000
Subject: [PATCH] Added Gradio Web Interface for LLaMA

---
 virtualenv.sh | 16 +++++++++
 webapp.py     | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++
 webapp.sh     |  9 +++++
 3 files changed, 122 insertions(+)
 create mode 100755 virtualenv.sh
 create mode 100644 webapp.py
 create mode 100755 webapp.sh

diff --git a/virtualenv.sh b/virtualenv.sh
new file mode 100755
index 00000000..2aa0cf26
--- /dev/null
+++ b/virtualenv.sh
@@ -0,0 +1,16 @@
+rm -rf llama_env
+python3 -m venv llama_env
+source llama_env/bin/activate
+
+python -m pip install --upgrade pip
+
+python -m pip install wheel
+python setup.py bdist_wheel
+
+pip install -r requirements.txt
+pip install -e .
+
+python -m pip install gradio
+
+# run webapp.sh
+
diff --git a/webapp.py b/webapp.py
new file mode 100644
index 00000000..9eda6819
--- /dev/null
+++ b/webapp.py
@@ -0,0 +1,97 @@
+import os
+import sys
+import torch
+import fire
+import time
+import json
+
+import gradio as gr
+
+from typing import Tuple
+from pathlib import Path
+from fairscale.nn.model_parallel.initialize import initialize_model_parallel
+from llama import ModelArgs, Transformer, Tokenizer, LLaMA
+
+
+ckpt_dir = "models/7B"
+tokenizer_path = "models/tokenizer.model"
+temperature = 0.8
+top_p = 0.95
+max_seq_len = 512
+max_batch_size = 32
+
+
+def setup_model_parallel() -> Tuple[int, int]:
+    local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    world_size = int(os.environ.get("WORLD_SIZE", -1))
+
+    torch.distributed.init_process_group("nccl")
+    initialize_model_parallel(world_size)
+    torch.cuda.set_device(local_rank)
+
+    # seed must be the same in all processes
+    torch.manual_seed(1)
+    return local_rank, world_size
+
+
+def load(
+    ckpt_dir: str,
+    tokenizer_path: str,
+    local_rank: int,
+    world_size: int,
+    max_seq_len: int,
+    max_batch_size: int,
+) -> LLaMA:
+    start_time = time.time()
+    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
+    assert world_size == len(
+        checkpoints
+    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}"
+    ckpt_path = checkpoints[local_rank]
+    print("Loading")
+    checkpoint = torch.load(ckpt_path, map_location="cpu")
+    with open(Path(ckpt_dir) / "params.json", "r") as f:
+        params = json.loads(f.read())
+
+    model_args: ModelArgs = ModelArgs(
+        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
+    )
+    tokenizer = Tokenizer(model_path=tokenizer_path)
+    model_args.vocab_size = tokenizer.n_words
+    torch.set_default_tensor_type(torch.cuda.HalfTensor)
+    model = Transformer(model_args)
+    torch.set_default_tensor_type(torch.FloatTensor)
+    model.load_state_dict(checkpoint, strict=False)
+
+    generator = LLaMA(model, tokenizer)
+    print(f"Loaded in {time.time() - start_time:.2f} seconds")
+    return generator
+
+
+local_rank, world_size = setup_model_parallel()
+if local_rank > 0:
+    sys.stdout = open(os.devnull, "w")
+
+generator = load(
+    ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size
+)
+
+
+def process(prompt: str):
+    print("Received:\n", prompt)
+    prompts = [prompt]
+    results = generator.generate(
+        prompts, max_gen_len=256, temperature=temperature, top_p=top_p
+    )
+    print("Generated:\n", results[0])
+    return str(results[0])
+
+
+demo = gr.Interface(
+    fn = process,
+    inputs = gr.Textbox(lines=10, placeholder="Your prompt here..."),
+    outputs = "text",
+)
+
+# To create a public link, set `share=True` in `launch()`.
+demo.launch(share=True)
diff --git a/webapp.sh b/webapp.sh
new file mode 100755
index 00000000..e75f3adc
--- /dev/null
+++ b/webapp.sh
@@ -0,0 +1,9 @@
+#
+# first build the virtualenv using the virtualenv.sh script
+#
+# gradio webapp.py
+torchrun --nproc_per_node $MP webapp.py
+#
+# or use CUDA_VISIBLE_DEVICES if you want to target a specific gpu device
+# CUDA_VISIBLE_DEVICES=1 torchrun --nproc_per_node $MP webapp.py
+#
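
A minimal sketch of an end-to-end run with this patch applied, assuming the 7B weights sit in models/7B and the tokenizer at models/tokenizer.model (the paths hard-coded in webapp.py), and that MP matches the number of checkpoint shards in the model directory (a single .pth file for 7B, hence MP=1):

    ./virtualenv.sh                 # builds llama_env and installs llama, its requirements, and gradio
    source llama_env/bin/activate   # re-activate here; the script's activation does not persist in your shell
    MP=1 ./webapp.sh                # torchrun starts webapp.py; Gradio prints the local and public URLs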