macrocosm-os · bkb2135 · Jun 3, 2024 · May 6, 2024 · May 6, 2024 · May 6, 2024
diff --git a/README.md b/README.md
@@ -1,3 +1,12 @@
+<picture>
+    <source srcset="./assets/macrocosmos-white.png"  media="(prefers-color-scheme: dark)">
+    <img src="./assets/macrocosmos-white.png">
+</picture>
+
+<picture>
+    <source srcset="./assets/macrocosmos-black.png"  media="(prefers-color-scheme: light)">
+    <img src="./assets/macrocosmos-black.png">
+</picture>
 
 <div align="center">
 
@@ -27,7 +36,7 @@ Validators and miners are based on large language models (LLM). The validation p
 </div>
 
 # Installation
-This repository requires python3.8 or higher. To install it, simply clone this repository and run the [install.sh](./install.sh) script.
+This repository requires python3.9 or higher. To install it, simply clone this repository and run the [install.sh](./install.sh) script.
 ```bash
 git clone https://github.com/opentensor/prompting.git
 cd prompting
@@ -54,7 +63,7 @@ python <SCRIPT_PATH>
     --wallet.name <your wallet> # Must be created using the bittensor-cli
     --wallet.hotkey <your hotkey> # Must be created using the bittensor-cli
     --logging.debug # Run in debug mode, alternatively --logging.trace for trace mode
-    --axon.port #VERY IMPORTANT: set the port to be one of the open TCP ports on your machine
+    --axon.port # VERY IMPORTANT: set the port to be one of the open TCP ports on your machine
 ```
 
 where `SCRIPT_PATH` is either: 
@@ -70,7 +79,7 @@ sudo apt update && sudo apt install jq && sudo apt install npm && sudo npm insta
 
 Example of running a SOLAR miner: 
 ```bash
-pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name solar_miner -- --netuid 1  --subtensor.network finney --wallet.name my_wallet --wallet.hotkey m1 --neuron.model_id NousResearch/Nous-Hermes-2-SOLAR-10.7B --axon.port 21988 --logging.debug 
+pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name solar_miner -- --netuid 1  --subtensor.network finney --wallet.name my_wallet --wallet.hotkey m1 --neuron.model_id casperhansen/llama-3-70b-instruct-awq --axon.port 21988 --logging.debug 
 ``` 
 
 # Testnet 
@@ -81,7 +90,7 @@ In order to run on testnet, you will need to go through the same hotkey registra
 To run:
 
 ```bash
-pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name solar_miner -- --netuid 61  --subtensor.network test --wallet.name my_test_wallet --wallet.hotkey m1 --neuron.model_id NousResearch/Nous-Hermes-2-SOLAR-10.7B --axon.port 21988 --logging.debug 
+pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name solar_miner -- --netuid 61  --subtensor.network test --wallet.name my_test_wallet --wallet.hotkey m1 --neuron.model_id casperhansen/llama-3-70b-instruct-awq --axon.port 21988 --logging.debug 
 ```
 
 # Limitations

diff --git a/assets/macrocosmos-black.png b/assets/macrocosmos-black.png
diff --git a/assets/macrocosmos-white.png b/assets/macrocosmos-white.png
diff --git a/min_compute.yml b/min_compute.yml
@@ -57,12 +57,12 @@ compute_spec:
 
     gpu:
       required: True                       # Does the application require a GPU?
-      min_vram: 24                         # Minimum GPU VRAM (GB)
-      recommended_vram: 36                 # Recommended GPU VRAM (GB)
+      min_vram: 80                         # Minimum GPU VRAM (GB)
+      recommended_vram: 80                 # Recommended GPU VRAM (GB)
       cuda_cores: 1024                     # Minimum number of CUDA cores (if applicable)
       min_compute_capability: 6.0          # Minimum CUDA compute capability
       recommended_compute_capability: 7.0  # Recommended CUDA compute capability
-      recommended_gpu: "NVIDIA A10"       # Recommended GPU to purchase/rent
+      recommended_gpu: "NVIDIA A100"       # Recommended GPU to purchase/rent
 
     memory:
       min_ram: 16          # Minimum RAM (GB)
@@ -71,7 +71,7 @@ compute_spec:
       ram_type: "DDR4"     # RAM type (e.g., DDR4, DDR3, etc.)
 
     storage:
-      min_space: 24           # Minimum free storage space (GB)
+      min_space: 40           # Minimum free storage space (GB)
       recommended_space: 100  # Recommended free storage space (GB)
       type: "SSD"             # Preferred storage type (e.g., SSD, HDD)
       min_iops: 1000          # Minimum I/O operations per second (if applicable)

diff --git a/prompting/__init__.py b/prompting/__init__.py
@@ -16,7 +16,7 @@
 # DEALINGS IN THE SOFTWARE.
 
 # Define the version of the template module.
-__version__ = "2.2.0"
+__version__ = "2.3.1"
 version_split = __version__.split(".")
 __spec_version__ = (
     (10000 * int(version_split[0]))

diff --git a/prompting/cleaners/all_cleaners.py b/prompting/cleaners/all_cleaners.py
@@ -56,6 +56,8 @@ def capitalize_sentences(self, input_string):
         sentences = re.split(r"(?<=[.!?])\s+", input_string)
         capitalized_sentences = [sentence.capitalize() for sentence in sentences]
         result_string = " ".join(capitalized_sentences)
+        # Capitalize the first letter in result_string
+        result_string.capitalize()
         return result_string
 
     def apply(self, generation: str) -> str:
@@ -101,4 +103,27 @@ def apply(self, generation: str, min_pos: Union[int,float] = 5, max_pos: Union[i
             # drop everything after the last question mark. Alternatively, we can just extract the first question.
             generation = generation.rsplit("?",1) + '?'
 
-        return generation 
+        return generation 
+
+class RemoveTags(BaseCleaner):
+    """Cleaner that deletes known placeholder tags (currently only "<date>") from a generation."""
+    def __init__(self, **kwargs):
+        pass  # keyword arguments are accepted and ignored
+
+    def apply(self, generation: str) -> str:
+        """Return `generation` with every occurrence of each known tag removed."""
+        cleaned = generation
+        for placeholder in ("<date>",):
+            cleaned = cleaned.replace(placeholder, "")
+        return cleaned
+
+class FirstQuestion(BaseCleaner):
+    """Cleaner that keeps only the first question, minus any "label:" prefix in front of it."""
+    def __init__(self, **kwargs):
+        pass  # keyword arguments are accepted and ignored
+
+    def apply(self, generation: str) -> str:
+        """Return the first question in `generation` (cut at the first "?", then stripped of any "label:" prefix)."""
+        if "?" not in generation:
+            return generation  # no question mark: leave the text untouched
+        return generation.split("?", 1)[0].split(":", 1)[-1] + "?"
diff --git a/prompting/cleaners/cleaner.py b/prompting/cleaners/cleaner.py
@@ -2,13 +2,15 @@
 
 import bittensor as bt
 
-from prompting.cleaners.all_cleaners import RemoveQuotes, RemoveRoles, PruneEnding, PrunePostQuestionText
+from prompting.cleaners.all_cleaners import RemoveQuotes, RemoveRoles, PruneEnding, PrunePostQuestionText, RemoveTags, FirstQuestion
 
 SUPPORTED_CLEANERS = {
     "remove_quotes": RemoveQuotes,
     "remove_roles": RemoveRoles,
     "prune_ending": PruneEnding,
     "remove_post_question_text": PrunePostQuestionText,
+    "first_question": FirstQuestion,  # keeps only the first question of a generation
+    "remove_tags": RemoveTags,  # strips placeholder tags such as "<date>"
 }
 
 

diff --git a/prompting/llms/hf.py b/prompting/llms/hf.py
@@ -106,6 +106,8 @@ def __init__(
         mock=False,
         model_kwargs: dict = None,
         return_streamer: bool = False,
+        gpus: int = 1,
+        llm_max_allowed_memory_in_gb: int = 0
     ):
         super().__init__()
         self.model = model_id

diff --git a/prompting/llms/utils.py b/prompting/llms/utils.py
@@ -7,15 +7,7 @@ def contains_gpu_index_in_device(device: str) -> bool:
     pattern = r"^cuda:\d+$"
     return bool(re.match(pattern, device))
 
-
-def calculate_gpu_requirements(
-    device: str, max_allowed_memory_allocation_in_bytes: int = 20e9
-) -> float:
-    """Calculates the memory utilization requirements for the model to be loaded on the device.
-    Args:
-        device (str): The device to load the model to.
-        max_allowed_memory_allocation_in_bytes (int, optional): The maximum allowed memory allocation in bytes. Defaults to 20e9 (20GB).
-    """
+def calculate_single_gpu_requirements(device: str, max_allowed_memory_allocation_in_bytes: int):
     if contains_gpu_index_in_device(device):
         device_with_gpu_index = device
     else:
@@ -39,4 +31,44 @@ def calculate_gpu_requirements(
         f'{gpu_utilization * 100}% of the GPU memory will be utilized for loading the model to device "{device}".'
     )
 
+    return gpu_utilization    
+
+def calculate_multiple_gpu_requirements(device: str, gpus: int, max_allowed_memory_allocation_in_bytes: int) -> float:
+    """Return the share (rounded to 2 decimals) of the summed free memory of `cuda:0` .. `cuda:{gpus-1}` that the budget needs.
+
+    Args:
+        device (str): Not used by this function; the GPUs inspected are always `cuda:0` .. `cuda:{gpus-1}`.
+        gpus (int): Number of GPUs whose free and total memory are summed.
+        max_allowed_memory_allocation_in_bytes (int): Memory budget for the model, in bytes.
+
+    Raises:
+        torch.cuda.CudaError: If the summed free memory is smaller than the requested budget.
+    """
+    torch.cuda.synchronize()
+    # Each mem_get_info call is unpacked as a (free, total) pair for one device.
+    memory_stats = [torch.cuda.mem_get_info(device=f"cuda:{i}") for i in range(gpus)]
+    total_free_memory = sum(free for free, _ in memory_stats)
+    total_gpu_memory = sum(total for _, total in memory_stats)
+    bt.logging.info(f"Total available free memory across all visible {gpus} GPUs: {round(total_free_memory / 1e9, 2)} GB")
+    bt.logging.info(f"Total GPU memory across all visible {gpus} GPUs: {round(total_gpu_memory / 1e9, 2)} GB")
+    if total_free_memory < max_allowed_memory_allocation_in_bytes:
+        # NOTE(review): torch.cuda.CudaError's constructor appears to expect an integer CUDA error code - confirm it accepts this message string.
+        raise torch.cuda.CudaError(f"Not enough memory across all specified {gpus} GPUs to allocate for the model. Please ensure you have at least {max_allowed_memory_allocation_in_bytes / 1e9} GB of free GPU memory.")
+    gpu_utilization = round(max_allowed_memory_allocation_in_bytes / total_free_memory, 2)
+    bt.logging.info(f"{gpu_utilization * 100}% of the total GPU memory across all GPUs will be utilized for loading the model.")
+
     return gpu_utilization
+
+
+def calculate_gpu_requirements(
+    device: str, gpus: int, max_allowed_memory_allocation_in_bytes: float,
+) -> float:
+    """Dispatches to the single- or multi-GPU memory utilization calculation and returns its result.
+    Args:
+        device (str): The device to load the model to.
+        gpus (int): Number of GPUs to use; exactly 1 selects the single-GPU calculation, any other value the multi-GPU one.
+        max_allowed_memory_allocation_in_bytes (float): The maximum allowed memory allocation in bytes (required, no default).
+    """
+    if gpus != 1:
+        return calculate_multiple_gpu_requirements(device, gpus, max_allowed_memory_allocation_in_bytes=max_allowed_memory_allocation_in_bytes)
+    return calculate_single_gpu_requirements(device, max_allowed_memory_allocation_in_bytes)
diff --git a/prompting/llms/vllm_llm.py b/prompting/llms/vllm_llm.py
@@ -31,67 +31,48 @@ def clean_gpu_cache():
     destroy_model_parallel()
     gc.collect()
     torch.cuda.empty_cache()
-    torch.distributed.destroy_process_group()
+    if torch.distributed.is_initialized():
+        torch.distributed.destroy_process_group()
 
     # Wait for the GPU to clean up
     time.sleep(10)
     torch.cuda.synchronize()
 
 
-def load_vllm_pipeline(model_id: str, device: str, mock=False):
+def load_vllm_pipeline(model_id: str, device: str, gpus: int, max_allowed_memory_in_gb: int, mock=False):
     """Loads the VLLM pipeline for the LLM, or a mock pipeline if mock=True"""
     if mock or model_id == "mock":
         return MockPipeline(model_id)
 
-    # Calculates the gpu memory utilization required to run the model within 20GB of GPU
-    max_allowed_memory_in_gb = 20
+    # Calculates the gpu memory utilization required to run the model within max_allowed_memory_in_gb GB of GPU memory
     max_allowed_memory_allocation_in_bytes = max_allowed_memory_in_gb * 1e9
     gpu_mem_utilization = calculate_gpu_requirements(
-        device, max_allowed_memory_allocation_in_bytes
+        device, gpus, max_allowed_memory_allocation_in_bytes
     )
 
     try:
         # Attempt to initialize the LLM
-        return LLM(model=model_id, gpu_memory_utilization=gpu_mem_utilization)
-    except ValueError as e:
-        bt.logging.error(
-            f"Error loading the VLLM pipeline within {max_allowed_memory_in_gb}GB: {e}"
-        )
-
-    # If the first attempt fails, retry with increased memory allocation
-    try:
-        bt.logging.info(
-            "Trying to cleanup GPU and retrying to load the model with extra allocation..."
-        )
-        # Clean the GPU from memory before retrying
-        clean_gpu_cache()
-
-        # Increase the memory allocation for the second attempt
-        max_allowed_memory_in_gb_second_attempt = 24
-        max_allowed_memory_allocation_in_bytes = (
-            max_allowed_memory_in_gb_second_attempt * 1e9
-        )
-        bt.logging.warning(
-            f"Retrying to load with {max_allowed_memory_in_gb_second_attempt}GB..."
-        )
-        gpu_mem_utilization = calculate_gpu_requirements(
-            device, max_allowed_memory_allocation_in_bytes
-        )
-
-        # Attempt to initialize the LLM again with increased memory allocation
-        return LLM(model=model_id, gpu_memory_utilization=gpu_mem_utilization)
+        llm = LLM(model=model_id, gpu_memory_utilization = gpu_mem_utilization, quantization="AWQ", tensor_parallel_size=gpus)        
+        # Set the end-of-sequence token id directly on the engine's tokenizer (solution by @bkb2135); this avoids
+        # the overhead of loading a separate tokenizer each time the custom eos token is needed.
+        # 128009 is the eos token id specific to llama models, fetched once via the Hugging Face pipeline and saved here.
+        # NOTE(review): the id and quantization="AWQ" are hard-coded, so this presumably only suits llama-3 AWQ checkpoints - confirm before using another model_id.
+        llm.llm_engine.tokenizer.eos_token_id = 128009
+        return llm
     except Exception as e:
         bt.logging.error(
-            f"Error loading the VLLM pipeline within {max_allowed_memory_in_gb_second_attempt}GB: {e}"
+            f"Error loading the VLLM pipeline within {max_allowed_memory_in_gb}GB: {e}"
         )
         raise e
+
 
 
 class vLLMPipeline(BasePipeline):
-    def __init__(self, model_id: str, device: str = None, mock=False):
+    def __init__(self, model_id: str, llm_max_allowed_memory_in_gb: int, device: str = None, gpus: int = 1, mock: bool = False):
         super().__init__()
-        self.llm = load_vllm_pipeline(model_id, device, mock)
+        self.llm = load_vllm_pipeline(model_id, device, gpus, llm_max_allowed_memory_in_gb, mock)  # a MockPipeline when mock=True or model_id == "mock"
         self.mock = mock
+        self.gpus = gpus  # same GPU count that was handed to load_vllm_pipeline
 
     def __call__(self, composed_prompt: str, **model_kwargs: Dict) -> str:
         if self.mock:
@@ -155,17 +136,17 @@ def _make_prompt(self, messages: List[Dict[str, str]]):
         for message in messages:
             if message["role"] == "system":
                 composed_prompt += (
-                    f'<|im_start|>system\n{message["content"]} <|im_end|>'
+                    f'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {message["content"]} }}}}<|eot_id|>'
                 )
             elif message["role"] == "user":
-                composed_prompt += f'<|im_start|>user\n{message["content"]} <|im_end|>'
+                composed_prompt += f'<|start_header_id|>user<|end_header_id|>\n{{{{ {message["content"]} }}}}<|eot_id|>'
             elif message["role"] == "assistant":
                 composed_prompt += (
-                    f'<|im_start|>assistant\n{message["content"]} <|im_end|>'
+                    f'<|start_header_id|>assistant<|end_header_id|>\n{{{{ {message["content"]} }}}}<|eot_id|>'
                 )
 
         # Adds final tag indicating the assistant's turn
-        composed_prompt += "<|im_start|>assistant\n"
+        composed_prompt += "<|start_header_id|>assistant<|end_header_id|>"
         return composed_prompt
 
     def forward(self, messages: List[Dict[str, str]]):