From 365c6ab385e8300813366f0ffae76cbac9eb9bd0 Mon Sep 17 00:00:00 2001
From: Charlie Ruan <53290280+CharlieFRuan@users.noreply.github.com>
Date: Thu, 18 Apr 2024 14:05:38 -0400
Subject: [PATCH 1/3] Add conv template and model preset

---
 python/mlc_llm/conversation_template.py | 25 +++++++++++++
 python/mlc_llm/model/model_preset.py    | 50 +++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

diff --git a/python/mlc_llm/conversation_template.py b/python/mlc_llm/conversation_template.py
index 1b2a06feab..fa4bee86d4 100644
--- a/python/mlc_llm/conversation_template.py
+++ b/python/mlc_llm/conversation_template.py
@@ -36,6 +36,31 @@ def get_conv_template(name: str) -> Optional[Conversation]:
 
 ############## Preset Conversation Templates ##############
 
+# Llama3
+# See https://github.com/meta-llama/llama3?tab=readme-ov-file#instruction-tuned-models
+# and https://github.com/meta-llama/llama3/blob/main/llama/tokenizer.py
+ConvTemplateRegistry.register_conv_template(
+    Conversation(
+        name="llama-3",
+        system_template=(
+            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n",
+            f"{MessagePlaceholders.SYSTEM.value}<|eot_id|>",
+        ),
+        system_message="You are a helpful, respectful and honest assistant.",
+        roles={
+            "user": "<|start_header_id|>user",
+            "assistant": "<|start_header_id|>assistant",
+            "tool": "<|start_header_id|>user",
+        },
+        seps=["<|eot_id|>"],
+        role_content_sep="<|end_header_id|>\n\n",
+        role_empty_sep="<|end_header_id|>\n\n",
+        stop_str=["<|end_of_text|>", "<|eot_id|>"],
+        stop_token_ids=[128001, 128009],  # "<|end_of_text|>", "<|eot_id|>"
+        system_prefix_token_ids=[128000],  # "<|begin_of_text|>"
+    )
+)
+
 # Llama2
 ConvTemplateRegistry.register_conv_template(
     Conversation(
diff --git a/python/mlc_llm/model/model_preset.py b/python/mlc_llm/model/model_preset.py
index 3bfe1cb891..41abf0292c 100644
--- a/python/mlc_llm/model/model_preset.py
+++ b/python/mlc_llm/model/model_preset.py
@@ -660,4 +660,54 @@
         "eos_token_id": 2,
         "pad_token_id": 0,
     },
+    "llama3_8b": {
+        "architectures": ["LlamaForCausalLM"],
+        "attention_bias": False,
+        "attention_dropout": 0.0,
+        "bos_token_id": 128000,
+        "eos_token_id": 128001,
+        "hidden_act": "silu",
+        "hidden_size": 4096,
+        "initializer_range": 0.02,
+        "intermediate_size": 14336,
+        "max_position_embeddings": 8192,
+        "model_type": "llama",
+        "num_attention_heads": 32,
+        "num_hidden_layers": 32,
+        "num_key_value_heads": 8,
+        "pretraining_tp": 1,
+        "rms_norm_eps": 1e-05,
+        "rope_scaling": None,
+        "rope_theta": 500000.0,
+        "tie_word_embeddings": False,
+        "torch_dtype": "bfloat16",
+        "transformers_version": "4.40.0.dev0",
+        "use_cache": True,
+        "vocab_size": 128256,
+    },
+    "llama3_70b": {
+        "architectures": ["LlamaForCausalLM"],
+        "attention_bias": False,
+        "attention_dropout": 0.0,
+        "bos_token_id": 128000,
+        "eos_token_id": 128001,
+        "hidden_act": "silu",
+        "hidden_size": 8192,
+        "initializer_range": 0.02,
+        "intermediate_size": 28672,
+        "max_position_embeddings": 8192,
+        "model_type": "llama",
+        "num_attention_heads": 64,
+        "num_hidden_layers": 80,
+        "num_key_value_heads": 8,
+        "pretraining_tp": 1,
+        "rms_norm_eps": 1e-05,
+        "rope_scaling": None,
+        "rope_theta": 500000.0,
+        "tie_word_embeddings": False,
+        "torch_dtype": "bfloat16",
+        "transformers_version": "4.40.0.dev0",
+        "use_cache": True,
+        "vocab_size": 128256,
+    },
 }

From dfa21a35434d3bd20818e1cd9165d4bb48e647dc Mon Sep 17 00:00:00 2001
From: Charlie Ruan <53290280+CharlieFRuan@users.noreply.github.com>
Date: Thu, 18 Apr 2024 15:10:29 -0400
Subject: [PATCH 2/3] Fix conv template

---
 python/mlc_llm/conversation_template.py | 14 ++++++--------
 python/mlc_llm/interface/gen_config.py  |  1 +
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/python/mlc_llm/conversation_template.py b/python/mlc_llm/conversation_template.py
index fa4bee86d4..88db8b1a3d 100644
--- a/python/mlc_llm/conversation_template.py
+++ b/python/mlc_llm/conversation_template.py
@@ -39,25 +39,23 @@ def get_conv_template(name: str) -> Optional[Conversation]:
 # Llama3
 # See https://github.com/meta-llama/llama3?tab=readme-ov-file#instruction-tuned-models
 # and https://github.com/meta-llama/llama3/blob/main/llama/tokenizer.py
+# Llama3
 ConvTemplateRegistry.register_conv_template(
     Conversation(
         name="llama-3",
         system_template=(
-            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n",
-            f"{MessagePlaceholders.SYSTEM.value}<|eot_id|>",
+            "<|start_header_id|>system<|end_header_id|>\n\n",
+            f"{MessagePlaceholders.SYSTEM.value}",
         ),
         system_message="You are a helpful, respectful and honest assistant.",
-        roles={
-            "user": "<|start_header_id|>user",
-            "assistant": "<|start_header_id|>assistant",
-            "tool": "<|start_header_id|>user",
-        },
-        seps=["<|eot_id|>"],
+        roles={"user": "user", "assistant": "assistant"},
+        seps=["<|eot_id|><|start_header_id|>"],
         role_content_sep="<|end_header_id|>\n\n",
         role_empty_sep="<|end_header_id|>\n\n",
         stop_str=["<|end_of_text|>", "<|eot_id|>"],
         stop_token_ids=[128001, 128009],  # "<|end_of_text|>", "<|eot_id|>"
         system_prefix_token_ids=[128000],  # "<|begin_of_text|>"
+        add_role_after_system_message=True,
     )
 )
 
diff --git a/python/mlc_llm/interface/gen_config.py b/python/mlc_llm/interface/gen_config.py
index d22aa7d231..8e617fc3d2 100644
--- a/python/mlc_llm/interface/gen_config.py
+++ b/python/mlc_llm/interface/gen_config.py
@@ -274,6 +274,7 @@ def gen_config(  # pylint: disable=too-many-locals,too-many-arguments,too-many-b
     # FIXME: Copy RWKV tokenizer file  # pylint: disable=fixme
 
 CONV_TEMPLATES = {
+    "llama-3",
     "chatml",
     "open_hermes_mistral",
     "neural_hermes_mistral",

From 80c72723af8d507528b8afb45ed82eee79589319 Mon Sep 17 00:00:00 2001
From: Charlie Ruan <53290280+CharlieFRuan@users.noreply.github.com>
Date: Thu, 18 Apr 2024 15:11:46 -0400
Subject: [PATCH 3/3] Trivial

---
 python/mlc_llm/conversation_template.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/mlc_llm/conversation_template.py b/python/mlc_llm/conversation_template.py
index 88db8b1a3d..fa926708d3 100644
--- a/python/mlc_llm/conversation_template.py
+++ b/python/mlc_llm/conversation_template.py
@@ -39,7 +39,6 @@ def get_conv_template(name: str) -> Optional[Conversation]:
 # Llama3
 # See https://github.com/meta-llama/llama3?tab=readme-ov-file#instruction-tuned-models
 # and https://github.com/meta-llama/llama3/blob/main/llama/tokenizer.py
-# Llama3
 ConvTemplateRegistry.register_conv_template(
     Conversation(
         name="llama-3",
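
A quick local smoke test for the new template (a minimal sketch, not part of the
patches above; it relies only on the get_conv_template helper and the field values
visible in these diffs, and assumes mlc_llm is installed from this branch):

    from mlc_llm.conversation_template import get_conv_template

    conv = get_conv_template("llama-3")
    assert conv is not None, "llama-3 template should be registered"
    # These values mirror what the registered template sets above.
    assert conv.stop_token_ids == [128001, 128009]   # "<|end_of_text|>", "<|eot_id|>"
    assert conv.system_prefix_token_ids == [128000]  # "<|begin_of_text|>"
    print(conv.roles)  # {"user": "user", "assistant": "assistant"} after PATCH 2/3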