Skip to content

Commit

Permalink
support generate
Browse the repository at this point in the history
  • Loading branch information
RaymondWang0 committed May 23, 2023
1 parent c92a22d commit 6eeced7
Show file tree
Hide file tree
Showing 6 changed files with 600 additions and 155 deletions.
5 changes: 4 additions & 1 deletion experimental/transformer/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ CXX = g++
CXXFLAGS = -std=c++17 -mavx2 -pthread -O3

# Executable and source files
TARGET = test_ops test_Int8OPTAttention test_Int8OPTDecoderLayer test_Int8OPTDecoder test_OPTForCausalLM profile_OPTForCausalLM test_OPTTokenizer
TARGET = test_ops test_Int8OPTAttention test_Int8OPTDecoderLayer test_Int8OPTDecoder test_OPTForCausalLM profile_OPTForCausalLM test_OPTTokenizer test_OPTGenerate

LIB_DIR = ../matmul_optimization/src
LIB_SRC = $(wildcard $(LIB_DIR)/lib/*.cc)
Expand Down Expand Up @@ -42,6 +42,9 @@ profile_OPTForCausalLM:
test_OPTTokenizer:
$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -D PROFILER -o test_OPTTokenizer tests/test_OPTTokenizer.cc $(SRC)

test_OPTGenerate:
$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -D PROFILER -o test_OPTGenerate tests/test_OPTGenerate.cc $(SRC)

# Clean up
clean:
rm -f $(TARGET)
131 changes: 0 additions & 131 deletions experimental/transformer/include/OPT.h

This file was deleted.

120 changes: 114 additions & 6 deletions experimental/transformer/include/OPTGenerate.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,119 @@
#include <cassert>
#include <unordered_map>
#include <random>
#include <algorithm>
#include <iostream>

#include "OPTTokenizer.h"
#include "OPTForCausalLM.h"
#include "operators.h"
#include "utils.h"

void OPTGenerate(std::vector<int> embd_inp,
const bool is_interacting,
const struct opt_params params,
const std::vector<float> _logits,
const OPT_vocab & vocab);
inline std::mt19937 OPT_rng;

// Special-token ids for the OPT vocabulary.
// Beginning-of-sequence token id.
inline int OPT_token_bos() { return 0; }
// End-of-sequence token id.
inline int OPT_token_eos() { return 2; }
// Newline token id. TODO: To be checked
inline int OPT_token_nl() { return 3; }

// One candidate token together with its raw logit and its probability.
// (In C++ the C-style `typedef struct` is redundant; a plain struct
// declares the same type name.)
struct OPT_token_data {
int id;      // token id
float logit; // log-odds of the token
float p;     // probability of the token
};

// A span of candidate tokens. `sorted` records whether the entries in
// `data` are currently sorted (ordering criterion defined by the
// sampling routines that set it).
struct OPT_token_data_array {
OPT_token_data* data; // pointer to `size` candidate entries
size_t size;          // number of entries
bool sorted;          // true once a sampler has sorted `data`
};

// Generation / sampling configuration consumed by OPTGenerate.
// NOTE(review): the field set and defaults closely mirror llama.cpp's
// `gpt_params`; several fields (lora_*, use_mmap, perplexity, ...) may be
// carried over but unused by the OPT pipeline — confirm against OPTGenerate.cc.
struct opt_params {
int32_t seed = -1; // RNG seed
// int32_t n_threads = get_num_physical_cores(); // TODO: fix this
int32_t n_threads = 1;
int32_t n_predict = 128; // new tokens to predict
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_vocab = 50272; // vocabulary size

// sampling parameters
std::unordered_map<int, float> logit_bias; // logit bias for specific tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled (tail-free sampling)
float typical_p = 1.00f; // 1.0 = disabled (locally typical sampling)
float temp = 0.80f; // 1.0 = disabled
float repeat_penalty = 1.10f; // 1.0 = disabled
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float frequency_penalty = 0.00f; // 0.0 = disabled
float presence_penalty = 0.00f; // 0.0 = disabled
int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate

// NOTE(review): "lamma" looks like a typo for "llama" — left unchanged
// because it is a runtime default path; confirm the shipped directory name.
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
std::string prompt = "";
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter

bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool prompt_cache_all = false; // save user input and generations to prompt cache

bool embedding = false; // get only sentence embedding
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`

bool instruct = false; // instruction mode (used for Alpaca models)
bool penalize_nl = true; // consider newlines as a repeatable token
bool perplexity = false; // compute perplexity over the prompt
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool verbose_prompt = false; // print prompt tokens before generation
};

// Sampling primitives operating on an OPT_token_data_array of candidates.
// Declarations only — definitions are not in this header. The API mirrors
// llama.cpp's llama_sample_* family; the one-line summaries below are
// inferred from the names and parameters — confirm against the definitions.

// Penalizes candidates whose ids appear in the last `last_tokens_size` tokens.
void OPT_sample_repetition_penalty(OPT_token_data_array* candidates, const int* last_tokens, size_t last_tokens_size,
float penalty);

// Applies frequency and presence penalties based on recent token counts.
void OPT_sample_frequency_and_presence_penalties(OPT_token_data_array* candidates, const int* last_tokens_p,
size_t last_tokens_size, float alpha_frequency, float alpha_presence);

// Returns the id of the highest-scoring candidate (greedy / argmax).
int OPT_sample_token_greedy(OPT_token_data_array* candidates);

// Scales candidate logits by 1/temp.
void OPT_sample_temperature(OPT_token_data_array* candidates_p, float temp);

// Converts candidate logits into normalized probabilities.
void OPT_sample_softmax(OPT_token_data_array* candidates);

// Samples one token id from the candidates' probability distribution
// (presumably using OPT_rng — confirm in the definition).
int OPT_sample_token(OPT_token_data_array* candidates);

// Keeps the top k candidates, retaining at least `min_keep`.
void OPT_sample_top_k(OPT_token_data_array* candidates, int k, size_t min_keep);

// Mirostat (v1) sampling; `mu` is in-out state updated with learning rate eta.
int OPT_sample_token_mirostat(const int n_vocab, OPT_token_data_array* candidates, float tau, float eta, int m,
float* mu);

// Mirostat 2.0 sampling; `mu` is in-out state.
int OPT_sample_token_mirostat_v2(OPT_token_data_array* candidates, float tau, float eta, float* mu);

// Tail-free sampling with parameter z; retains at least `min_keep`.
void OPT_sample_tail_free(OPT_token_data_array* candidates, float z, size_t min_keep);

// Locally typical sampling with parameter p; retains at least `min_keep`.
void OPT_sample_typical(OPT_token_data_array* candidates, float p, size_t min_keep);

// Nucleus (top-p) sampling; retains at least `min_keep`.
void OPT_sample_top_p(OPT_token_data_array* candidates, float p, size_t min_keep);


// Generates a sequence of token ids starting from `input_ids`, driven by the
// sampling settings in `generation_config`; returns the resulting token ids.
// NOTE(review): declaration only — confirm exact semantics (e.g. whether the
// prompt tokens are included in the return value) against the definition.
std::vector<int> OPTGenerate(std::vector<int> input_ids,
const struct opt_params generation_config);
17 changes: 0 additions & 17 deletions experimental/transformer/include/OPTTokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,23 +25,6 @@

//std::vector<int> OPT_tokenize(const OPT_vocab & vocab, const std::string & text, bool add_bos);


/* TODO */
inline int n_ctx = 1024;
inline std::vector<int> last_n_tokens(n_ctx);

inline int OPT_token_bos() {
return 1;
}

inline int OPT_token_eos() {
return 2;
}

inline int OPT_token_nl() {
return 13;
}

struct pair_hash {
template <class T1, class T2>
std::size_t operator () (const std::pair<T1,T2> &p) const {
Expand Down
Loading

0 comments on commit 6eeced7

Please sign in to comment.