Skip to content

Commit

Permalink
support generate
Browse the repository at this point in the history
  • Loading branch information
RaymondWang0 committed May 23, 2023
1 parent c92a22d commit 6eeced7
Show file tree
Hide file tree
Showing 6 changed files with 600 additions and 155 deletions.
5 changes: 4 additions & 1 deletion experimental/transformer/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ CXX = g++
CXXFLAGS = -std=c++17 -mavx2 -pthread -O3

# Executable and source files
TARGET = test_ops test_Int8OPTAttention test_Int8OPTDecoderLayer test_Int8OPTDecoder test_OPTForCausalLM profile_OPTForCausalLM test_OPTTokenizer
TARGET = test_ops test_Int8OPTAttention test_Int8OPTDecoderLayer test_Int8OPTDecoder test_OPTForCausalLM profile_OPTForCausalLM test_OPTTokenizer test_OPTGenerate

LIB_DIR = ../matmul_optimization/src
LIB_SRC = $(wildcard $(LIB_DIR)/lib/*.cc)
Expand Down Expand Up @@ -42,6 +42,9 @@ profile_OPTForCausalLM:
test_OPTTokenizer:
$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -D PROFILER -o test_OPTTokenizer tests/test_OPTTokenizer.cc $(SRC)

test_OPTGenerate:
$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -D PROFILER -o test_OPTGenerate tests/test_OPTGenerate.cc $(SRC)

# Clean up
clean:
rm -f $(TARGET)
131 changes: 0 additions & 131 deletions experimental/transformer/include/OPT.h

This file was deleted.

120 changes: 114 additions & 6 deletions experimental/transformer/include/OPTGenerate.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,119 @@
#include <cassert>
#include <unordered_map>
#include <random>
#include <algorithm>
#include <iostream>

#include "OPTTokenizer.h"
#include "OPTForCausalLM.h"
#include "operators.h"
#include "utils.h"

void OPTGenerate(std::vector<int> embd_inp,
const bool is_interacting,
const struct opt_params params,
const std::vector<float> _logits,
const OPT_vocab & vocab);
inline std::mt19937 OPT_rng;

// Special-token ids for the OPT vocabulary.
// Beginning-of-sequence token id.
inline int OPT_token_bos() { return 0; }
// End-of-sequence token id.
inline int OPT_token_eos() { return 2; }
// Newline token id. TODO: To be checked
inline int OPT_token_nl() { return 3; }

// One candidate token together with its raw logit and its probability.
// (In C++ the C-style `typedef struct` is redundant; a plain struct
// declares the same type name.)
struct OPT_token_data {
int id;      // token id
float logit; // log-odds of the token
float p;     // probability of the token
};

// A span of candidate tokens. `sorted` records whether the entries in
// `data` are currently sorted (ordering criterion defined by the
// sampling routines that set it).
struct OPT_token_data_array {
OPT_token_data* data; // pointer to `size` candidate entries
size_t size;          // number of entries
bool sorted;          // true once a sampler has sorted `data`
};

// Generation / sampling configuration consumed by OPTGenerate.
// NOTE(review): the field set and defaults closely mirror llama.cpp's
// `gpt_params`; several fields (lora_*, use_mmap, perplexity, ...) may be
// carried over but unused by the OPT pipeline — confirm against OPTGenerate.cc.
struct opt_params {
int32_t seed = -1; // RNG seed
// int32_t n_threads = get_num_physical_cores(); // TODO: fix this
int32_t n_threads = 1;
int32_t n_predict = 128; // new tokens to predict
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_vocab = 50272; // vocabulary size

// sampling parameters
std::unordered_map<int, float> logit_bias; // logit bias for specific tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled (tail-free sampling)
float typical_p = 1.00f; // 1.0 = disabled (locally typical sampling)
float temp = 0.80f; // 1.0 = disabled
float repeat_penalty = 1.10f; // 1.0 = disabled
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float frequency_penalty = 0.00f; // 0.0 = disabled
float presence_penalty = 0.00f; // 0.0 = disabled
int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate

// NOTE(review): "lamma" looks like a typo for "llama" — left unchanged
// because it is a runtime default path; confirm the shipped directory name.
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
std::string prompt = "";
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter

bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool prompt_cache_all = false; // save user input and generations to prompt cache

bool embedding = false; // get only sentence embedding
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`

bool instruct = false; // instruction mode (used for Alpaca models)
bool penalize_nl = true; // consider newlines as a repeatable token
bool perplexity = false; // compute perplexity over the prompt
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool verbose_prompt = false; // print prompt tokens before generation
};

// Sampling primitives operating on an OPT_token_data_array of candidates.
// Declarations only — definitions are not in this header. The API mirrors
// llama.cpp's llama_sample_* family; the one-line summaries below are
// inferred from the names and parameters — confirm against the definitions.

// Penalizes candidates whose ids appear in the last `last_tokens_size` tokens.
void OPT_sample_repetition_penalty(OPT_token_data_array* candidates, const int* last_tokens, size_t last_tokens_size,
float penalty);

// Applies frequency and presence penalties based on recent token counts.
void OPT_sample_frequency_and_presence_penalties(OPT_token_data_array* candidates, const int* last_tokens_p,
size_t last_tokens_size, float alpha_frequency, float alpha_presence);

// Returns the id of the highest-scoring candidate (greedy / argmax).
int OPT_sample_token_greedy(OPT_token_data_array* candidates);

// Scales candidate logits by 1/temp.
void OPT_sample_temperature(OPT_token_data_array* candidates_p, float temp);

// Converts candidate logits into normalized probabilities.
void OPT_sample_softmax(OPT_token_data_array* candidates);

// Samples one token id from the candidates' probability distribution
// (presumably using OPT_rng — confirm in the definition).
int OPT_sample_token(OPT_token_data_array* candidates);

// Keeps the top k candidates, retaining at least `min_keep`.
void OPT_sample_top_k(OPT_token_data_array* candidates, int k, size_t min_keep);

// Mirostat (v1) sampling; `mu` is in-out state updated with learning rate eta.
int OPT_sample_token_mirostat(const int n_vocab, OPT_token_data_array* candidates, float tau, float eta, int m,
float* mu);

// Mirostat 2.0 sampling; `mu` is in-out state.
int OPT_sample_token_mirostat_v2(OPT_token_data_array* candidates, float tau, float eta, float* mu);

// Tail-free sampling with parameter z; retains at least `min_keep`.
void OPT_sample_tail_free(OPT_token_data_array* candidates, float z, size_t min_keep);

// Locally typical sampling with parameter p; retains at least `min_keep`.
void OPT_sample_typical(OPT_token_data_array* candidates, float p, size_t min_keep);

// Nucleus (top-p) sampling; retains at least `min_keep`.
void OPT_sample_top_p(OPT_token_data_array* candidates, float p, size_t min_keep);


// Generates a sequence of token ids starting from `input_ids`, driven by the
// sampling settings in `generation_config`; returns the resulting token ids.
// NOTE(review): declaration only — confirm exact semantics (e.g. whether the
// prompt tokens are included in the return value) against the definition.
std::vector<int> OPTGenerate(std::vector<int> input_ids,
const struct opt_params generation_config);
17 changes: 0 additions & 17 deletions experimental/transformer/include/OPTTokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,23 +25,6 @@

//std::vector<int> OPT_tokenize(const OPT_vocab & vocab, const std::string & text, bool add_bos);


/* TODO */
inline int n_ctx = 1024;
inline std::vector<int> last_n_tokens(n_ctx);

inline int OPT_token_bos() {
return 1;
}

inline int OPT_token_eos() {
return 2;
}

inline int OPT_token_nl() {
return 13;
}

struct pair_hash {
template <class T1, class T2>
std::size_t operator () (const std::pair<T1,T2> &p) const {
Expand Down
Loading

0 comments on commit 6eeced7

Please sign in to comment.