Merged
Changes from all commits (37 commits)
68504f0
readme : update games list (#8673)
MorganRO8 Jul 24, 2024
8a4bad5
llama: use sliding window for phi3 (#8627)
FanShupei Jul 25, 2024
4b0eff3
docs : Quantum -> Quantized (#8666)
Ujjawal-K-Panchal Jul 25, 2024
be6d7c0
examples : remove `finetune` and `train-text-from-scratch` (#8669)
ngxson Jul 25, 2024
eddcb52
ggml : add and use ggml_cpu_has_llamafile() (#8664)
ggerganov Jul 25, 2024
ed67bcb
[SYCL] fix multi-gpu issue on sycl (#8554)
ClarkChin08 Jul 25, 2024
9f2076b
fix rocminfo error
LostRuins Jul 25, 2024
88954f7
tests : fix printfs (#8068)
ggerganov Jul 25, 2024
bf5a81d
ggml : fix build on Windows with Snapdragon X (#8531)
AndreasKunar Jul 25, 2024
4226a8d
llama : fix build + fix fabs compile warnings (#8683)
ggerganov Jul 25, 2024
49ce0ab
ggml: handle ggml_init failure to fix NULL pointer deref (#8692)
DavidKorczynski Jul 25, 2024
41cd47c
examples : export-lora : fix issue with quantized base models (#8687)
ngxson Jul 25, 2024
01aec4a
server : add Speech Recognition & Synthesis to UI (#8679)
ElYaiko Jul 25, 2024
01245f5
llama : fix order of parameters (#8706)
foldl Jul 26, 2024
4531ab5
refactor some fields
LostRuins Jul 26, 2024
2b1f616
ggml : reduce hash table reset cost (#8698)
slaren Jul 27, 2024
729eb1e
no fast forward for empty prompt
LostRuins Jul 27, 2024
bfb4c74
cann: Fix Multi-NPU execution error (#8710)
wangshuai09 Jul 27, 2024
9d03d08
common : add --no-warmup option for main/llama-cli (#8712)
danbev Jul 27, 2024
92090ec
llama : add function for model-based max number of graph nodes (#8622)
ggerganov Jul 27, 2024
b5e9546
llama : add support for llama 3.1 rope scaling factors (#8676)
jmorganca Jul 27, 2024
eaa7028
increased padding, it is still way too little but whatever
LostRuins Jul 27, 2024
c12b6e8
ggml : remove unnecessary UNUSED macro call (ggml/880)
danbev Jul 8, 2024
d2b851b
cmake : only enable GGML_NATIVE and x86 flags if not crosscompiling (…
iboB Jul 12, 2024
203b7f1
vulkan : initialize vk_buffer_struct members to VK_NULL_HANDLE (ggml/…
neobrain Jul 20, 2024
9f77d89
ggml: add support for float16 input tensors in pooling operations (gg…
vanaka11 Jul 22, 2024
a05ca93
ggml : loop tiling optimizations for scalar path (ggml/898)
heshpdx Jul 25, 2024
ae7985c
sync : ggml
ggerganov Jul 27, 2024
345c8c0
ggml : add missing semicolon (#0)
ggerganov Jul 27, 2024
56f20aa
scripts : sync ggml-aarch64 sources
ggerganov Jul 27, 2024
5e2727f
scripts : sync vulkan-shaders (#0)
ggerganov Jul 27, 2024
ba5babb
Merge branch 'upstream' into concedo_experimental
LostRuins Jul 27, 2024
e54c35e
feat: Support Moore Threads GPU (#8383)
yeahdongcn Jul 27, 2024
01afb28
not working
LostRuins Jul 28, 2024
0029e36
fix for older phi3 models without swa
LostRuins Jul 28, 2024
edbdfbc
Revert "cu11 build threads"
LostRuins Jul 28, 2024
e47477f
don't build rope factors from https://github.com/ggerganov/llama.cpp/…
LostRuins Jul 28, 2024
5 changes: 5 additions & 0 deletions common/common.cpp
@@ -1325,6 +1325,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
else { invalid_param = true; }
return true;
}
if (arg == "--no-warmup") {
params.warmup = false;
return true;
}
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
if (log_param_single_parse(argv[i])) {
@@ -1447,6 +1451,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
options.push_back({ "server infill",
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });

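For context, the new `--no-warmup` flag simply leaves `params.warmup` set to `false`, so the usual empty warm-up decode is skipped before generation starts. A minimal usage sketch (the model path and prompt below are placeholders, not taken from this PR):

```bash
# run llama-cli without the initial empty warm-up pass
./llama-cli -m ./models/model.gguf -p "Hello" --no-warmup
```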
29 changes: 29 additions & 0 deletions convert_hf_to_gguf.py
@@ -1570,6 +1570,34 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
return [(self.map_tensor_name(name), data_torch)]

def prepare_tensors(self):
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
if rope_scaling.get("rope_type", '').lower() == "llama3":
base = self.hparams.get("rope_theta", 10000.0)
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

factor = rope_scaling.get("factor", 8.0)
low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
assert low_freq_wavelen != high_freq_wavelen

rope_factors = []
for freq in freqs:
wavelen = 2 * math.pi / freq
if wavelen < high_freq_wavelen:
rope_factors.append(1)
elif wavelen > low_freq_wavelen:
rope_factors.append(factor)
else:
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
rope_factors.append(1 / ((1 - smooth) / factor + smooth))

self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))

super().prepare_tensors()

if self._experts is not None:
@@ -2084,6 +2112,7 @@ def set_gguf_parameters(self):
self.gguf_writer.add_rope_dimension_count(rope_dims)
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))

# write rope scaling for long context (128k) model
rope_scaling = self.find_hparam(['rope_scaling'], True)
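For reference, the `prepare_tensors` change above writes one scaling factor per RoPE frequency when `rope_scaling.rope_type == "llama3"`. A sketch of the same piecewise rule in plain math, with θ_j the RoPE frequencies, L the original context length (`original_max_position_embeddings`), F the scaling `factor`, and f_lo / f_hi the low/high frequency factors (defaults in the diff: F = 8, f_lo = 1, f_hi = 4, L = 8192):

```latex
% wavelength of each RoPE frequency and the two cutoffs
\lambda_j = \frac{2\pi}{\theta_j}, \qquad
\lambda_{\text{lo}} = \frac{L}{f_{\text{lo}}}, \qquad
\lambda_{\text{hi}} = \frac{L}{f_{\text{hi}}}

% factor stored in the rope_freqs tensor for frequency j
s_j =
\begin{cases}
1 & \lambda_j < \lambda_{\text{hi}} \\[2pt]
F & \lambda_j > \lambda_{\text{lo}} \\[2pt]
\left( \dfrac{1 - \sigma_j}{F} + \sigma_j \right)^{-1} & \text{otherwise},
\quad \sigma_j = \dfrac{L/\lambda_j - f_{\text{lo}}}{f_{\text{hi}} - f_{\text{lo}}}
\end{cases}
```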
2 changes: 0 additions & 2 deletions examples/deprecation-warning/README.md
@@ -13,7 +13,6 @@ Please update all scripts and workflows to use the new binary names.
| server | llama-server |
| llama-bench | llama-bench |
| embedding | llama-embedding |
| finetune | llama-finetune |
| quantize | llama-quantize |
| tokenize | llama-tokenize |
| export-lora | llama-export-lora |
@@ -45,7 +44,6 @@ Please update all scripts and workflows to use the new binary names.
| save-load-state | llama-save-load-state |
| simple | llama-simple |
| speculative | llama-speculative |
| train-text-from-scratch | llama-train-text-from-scratch |
| vdot | llama-vdot |
| tests/test-c.o | tests/test-c.o |

2 changes: 1 addition & 1 deletion examples/eval-callback/eval-callback.cpp
@@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
printf("%12.4f", v);
sum += v;
12 changes: 10 additions & 2 deletions examples/export-lora/README.md
@@ -19,7 +19,15 @@ For example:
./bin/llama-export-lora \
-m open-llama-3b-v2-q8_0.gguf \
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
--lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
--lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
```

Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters.
Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:

```bash
./bin/llama-export-lora \
-m your_base_model.gguf \
-o your_merged_model.gguf \
--lora-scaled lora_task_A.gguf 0.5 \
--lora-scaled lora_task_B.gguf 0.5
```
64 changes: 39 additions & 25 deletions examples/export-lora/export-lora.cpp
@@ -211,8 +211,9 @@ struct lora_merge_ctx {
}
}

// if true, this tensor can be lora-merged. if false, we skip merging and just copy data to outfile
std::vector<std::pair<struct ggml_tensor *, bool>> base_tensors;
// mapping base tensor to out tensor (same shape with base, but different type)
// if out_tensor == nullptr, we only copy it
std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
for (auto & it : base_model.tensors) {
bool t_a = true;
bool t_b = true;
@@ -221,22 +222,22 @@ struct lora_merge_ctx {
t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
}
auto base_tensor = it.second;
struct ggml_tensor * out_tensor;
if (!t_a && !t_b) {
// only copy
out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
ggml_set_name(out_tensor, base_tensor->name);
base_tensors.push_back(std::make_pair(out_tensor, false));
struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
ggml_set_name(cpy_tensor, base_tensor->name);
base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
gguf_add_tensor(ctx_out, cpy_tensor);
} else if (t_a && t_b) {
// need merging
out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
out_tensor->type = get_out_tensor_type(base_tensor);
struct ggml_tensor * out_tensor = ggml_new_tensor(
ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
ggml_set_name(out_tensor, base_tensor->name);
base_tensors.push_back(std::make_pair(out_tensor, true));
base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
gguf_add_tensor(ctx_out, out_tensor);
} else {
throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
}
gguf_add_tensor(ctx_out, out_tensor);
}

// placeholder for the meta data
@@ -247,9 +248,9 @@ struct lora_merge_ctx {

// process base model tensors
size_t n_merged = 0;
for (auto & it : base_tensors) {
if (it.second) {
merge_tensor(it.first);
for (auto & it : base_to_out_tensors) {
if (it.second != nullptr) {
merge_tensor(it.first, it.second);
n_merged++;
} else {
copy_tensor(it.first);
@@ -265,7 +266,7 @@ struct lora_merge_ctx {
}

printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
printf("%s : wrote %ld tensors to output file\n", __func__, base_tensors.size());
printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
}

void copy_tensor(struct ggml_tensor * base) {
@@ -276,7 +277,7 @@ struct lora_merge_ctx {
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
}

void merge_tensor(struct ggml_tensor * base) {
void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
std::string name_base(base->name);
std::string name_lora_a = name_base + ".lora_a";
std::string name_lora_b = name_base + ".lora_b";
@@ -287,14 +288,14 @@ struct lora_merge_ctx {
std::vector<struct ggml_tensor *> inp_a(adapters.size());
std::vector<struct ggml_tensor *> inp_b(adapters.size());
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead()*(1+adapters.size()*2),
/*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
struct ggml_context * ctx = ggml_init(params);

// alloc tensors
struct ggml_tensor * inp = ggml_dup_tensor(ctx, base);
struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
for (size_t i = 0; i < adapters.size(); ++i) {
auto t_a = adapters[i]->get_tensor(name_lora_a);
auto t_b = adapters[i]->get_tensor(name_lora_b);
@@ -303,9 +304,21 @@ struct lora_merge_ctx {
}
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);

// load data to backend buffer
// load base tensor to backend buffer
base_model.read_tensor_data(name_base, read_buf);
ggml_backend_tensor_set(inp, read_buf.data(), 0, ggml_nbytes(inp));
if (base->type != GGML_TYPE_F32) {
// optionally dequantize it
printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
auto nels = ggml_nelements(inp_base);
ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
} else {
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
}

// load lora tensors to backend buffer
for (size_t i = 0; i < adapters.size(); ++i) {
adapters[i]->read_tensor_data(name_lora_a, read_buf);
ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
@@ -325,20 +338,21 @@ struct lora_merge_ctx {
};
struct ggml_context * ctx0 = ggml_init(params0);
gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur = inp;
struct ggml_tensor * cur = inp_base;
for (size_t i = 0; i < adapters.size(); ++i) {
struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, inp_a[i]));
struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, inp_b[i]);
struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
// scale
const float alpha = adapters[i]->alpha;
const float rank = (float) inp_b[i]->ne[0];
const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
delta = ggml_scale(ctx0, delta, scale);
cur = ggml_add(ctx0, cur, delta);
printf("%s : + merging from adapter[%ld]\n", __func__, i);
cur = ggml_add(ctx0, delta, cur);
printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
}
cur = ggml_cast(ctx0, cur, get_out_tensor_type(base));
cur = ggml_cast(ctx0, cur, out->type);
printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type));
ggml_build_forward_expand(gf, cur);
ggml_free(ctx0);
}
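Conceptually, the graph built in `merge_tensor` above performs the standard LoRA merge on a dequantized F32 copy of the base weight. Writing A_i, B_i for adapter i's `lora_a`/`lora_b` pair, r_i for its rank, α_i for its alpha, and s_i for the user scale from `--lora-scaled`, this is a sketch of the math rather than the literal ggml graph (exact `ggml_mul_mat` tensor orientations aside):

```latex
% merged weight: base plus the scaled low-rank updates, cast to the output type
W_{\text{out}} \;=\; \operatorname{cast}_{\text{out type}}\!\Bigl( W_{\text{base}} + \sum_i c_i \, B_i A_i \Bigr),
\qquad
c_i =
\begin{cases}
s_i \, \alpha_i / r_i & \alpha_i \neq 0 \\[2pt]
s_i & \text{otherwise}
\end{cases}
```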
90 changes: 0 additions & 90 deletions examples/finetune/README.md

This file was deleted.
