/
index.html
196 lines (171 loc) · 7.23 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<!-- Use the device width as the viewport width, at an initial zoom of 1. -->
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>wllama.cpp demo</title>
<style>
/* Dark grey page with light grey monospace text; the demo script appends its log lines into #output. */
body {
background-color: rgb(55, 55, 55);
color: rgb(222, 222, 222);
font-family: 'Courier New', Courier, monospace;
padding: 1em;
}
</style>
</head>
<body>
<div id="output"></div>
<script type="module">
import { Wllama, LoggerWithoutDebug } from '../../esm/index.js';
// Asset-name -> URL table handed to the Wllama constructor. Every asset lives
// under ../../esm/ at the same relative path as its name, so the table is
// derived from the list of names instead of being spelled out twice.
const CONFIG_PATHS = Object.fromEntries(
  [
    'single-thread/wllama.js',
    'single-thread/wllama.wasm',
    'multi-thread/wllama.js',
    'multi-thread/wllama.wasm',
    'multi-thread/wllama.worker.mjs',
  ].map((assetName) => [assetName, `../../esm/${assetName}`])
);
// Single-file model used by the first part of the demo.
const MODEL = 'https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories15M-q4_0.gguf';

// The same tiny model split into three shards named "<index>-of-00003", with
// the index zero-padded to five digits.
const MODEL_SPLITS = Array.from({ length: 3 }, (_, shardIdx) => {
  const shardNo = String(shardIdx + 1).padStart(5, '0');
  return `https://huggingface.co/ngxson/tinyllama_split_test/resolve/main/stories15M-q8_0-${shardNo}-of-00003.gguf`;
});

// Or, try loading a bigger model (1.3GB in total): replace the MODEL_SPLITS
// declaration above with the one below.
/*const MODEL_SPLITS = [
  'https://huggingface.co/ngxson/test_gguf_models/resolve/main/neuralreyna-mini-1.8b-v0.3.q4_k_m-00001-of-00005.gguf',
  'https://huggingface.co/ngxson/test_gguf_models/resolve/main/neuralreyna-mini-1.8b-v0.3.q4_k_m-00002-of-00005.gguf',
  'https://huggingface.co/ngxson/test_gguf_models/resolve/main/neuralreyna-mini-1.8b-v0.3.q4_k_m-00003-of-00005.gguf',
  'https://huggingface.co/ngxson/test_gguf_models/resolve/main/neuralreyna-mini-1.8b-v0.3.q4_k_m-00004-of-00005.gguf',
  'https://huggingface.co/ngxson/test_gguf_models/resolve/main/neuralreyna-mini-1.8b-v0.3.q4_k_m-00005-of-00005.gguf',
];*/
/**
 * Run the end-to-end wllama demo and report every step on the page.
 *
 * Steps, in order: load the single-file model, initialize sampling, tokenize
 * and detokenize a sample text, evaluate a prompt, generate tokens at two
 * temperatures, list candidates for the next token, then load the split model
 * with a second Wllama instance and run a completion on it.
 *
 * @returns {Promise<void>} Settles once every step has finished; rejects if any
 *   awaited wllama call rejects.
 */
async function main() {
  let tokens;
  let elapsed;

  // Shared by both model loads: log download progress to the console as a percentage.
  const logProgress = ({ loaded, total }) =>
    console.log(`Downloading... ${Math.round(loaded/total*100)}%`);

  // Whole tokens per second. Date.now() has millisecond resolution, so a very
  // fast step can measure as 0 ms; treat that as 1 ms instead of dividing by zero.
  const tokensPerSecond = (count, elapsedMs) =>
    Math.floor((count / Math.max(elapsedMs, 1)) * 1000);

  const wllama = new Wllama(CONFIG_PATHS, {
    logger: LoggerWithoutDebug,
  });

  print(`Loading model ${MODEL}`);
  timeStart();
  await wllama.loadModelFromUrl(MODEL, {
    embeddings: true,
    n_ctx: 1024,
    progressCallback: logProgress,
  });
  print(`Loaded, take ${timeEnd()} ms`);
  print(`Metadata = ${JSON.stringify(wllama.getModelMetadata().meta, null, 2)}`);
  print(`BOS token = ${wllama.getBOS()}`);
  print(`EOS token = ${wllama.getEOS()}`);

  print(`\n--------------\n`);

  // top_p and top_k must be two distinct keys: a repeated key in an object
  // literal silently overwrites the earlier value.
  const CONFIG_SAMPLING = {
    temp: 0.2,
    top_p: 0.95,
    top_k: 40,
  };
  print(`Initialize sampling with params: ${JSON.stringify(CONFIG_SAMPLING)}`);
  await wllama.samplingInit(CONFIG_SAMPLING);

  print(`\n--------------\n`);

  const TEXT_TO_TOKENIZE = 'Once Upon a Time is an American fantasy adventure drama television series that aired for seven seasons on ABC from October 23, 2011, to May 18, 2018. The action alternates between two main settings';
  print(`Tokenize "${TEXT_TO_TOKENIZE}"`);
  timeStart();
  tokens = await wllama.tokenize(TEXT_TO_TOKENIZE);
  elapsed = timeEnd();
  print(`Time taken ${elapsed} ms, speed = ${tokensPerSecond(tokens.length, elapsed)}t/s`);
  print(`Result: [${tokens.join(', ')}]`);

  print(`\n--------------\n`);

  print(`Detokenize the result above`);
  // The result of detokenize() is passed through TextDecoder before printing.
  const textDecoder = new TextDecoder();
  timeStart();
  const buffer = await wllama.detokenize(tokens);
  elapsed = timeEnd();
  print(`Time taken ${elapsed} ms, speed = ${tokensPerSecond(tokens.length, elapsed)}t/s`);
  print(`Result: "${textDecoder.decode(buffer)}"`);

  print(`\n--------------\n`);

  const INPUT_PROMPT = 'Once upon a time';
  tokens = await wllama.tokenize(INPUT_PROMPT);
  print(`Evaluate the prompt: "${INPUT_PROMPT}" (${tokens.length} tokens)`);
  timeStart();
  await wllama.decode(tokens, {});
  elapsed = timeEnd();
  print(`Time taken ${elapsed} ms, speed = ${tokensPerSecond(tokens.length, elapsed)}t/s`);

  const N_PREDICT = 10;
  print(`Generate next ${N_PREDICT} tokens:`);

  // Sample N_PREDICT tokens one at a time: print each sampled piece, decode the
  // token, and append it to `tokens` so the full history stays available.
  async function genNTokens() {
    for (let i = 0; i < N_PREDICT; i++) {
      const { piece, token } = await wllama.samplingSample();
      print(`Token ID ${token} => "${textDecoder.decode(piece)}"`);
      await wllama.decode([token], {});
      tokens.push(token);
    }
  }

  timeStart();
  await genNTokens();
  elapsed = timeEnd();
  print(`Time taken ${elapsed} ms, speed = ${tokensPerSecond(N_PREDICT, elapsed)}t/s`);

  print(`\n--------------\n`);

  const NEW_TEMPERATURE = 0.9;
  print(`Change sampling temperature to ${NEW_TEMPERATURE}`);
  await wllama.samplingInit(
    {
      ...CONFIG_SAMPLING,
      temp: NEW_TEMPERATURE, // set new temperature
    },
    tokens // don't forget to give it all the past tokens, since we're creating a new sampling context
  );
  print(`Generate next ${N_PREDICT} tokens with temperature ${NEW_TEMPERATURE}`);
  await genNTokens();

  print(`\n--------------\n`);

  print(`Get logits of next token:`);
  const logits = await wllama.getLogits(10);
  for (const candidate of logits) {
    const word = textDecoder.decode(await wllama.detokenize([candidate.token]));
    print(`Token ID ${candidate.token}, probability = ${candidate.p} ==> "${word}"`);
  }

  print(`\n--------------\n`);

  // The split model is loaded into a second, independent Wllama instance.
  const wllamaSplits = new Wllama(CONFIG_PATHS);
  print(`Loading split model...`);
  timeStart();
  await wllamaSplits.loadModelFromUrl(MODEL_SPLITS, {
    embeddings: true,
    n_ctx: 1024,
    parallelDownloads: 5,
    progressCallback: logProgress,
  });
  print(`Loaded, take ${timeEnd()} ms`);
  print(`Test generating text...`);
  const output = await wllamaSplits.createCompletion(TEXT_TO_TOKENIZE, {
    nPredict: 20,
  });
  print(output);

  // const debugInfo = await wllamaSplits._getDebugInfo();
  // console.log(debugInfo)
  // print(JSON.stringify(debugInfo, null, 2));
}
/////////////////////////////////////////////////////////////////////
// Container that receives every line written by print().
const elemOutput = document.getElementById('output');

/**
 * Append one message to the page as its own <div>, then scroll to the bottom.
 *
 * @param {string} message - Text to show; assigned through innerText.
 * @param {boolean} [bold] - When truthy, the text is wrapped in a <b> element.
 */
function print(message, bold) {
  const line = document.createElement('div');
  // The text goes either straight into the line or into a <b> child of it.
  const textHolder = bold
    ? line.appendChild(document.createElement('b'))
    : line;
  textHolder.innerText = message;
  elemOutput.appendChild(line);

  // Shortly after appending, smooth-scroll so the bottom of the page is in view.
  setTimeout(() => {
    const bottom = document.documentElement.scrollHeight - window.innerHeight;
    window.scrollTo({ top: bottom, left: 0, behavior: 'smooth' });
  }, 10);
}
// Timestamp (ms since the epoch) recorded by the most recent timeStart() call.
let __startTime = 0;

/** Start (or restart) the shared stopwatch. */
function timeStart() {
  __startTime = Date.now();
}

/**
 * Read the shared stopwatch without resetting it.
 *
 * @returns {number} Milliseconds elapsed since the last timeStart() call.
 */
function timeEnd() {
  const elapsedMs = Date.now() - __startTime;
  return elapsedMs;
}
// Run the demo. A rejected step (failed download, model load error, ...) is
// logged to the console and also printed in bold on the page, instead of being
// left as an unhandled promise rejection.
main().catch((err) => {
  console.error(err);
  print(`Error: ${err instanceof Error ? err.message : String(err)}`, true);
});
</script>
</body>
</html>