examples/advanced/index.html

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>wllama.cpp demo</title>

  <style>
    body {
      background-color: rgb(55, 55, 55);
      color: rgb(222, 222, 222);
      font-family: 'Courier New', Courier, monospace;
      padding: 1em;
    }
  </style>
</head>
<body>

  <div id="output"></div>

  <script type="module">
    import { Wllama, LoggerWithoutDebug } from '../../esm/index.js';

    // Asset name -> URL map for the single-thread and multi-thread builds under
    // ../../esm; main() passes it as the first argument to the Wllama constructor.
    const CONFIG_PATHS = {
      'single-thread/wllama.js'       : '../../esm/single-thread/wllama.js',
      'single-thread/wllama.wasm'     : '../../esm/single-thread/wllama.wasm',
      'multi-thread/wllama.js'        : '../../esm/multi-thread/wllama.js',
      'multi-thread/wllama.wasm'      : '../../esm/multi-thread/wllama.wasm',
      'multi-thread/wllama.worker.mjs': '../../esm/multi-thread/wllama.worker.mjs',
    };
    // Single-file GGUF model that main() loads first.
    const MODEL = 'https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories15M-q4_0.gguf';
    // The three parts (00001..00003 of 00003) of a split GGUF model; main() passes
    // the whole array to loadModelFromUrl() on a second Wllama instance.
    const MODEL_SPLITS = [
      'https://huggingface.co/ngxson/tinyllama_split_test/resolve/main/stories15M-q8_0-00001-of-00003.gguf',
      'https://huggingface.co/ngxson/tinyllama_split_test/resolve/main/stories15M-q8_0-00002-of-00003.gguf',
      'https://huggingface.co/ngxson/tinyllama_split_test/resolve/main/stories15M-q8_0-00003-of-00003.gguf',
    ];

    // Or, try loading a bigger model (1.3GB in total)
    /*const MODEL_SPLITS = [
      'https://huggingface.co/ngxson/test_gguf_models/resolve/main/neuralreyna-mini-1.8b-v0.3.q4_k_m-00001-of-00005.gguf',
      'https://huggingface.co/ngxson/test_gguf_models/resolve/main/neuralreyna-mini-1.8b-v0.3.q4_k_m-00002-of-00005.gguf',
      'https://huggingface.co/ngxson/test_gguf_models/resolve/main/neuralreyna-mini-1.8b-v0.3.q4_k_m-00003-of-00005.gguf',
      'https://huggingface.co/ngxson/test_gguf_models/resolve/main/neuralreyna-mini-1.8b-v0.3.q4_k_m-00004-of-00005.gguf',
      'https://huggingface.co/ngxson/test_gguf_models/resolve/main/neuralreyna-mini-1.8b-v0.3.q4_k_m-00005-of-00005.gguf',
    ];*/
    /**
     * Run the whole demo, printing each step to the page via print().
     *
     * Steps, in order: load the single-file MODEL, print its metadata and
     * BOS/EOS tokens, initialize sampling, tokenize and detokenize a sample
     * text, evaluate a prompt and generate N_PREDICT tokens, re-initialize
     * sampling with a new temperature and generate again, print the result of
     * getLogits(10), then load MODEL_SPLITS into a second Wllama instance and
     * run createCompletion() on it.
     *
     * @returns {Promise<void>} Resolves once every step has finished; rejects
     *   if any awaited call rejects.
     */
    async function main() {
      let tokens;
      let elapsed;
      const textDecoder = new TextDecoder();

      // Both model loads report download progress to the console the same way.
      const logProgress = ({ loaded, total }) => console.log(`Downloading... ${Math.round(loaded/total*100)}%`);

      // Throughput for display. Elapsed time is clamped to 1 ms because a very
      // fast step can measure as 0 ms, which would otherwise print "Infinity".
      const tokensPerSecond = (count, elapsedMs) => Math.floor((count / Math.max(elapsedMs, 1)) * 1000);

      const wllama = new Wllama(CONFIG_PATHS, {
        logger: LoggerWithoutDebug,
      });

      print(`Loading model ${MODEL}`);
      timeStart();
      await wllama.loadModelFromUrl(MODEL, {
        embeddings: true,
        n_ctx: 1024,
        progressCallback: logProgress,
      });
      print(`Loaded, take ${timeEnd()} ms`);
      print(`Metadata = ${JSON.stringify(wllama.getModelMetadata().meta, null, 2)}`);
      print(`BOS token = ${wllama.getBOS()}`);
      print(`EOS token = ${wllama.getEOS()}`);

      print(`\n--------------\n`);

      // Note: the second entry must be top_k. Declaring top_p twice would let
      // the later value silently replace the first and leave top_k unset.
      const CONFIG_SAMPLING = {
        temp: 0.2,
        top_p: 0.95,
        top_k: 40,
      };
      print(`Initialize sampling with params: ${JSON.stringify(CONFIG_SAMPLING)}`);
      await wllama.samplingInit(CONFIG_SAMPLING);

      print(`\n--------------\n`);

      const TEXT_TO_TOKENIZE = 'Once Upon a Time is an American fantasy adventure drama television series that aired for seven seasons on ABC from October 23, 2011, to May 18, 2018. The action alternates between two main settings';
      print(`Tokenize "${TEXT_TO_TOKENIZE}"`);
      timeStart();
      tokens = await wllama.tokenize(TEXT_TO_TOKENIZE);
      elapsed = timeEnd();
      print(`Time taken ${elapsed} ms, speed = ${tokensPerSecond(tokens.length, elapsed)}t/s`);
      print(`Result: [${tokens.join(', ')}]`);

      print(`\n--------------\n`);

      print(`Detokenize the result above`);
      timeStart();
      const buffer = await wllama.detokenize(tokens);
      elapsed = timeEnd();
      print(`Time taken ${elapsed} ms, speed = ${tokensPerSecond(tokens.length, elapsed)}t/s`);
      print(`Result: "${textDecoder.decode(buffer)}"`);

      print(`\n--------------\n`);

      const INPUT_PROMPT = 'Once upon a time';
      tokens = await wllama.tokenize(INPUT_PROMPT);
      print(`Evaluate the prompt: "${INPUT_PROMPT}" (${tokens.length} tokens)`);
      timeStart();
      await wllama.decode(tokens, {});
      elapsed = timeEnd();
      print(`Time taken ${elapsed} ms, speed = ${tokensPerSecond(tokens.length, elapsed)}t/s`);
      const N_PREDICT = 10;
      print(`Generate next ${N_PREDICT} tokens:`);

      // Sample N_PREDICT tokens one at a time: print each, feed it back through
      // decode(), and append it to `tokens` so the history stays complete.
      async function genNTokens() {
        for (let i = 0; i < N_PREDICT; i++) {
          const { piece, token } = await wllama.samplingSample();
          print(`Token ID ${token} => "${textDecoder.decode(piece)}"`);
          await wllama.decode([token], {});
          tokens.push(token);
        }
      }
      timeStart();
      await genNTokens();
      elapsed = timeEnd();
      print(`Time taken ${elapsed} ms, speed = ${tokensPerSecond(N_PREDICT, elapsed)}t/s`);

      print(`\n--------------\n`);

      const NEW_TEMPERATURE = 0.9;
      print(`Change sampling temperature to ${NEW_TEMPERATURE}`);
      await wllama.samplingInit(
        {
          ...CONFIG_SAMPLING,
          temp: NEW_TEMPERATURE, // set new temperature
        },
        tokens // don't forget to give it all the past tokens, since we're creating a new sampling context
      );
      print(`Generate next ${N_PREDICT} tokens with temperature ${NEW_TEMPERATURE}`);
      await genNTokens();

      print(`\n--------------\n`);

      print(`Get logits of next token:`);
      const logits = await wllama.getLogits(10);
      for (const candidate of logits) {
        const word = textDecoder.decode(await wllama.detokenize([candidate.token]));
        print(`Token ID ${candidate.token}, probability = ${candidate.p} ==> "${word}"`);
      }

      print(`\n--------------\n`);

      // A separate instance (default logger) is used for the split model.
      const wllamaSplits = new Wllama(CONFIG_PATHS);

      print(`Loading split model...`);
      timeStart();
      await wllamaSplits.loadModelFromUrl(MODEL_SPLITS, {
        embeddings: true,
        n_ctx: 1024,
        parallelDownloads: 5,
        progressCallback: logProgress,
      });
      print(`Loaded, take ${timeEnd()} ms`);
      print(`Test generating text...`);
      const output = await wllamaSplits.createCompletion(TEXT_TO_TOKENIZE, {
        nPredict: 20,
      });
      print(output);

      // const debugInfo = await wllamaSplits._getDebugInfo();
      // console.log(debugInfo)
      // print(JSON.stringify(debugInfo, null, 2));
    }

    /////////////////////////////////////////////////////////////////////

    // Container that print() appends each output line to (the <div id="output"> above).
    const elemOutput = document.getElementById('output');
    /**
     * Append one line of output to the page, then scroll to the bottom.
     *
     * @param {string} message - Text to display; assigned through innerText.
     * @param {boolean} [bold] - When truthy, the text is placed in a nested <b>.
     */
    function print(message, bold) {
      const row = document.createElement('div');
      // The text lives either directly on the row or on a <b> nested inside it.
      const textHolder = bold ? document.createElement('b') : row;
      textHolder.innerText = message;
      if (textHolder !== row) {
        row.appendChild(textHolder);
      }
      elemOutput.appendChild(row);

      // Scrolling is deferred by 10 ms and the target offset is computed only
      // when the timer fires, i.e. after the new row has been appended.
      const scrollToBottom = () => {
        window.scrollTo({
          top: document.documentElement.scrollHeight - window.innerHeight,
          left: 0,
          behavior: 'smooth',
        });
      };
      setTimeout(scrollToBottom, 10);
    }
    // performance.now() reading taken by the most recent timeStart() call.
    // Shared by timeStart()/timeEnd(), so only one measurement can be in
    // flight at a time.
    let __startTime = 0;

    /** Begin a measurement; a later timeEnd() reports the time elapsed since this call. */
    function timeStart() {
      // performance.now() is monotonic, so the measurement is not skewed when
      // the system clock is adjusted mid-run (which can happen with Date.now()).
      __startTime = performance.now();
    }

    /**
     * @returns {number} Whole milliseconds elapsed since the last timeStart().
     */
    function timeEnd() {
      return Math.round(performance.now() - __startTime);
    }

    // Start the demo. main() is async, so a failure (for example a model
    // download that rejects) would otherwise be an unhandled rejection that
    // leaves the page silently stuck; report it in the console and on the page.
    main().catch((err) => {
      console.error(err);
      print(`Error: ${err instanceof Error ? err.message : String(err)}`, true);
    });
  </script>
</body>
</html>