<a href="https://colab.research.google.com/github/player1537/Train-Bloom-560m/blob/main/Experiment_with_OpenOrca_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Install Dependencies
%%script bash
# Scratch directory: use $TMPDIR, else $TMP, else /tmp.
TMPDIR=${TMPDIR:-${TMP:-/tmp}}
# File that receives pip's output. ${VAR:?} makes the shell abort with an
# error if the variable is unset or empty.
TMPOUT=${TMPDIR:?}/pip.text
# Packages passed to `pip install --upgrade` below.
INSTALL=(
  transformers
  torch
  datasets
  tqdm
  accelerate
  peft
  huggingface_hub
  guidance
  langchain
  diffusers
)

# Run pip with stdout and stderr both redirected into the capture file, so a
# successful install prints nothing.
if ! pip install --upgrade "${INSTALL[@]}" &>"${TMPOUT:?}"; then
  # On failure only: replay the captured output to stderr and exit non-zero.
  cat "${TMPOUT:?}" >&2
  exit 1
fi

In [None]:
#@title HuggingFace Login
import huggingface_hub
# Run huggingface_hub's notebook login helper; this cell comes before any
# cell below that talks to the Hub.
huggingface_hub.notebook_login()

In [None]:
#@title Import & Utilities
from __future__ import annotations

class AutoImportError(ImportError):
    """Raised when `auto` cannot provide a module, even after trying pip."""


class auto(object):
    """Namespace whose attributes are modules imported on first access.

    Accessing ``instance.<name>`` imports the module ``<name>``. If that
    import fails and ``<name>`` was registered with :meth:`register`, the
    registered pip packages are installed with the running interpreter and
    the import is retried. A successfully imported module is stored on the
    instance, so later accesses no longer go through ``__getattr__``.
    """

    # import name -> pip requirement names to install when the import fails.
    # This is a class-level dict, so it is shared by every instance.
    registry: ClassVar[Dict[str, Tuple[str, ...]]] = {}

    @classmethod
    def register(
        cls,
        import_name: str,
        package_name: Optional[str]=None,
        *extra_package_names: str,
    ):
        """Declare which pip packages provide the module ``import_name``.

        Args:
            import_name: Name used in ``import <import_name>``.
            package_name: pip requirement to install; defaults to
                ``import_name`` when ``None``.
            *extra_package_names: Additional pip requirements installed in
                the same ``pip install`` invocation.
        """
        if package_name is None:
            package_name = import_name

        cls.registry[import_name] = (
            package_name,
            *extra_package_names,
        )

    def __getattr__(self, import_name: str):
        """Import (installing first if necessary) and return ``import_name``.

        Python only calls ``__getattr__`` after normal attribute lookup has
        failed, so reaching this method means the module is not cached yet.

        Raises:
            AutoImportError: if the module cannot be imported and either no
                packages are registered for it, the pip install fails, or the
                import still fails after a successful install.
        """
        import subprocess, importlib, sys

        try:
            module = importlib.import_module(import_name)
        except ImportError as e:
            package_names = self.registry.get(import_name)
            if package_names is None:
                # Nothing registered: report it as an import problem rather
                # than leaking a bare KeyError from the registry lookup.
                raise AutoImportError(
                    f"Cannot import {import_name!r} and no pip packages are registered for it"
                ) from e

            process = subprocess.run([
                sys.executable,
                '-m', 'pip',
                'install',
                *package_names,
            ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

            if process.returncode != 0:
                # With text=True, `process.stdout` is already a str (stderr is
                # merged into it), not a file object.
                raise AutoImportError(f"Failed to pip install {package_names!r}\n\n{process.stdout}") from e

            # The package was installed after the interpreter started; the
            # import system's finder caches may not know about it yet.
            importlib.invalidate_caches()

            try:
                module = importlib.import_module(import_name)
            except ImportError as e:
                raise AutoImportError('Import failed a second time, even after a pip install') from e

        # Cache on the instance so the next access skips __getattr__ entirely.
        setattr(self, import_name, module)
        return module

# Modules whose pip package is registered individually.
auto.register('tqdm')
auto.register('more_itertools', 'more-itertools')
for _import_name in ('torch', 'peft', 'guidance', 'langchain', 'diffusers'):
    auto.register(_import_name)

# The Hugging Face modules are installed as one group: a failed import of any
# of the first six names installs the whole list (torch included).
_hf_stack = ('transformers', 'accelerate', 'datasets', 'tokenizers', 'evaluate', 'huggingface_hub', 'torch')
for _import_name in _hf_stack[:-1]:
    auto.register(_import_name, None, *_hf_stack)

# Do not leave the loop helpers behind in the notebook namespace.
del _import_name, _hf_stack

# From here on `auto` is the singleton instance, not the class.
auto = auto()


def doctest(func=None, /, verbose=False, sterile=False):
    """Decorator that runs a function's docstring examples when it is defined.

    Usable bare (``@doctest``) or with options (``@doctest(sterile=True)``).

    Args:
        func: The function to check; supplied automatically in the bare form.
        verbose: Passed to the doctest finder and runner.
        sterile: When true, the examples see only the decorated function
            itself; otherwise they see a shallow copy of this module's
            globals plus the function.

    Returns:
        The decorated function, unchanged.

    Raises:
        AssertionError: if any docstring example fails.
    """
    def wrapper(func):
        # Thanks https://stackoverflow.com/a/49659927
        import doctest, copy

        # I need this to error out on failure; the default one doesn't.
        def run_docstring_examples(f, globs, verbose=False, name="NoName", compileflags=None, optionflags=0):
            finder = doctest.DocTestFinder(verbose=verbose, recurse=False)
            runner = doctest.DocTestRunner(verbose=verbose, optionflags=optionflags)
            # Examine the object that was passed in, not the enclosing `func`.
            for test in finder.find(f, name, globs=globs):
                runner.run(test, compileflags=compileflags)
            # Raise explicitly: a bare `assert` is stripped under `python -O`,
            # which would let failing examples pass silently.
            if runner.failures != 0:
                raise AssertionError(f'{runner.failures} doctest example(s) failed in {name}')

        name = func.__name__

        if sterile:
            globs = {}
        else:
            globs = copy.copy(globals())
        # Make the function visible to its own examples under its own name.
        globs[name] = func
        run_docstring_examples(func, globs, verbose=verbose, name=name)
        return func

    if func is not None:
        return wrapper(func)
    else:
        return wrapper

# Shared results registry; reuse the existing dict when this cell is re-run so
# earlier results are not lost.
g = globals().get('g', {})

def run(func=None, /, name=None, cond=True, splat=False, after=None):
    """Decorator that calls a function immediately and stores its result in `g`.

    Usable bare (``@run``) or with options (``@run(name=...)``). The decorated
    name is bound to ``None``; the value lives in ``g``.

    Args:
        func: The function to execute; supplied automatically in the bare form.
        name: Key under which the result is stored; defaults to the
            function's ``__name__``.
        cond: A value, or a zero-argument callable returning one; when falsy
            the function is not called and nothing is stored.
        splat: When true, the result's ``items()`` are stored as separate
            entries instead of one entry under ``name``.
        after: Optional callable invoked with the result before it is stored.

    Each positional-only parameter of the function is filled with ``g[<name>]``;
    other parameters are not passed.
    """
    def wrapper(func, /, *, name=name, cond=cond):
        import inspect

        enabled = cond() if callable(cond) else cond
        if not enabled:
            return None

        target = func.__name__ if name is None else name

        # Positional-only parameters name the earlier results this step needs.
        positional = [
            g[param_name]
            for param_name, param in inspect.signature(func).parameters.items()
            if param.kind == inspect.Parameter.POSITIONAL_ONLY
        ]
        result = func(*positional)

        if callable(after):
            after(result)

        entries = result.items() if splat else [(target, result)]
        for key, value in entries:
            g[key] = value

        return None

    return wrapper if func is None else wrapper(func)

@auto.IPython.core.magic.register_line_magic
@auto.IPython.core.magic.register_cell_magic
def source(magic_line, magic_cell=None):
    """IPython magic, registered as both a line magic and a cell magic.

    Line form (no cell body): runs ``bash -c 'source <magic_line>; export'``,
    parses the ``declare -x NAME=value`` lines it prints, and keeps the
    variables whose value differs from the current ``os.environ``. From those
    it builds cell text consisting of a ``%%source <magic_line>`` header
    followed by an ``os.environ |= {...}`` statement, hands that text to
    ``set_next_input(..., replace=True)``, and runs it.

    Cell form (non-empty cell body): runs the cell body with ``run_cell``.

    NOTE(review): the generated statement refers to ``os``, so ``os``
    presumably has to be importable by name in the notebook namespace when
    the generated cell runs -- confirm.
    """
    import os, subprocess, shlex

    if magic_cell is None or magic_cell == '':
        # Snapshot of the current environment, used to report only changes.
        before = os.environ.copy()

        # `export` with no arguments prints every exported variable after the
        # file has been sourced.
        process = subprocess.run([
            'bash', '-c', f'source {magic_line}; export',
        ], capture_output=True, text=True)

        after = {}
        for line in process.stdout.split('\n'):
            if line == '': continue
            # shlex undoes the shell quoting bash applies to the values.
            parts = shlex.split(line)
            assert parts[0] == 'declare', f'{line=!r}'
            assert parts[1] == '-x', f'{line=!r}'
            # Exported names without a value have nothing to record.
            if '=' not in parts[2]: continue
            name, value = parts[2].split('=', 1)

            # Skip variables that already have this value.
            if before.get(name, None) == value: continue
            after[name] = value

        # Build the replacement cell: the header re-invokes this magic in cell
        # form, the body is a dict merge into os.environ.
        magic_cell = f'%%source {magic_line}\n'
        magic_cell += f'os.environ |= {{\n'
        for name, value in after.items():
            magic_cell += f'  {name!r}: '
            if ':' in value:
                # Colon-separated values (PATH-like) are written one
                # component per line and re-joined with ":".
                magic_cell += f'":".join([\n'
                for value in value.split(':'):
                    magic_cell += f'    {value!r},\n'
                magic_cell += f'  ]),\n'
            else:
                magic_cell += f' {value!r},\n'
        magic_cell += f'}}\n'

        # replace=True: the generated text is meant to take the place of the
        # current cell's input rather than be added as a new cell.
        get_ipython().set_next_input(magic_cell, replace=True)

    # In line form this runs the text generated above (which presumably
    # re-enters this magic in cell form); in cell form it runs the cell body.
    get_ipython().run_cell(magic_cell)

In [None]:
@run
def repository():
    """Clone the Open-Orca/OpenOrca dataset repository at a pinned revision.

    The clone skips LFS files, a local branch named 'mine' is checked out
    (created if missing), and a pull with ``lfs=True`` follows. The
    ``Repository`` object is returned and stored as ``g['repository']``.
    """
    repo = auto.huggingface_hub.Repository(
        clone_from='Open-Orca/OpenOrca',
        repo_type='dataset',
        revision='67f8c7a87db4fe0b985f317f7330c664e8122cfe',
        local_dir='Open-Orca--OpenOrca',
        skip_lfs_files=True,
    )
    repo.git_checkout('mine', create_branch_ok=True)
    repo.git_pull(lfs=True)

    # An earlier draft loaded the data with
    # auto.datasets.load_dataset('Open-Orca/OpenOrca', split='train') instead.
    return repo

Cloning https://huggingface.co/datasets/Open-Orca/OpenOrca into local empty directory.
Checked out 67f8c7a87db4fe0b985f317f7330c664e8122cfe from HEAD.

Revision `mine` does not exist. Created and checked out branch `mine`.



In [None]:
!ls -lahR Open-Orca--OpenOrca/*

-rw-r--r-- 1 root root 6.8K Jul  3 16:15 Open-Orca--OpenOrca/README.md

Open-Orca--OpenOrca/001-1M-GPT4-Augmented:
total 1.8G
drwxr-xr-x 2 root root 4.0K Jul  3 16:16 .
drwxr-xr-x 5 root root 4.0K Jul  3 16:15 ..
-rw-r--r-- 1 root root 269M Jul  3 16:15 1M-GPT4-Augmented-test.jsonl
-rw-r--r-- 1 root root 1.5G Jul  3 16:16 1M-GPT4-Augmented-train.jsonl
-rw-r--r-- 1 root root   72 Jul  3 16:15 README.md

Open-Orca--OpenOrca/002-3_5M-GPT3_5-Augmented:
total 5.4G
drwxr-xr-x 2 root root 4.0K Jul  3 16:17 .
drwxr-xr-x 5 root root 4.0K Jul  3 16:15 ..
-rw-r--r-- 1 root root 819M Jul  3 16:15 3_5M-GPT3_5-Augmented-test.jsonl
-rw-r--r-- 1 root root 4.6G Jul  3 16:18 3_5M-GPT3_5-Augmented-train.jsonl
-rw-r--r-- 1 root root   76 Jul  3 16:15 README.md


In [None]:
# Show where the dataset repository was cloned to, in repr form.
print(repr(g['repository'].local_dir))

'/content/Open-Orca--OpenOrca'


In [None]:
@run
def __(repository, /):
    """Print character-length statistics for the OpenOrca training records.

    Three reports are printed:
      1. the 10 most common (question length, response length) pairs, each
         length rounded up to a multiple of 100;
      2. how many records fall into each power-of-two ratio between those
         rounded response and question lengths;
      3. the totals for response-longer ('pos') versus everything else ('neg').
    """
    def bucket(x):
        # Round a length up to the next multiple of 100.
        return 100 * auto.math.ceil(x / 100)

    def round_away_from_zero(x):
        # ceil for non-negative exponents, floor for negative ones.
        if x >= 0:
            return auto.math.ceil(x)
        else:
            return auto.math.floor(x)

    with auto.contextlib.ExitStack() as stack:
        # Use auto.pathlib like the rest of the notebook; a bare `Path` is
        # never imported in this file.
        it = auto.pathlib.Path(repository.local_dir)
        it = [
            it / '001-1M-GPT4-Augmented' / '1M-GPT4-Augmented-train.jsonl',
            it / '002-3_5M-GPT3_5-Augmented' / '3_5M-GPT3_5-Augmented-train.jsonl',
        ]
        # JSON text is UTF-8; do not depend on the platform default encoding.
        it = (open(x, encoding='utf-8') for x in it)
        it = (stack.enter_context(x) for x in it)
        it = (iter(x) for x in it)
        it = auto.itertools.chain.from_iterable(it)
        it = (auto.json.loads(x) for x in it)
        it = ((x['question'], x['response']) for x in it)

        counter = auto.collections.Counter()
        for question, response in it:
            counter.update([
                (bucket(len(question)), bucket(len(response))),
            ])

        auto.pprint.pp(counter.most_common(10))

        it = counter.items()

        # NOTE: an empty question or response lands in bucket 0, which would
        # make the division or the log2 below raise.
        counter = auto.collections.Counter()
        for (inp, out), count in it:
            counter[round_away_from_zero(auto.math.log2(out / inp))] += count

        for x, count in counter.most_common():
            # Exponent 0 is reported through the "/" branch as "inp / 1".
            x = (
                f'out = inp * {2**abs(x)}'
                if x > 0 else
                f'out = inp / {2**abs(x)}'
            )
            print(f'{x}: {count}')

        it = counter.items()

        counter = auto.collections.Counter()
        for exp, count in it:
            # Exponent 0 is counted as 'neg'.
            exp = (
                'pos'
                if exp > 0 else
                'neg'
            )
            counter[exp] += count

        auto.pprint.pprint(counter.most_common())

[((200, 200), 117377),
 ((200, 100), 88542),
 ((300, 100), 83693),
 ((300, 300), 75901),
 ((300, 200), 75808),
 ((100, 100), 74348),
 ((2300, 100), 69978),
 ((300, 400), 57935),
 ((400, 100), 56362),
 ((300, 500), 52352)]
out = inp * 2: 509569
out = inp / 2: 500691
out = inp / 4: 423258
out = inp / 8: 377452
out = inp * 4: 376185
out = inp / 1: 361408
out = inp / 16: 302441
out = inp / 32: 296885
out = inp * 8: 249792
out = inp * 16: 108573
out = inp * 32: 34667
out = inp / 64: 33386
out = inp / 128: 18135
out = inp * 64: 4726
out = inp / 256: 4408
out = inp / 512: 132
out = inp * 128: 8
out = inp * 256: 1
[('neg', 2318196), ('pos', 1283521)]


In [None]:
@run
def tokenizer():
    """Load the 'bigscience/bloom-560m' tokenizer with add_prefix_space=True.

    The result is stored as ``g['tokenizer']``.
    """
    options = dict(add_prefix_space=True)
    loaded = auto.transformers.AutoTokenizer.from_pretrained(
        'bigscience/bloom-560m',
        **options,
    )
    return loaded

Downloading (…)okenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [None]:
@run
def __(repository, tokenizer, /):
    """Print the 20 most common token-count buckets over the training records.

    For every record the question and the response are tokenized, the two
    token counts are added, and the sum is bucketed as ceil(log2(sum)).
    """
    with auto.contextlib.ExitStack() as stack:
        # Use auto.pathlib like the rest of the notebook; a bare `Path` is
        # never imported in this file.
        it = auto.pathlib.Path(repository.local_dir)
        it = [
            it / '001-1M-GPT4-Augmented' / '1M-GPT4-Augmented-train.jsonl',
            it / '002-3_5M-GPT3_5-Augmented' / '3_5M-GPT3_5-Augmented-train.jsonl',
        ]
        # JSON text is UTF-8; do not depend on the platform default encoding.
        it = (open(x, encoding='utf-8') for x in it)
        it = (stack.enter_context(x) for x in it)
        it = (iter(x) for x in it)
        it = auto.itertools.chain.from_iterable(it)
        it = (auto.json.loads(x) for x in it)
        it = ((x['question'], x['response']) for x in it)
        # Each element becomes a two-item iterable: (question tokens, response tokens).
        it = (map(tokenizer.tokenize, x) for x in it)
        it = (map(len, x) for x in it)
        it = (q+r for q, r in it)
        # NOTE: a record with zero tokens in total would make log2 raise.
        it = (auto.math.ceil(auto.math.log2(x)) for x in it)
        it = auto.collections.Counter(it)
        auto.pprint.pp(it.most_common(20))

KeyboardInterrupt: ignored

In [None]:
@run
def __(repository, tokenizer, /):
    """Unfinished draft: stream the training records one by one.

    The per-record work was never written, so the loop body is a placeholder
    that only makes the cell syntactically valid.
    """
    with auto.contextlib.ExitStack() as stack:
        # Use auto.pathlib like the rest of the notebook; a bare `Path` is
        # never imported in this file.
        it = auto.pathlib.Path(repository.local_dir)
        it = [
            it / '001-1M-GPT4-Augmented' / '1M-GPT4-Augmented-train.jsonl',
            it / '002-3_5M-GPT3_5-Augmented' / '3_5M-GPT3_5-Augmented-train.jsonl',
        ]
        # JSON text is UTF-8; do not depend on the platform default encoding.
        it = (open(x, encoding='utf-8') for x in it)
        it = (stack.enter_context(x) for x in it)
        it = (iter(x) for x in it)
        it = auto.itertools.chain.from_iterable(it)
        it = (auto.json.loads(x) for x in it)

        for x in it:
            # TODO: the original cell ended here with an empty loop body
            # (a SyntaxError); fill in the intended per-record processing.
            pass


In [None]:
@run(after=auto.pprint.pp)
def __tokenize_representative_samples(repository, tokenizer, /):
    """Map character-length buckets to the token count of one sample each.

    The combined character length of question and response is rounded up to
    a multiple of 100. Only the first record seen in each bucket is
    tokenized; its total token count becomes the bucket's value.

    Returns:
        dict mapping bucket (int, multiple of 100) to token count (int).
    """
    root = auto.pathlib.Path(repository.local_dir)
    paths = [
        root / '001-1M-GPT4-Augmented' / '1M-GPT4-Augmented-train.jsonl',
        root / '002-3_5M-GPT3_5-Augmented' / '3_5M-GPT3_5-Augmented-train.jsonl',
    ]

    samples = {}
    with auto.contextlib.ExitStack() as stack:
        for path in paths:
            # Each file is opened only when the previous one is exhausted and
            # stays registered with the stack until the block exits.
            handle = stack.enter_context(open(path))
            for line in handle:
                record = auto.json.loads(line)
                nchars = len(record['question']) + len(record['response'])
                bucket = auto.math.ceil(nchars / 100) * 100

                # Tokenizing is the expensive step; do it once per bucket.
                if bucket in samples:
                    continue

                ntokens = len(tokenizer.tokenize(record['question'])) + len(tokenizer.tokenize(record['response']))
                samples[bucket] = ntokens

    return samples

{900: 202,
 400: 78,
 1000: 172,
 800: 175,
 1700: 328,
 4800: 982,
 500: 112,
 700: 137,
 1500: 318,
 2100: 464,
 2200: 446,
 2300: 468,
 1300: 285,
 3300: 791,
 4000: 872,
 300: 46,
 2700: 536,
 5000: 932,
 600: 127,
 2800: 639,
 1800: 349,
 1600: 339,
 3100: 729,
 4400: 932,
 2500: 537,
 1100: 220,
 1400: 264,
 2400: 536,
 3400: 716,
 3700: 737,
 4300: 878,
 1900: 412,
 2000: 430,
 1200: 250,
 4600: 1010,
 2900: 584,
 3800: 823,
 200: 23,
 3000: 685,
 3200: 598,
 12000: 2564,
 3900: 769,
 2600: 567,
 3600: 626,
 6100: 1293,
 100: 25,
 3500: 682,
 6400: 1407,
 5300: 1170,
 13200: 2785,
 27200: 6033,
 9600: 2078,
 10700: 2460,
 9000: 1967,
 7300: 1495,
 5700: 1220,
 4200: 758,
 20000: 4054,
 4500: 852,
 7800: 1665,
 4700: 1033,
 6500: 1432,
 6200: 1354,
 4100: 880,
 7700: 1809,
 10400: 2264,
 17800: 3827,
 6000: 1209,
 38500: 8729,
 7200: 1611,
 9400: 1924,
 4900: 930,
 14700: 3317,
 8400: 1888,
 11800: 2599,
 11600: 2515,
 10900: 2244,
 10100: 2076,
 6900: 1492,
 5100: 1064,
 26900: 

In [None]:
@run(after=auto.pprint.pp)
def __make_tokenize_length_estimate(__tokenize_representative_samples, /):
    """Fit a straight line predicting token count from character count.

    The input maps character-length buckets to token counts. The fitted line
    is printed, followed by its prediction for every multiple of 128
    characters from 128 through 2048.

    Returns:
        The result object from ``scipy.stats.linregress``.
    """
    # Keys are unique, so sorting the items orders the points by nchar.
    points = sorted(__tokenize_representative_samples.items())
    xs = [nchar for nchar, _ in points]
    ys = [ntoken for _, ntoken in points]

    fit = auto.scipy.stats.linregress(xs, ys)

    print(f'ntoken = {fit.slope:0.3f} * nchar + {fit.intercept:0.3f} (r={fit.rvalue})')

    nchar = 128
    while nchar <= 2048:
        predicted = fit.slope * nchar + fit.intercept
        print(f'{nchar}c / {predicted}t')
        nchar += 128

    return fit

# ntoken = 0.210 * nchar + 64.846 (r=0.9688675985486835)
# 128c / 91.71310525423914t
# 256c / 118.58018080697065t
# 384c / 145.44725635970215t
# 512c / 172.31433191243366t
# 640c / 199.18140746516517t
# 768c / 226.04848301789667t
# 896c / 252.91555857062818t
# 1024c / 279.7826341233597t
# 1152c / 306.6497096760912t
# 1280c / 333.5167852288227t
# 1408c / 360.3838607815542t
# 1536c / 387.2509363342857t
# 1664c / 414.11801188701725t
# 1792c / 440.9850874397487t
# 1920c / 467.8521629924802t
# 2048c / 494.71923854521174t
# LinregressResult(slope=0.2098990277557149, intercept=64.84602970150763, rvalue=0.9688675985486835, pvalue=1.141652448319941e-230, stderr=0.00276242112546463, intercept_stderr=60.99909831825345)

ntoken = 0.210 * nchar + 64.846 (r=0.9688675985486835)
128c / 91.71310525423914t
256c / 118.58018080697065t
384c / 145.44725635970215t
512c / 172.31433191243366t
640c / 199.18140746516517t
768c / 226.04848301789667t
896c / 252.91555857062818t
1024c / 279.7826341233597t
1152c / 306.6497096760912t
1280c / 333.5167852288227t
1408c / 360.3838607815542t
1536c / 387.2509363342857t
1664c / 414.11801188701725t
1792c / 440.9850874397487t
1920c / 467.8521629924802t
2048c / 494.71923854521174t
LinregressResult(slope=0.2098990277557149, intercept=64.84602970150763, rvalue=0.9688675985486835, pvalue=1.141652448319941e-230, stderr=0.00276242112546463, intercept_stderr=60.99909831825345)
