# Loading datasets and model

In [None]:
!pip install datasets transformers[torch] beartype jaxtyping

In [2]:
%load_ext autoreload
%autoreload 2

In [7]:
import os
import shutil
import sys
import json
import matplotlib.pyplot as plt
import numpy as np
import torch as t
import torch.nn as nn
import torch.nn.functional as F
from beartype import beartype as typed
from beartype.door import die_if_unbearable as assert_type
from numpy import ndarray as ND
from torch import Tensor as TT
from jaxtyping import Float, Int, Bool
from typing import Mapping
from tqdm import tqdm
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

In [None]:
# Checkpoint name shared by the tokenizer and the model loaded below.
model_name = "smallcloudai/Refact-1_6B-fim"

# CommitPackFT (its "python" configuration) is explored further down via
# train_data["train"]; HumanEvalPack is read later via eval_data["test"].
train_data = load_dataset(path="bigcode/commitpackft", name="python")
eval_data = load_dataset(path="bigcode/humanevalpack")

# NOTE(review): the evaluation command later in this notebook passes
# --trust_remote_code for the same checkpoint; confirm whether these
# from_pretrained calls need trust_remote_code=True as well.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Exploring `CommitPackFt`

Here is an example of the fields included in the dataset.

In [60]:
# Pretty-print the first training sample as indented JSON to show which fields each record has.
print(json.dumps(train_data["train"][0], indent=2))


{
  "commit": "e905334869af72025592de586b81650cb3468b8a",
  "old_file": "sentry/queue/client.py",
  "new_file": "sentry/queue/client.py",
  "old_contents": "\"\"\"\nsentry.queue.client\n~~~~~~~~~~~~~~~~~~~\n\n:copyright: (c) 2010 by the Sentry Team, see AUTHORS for more details.\n:license: BSD, see LICENSE for more details.\n\"\"\"\nfrom kombu import BrokerConnection\nfrom kombu.common import maybe_declare\nfrom kombu.pools import producers\n\nfrom sentry.conf import settings\nfrom sentry.queue.queues import task_queues, task_exchange\n\n\nclass Broker(object):\n    def __init__(self, config):\n        self.connection = BrokerConnection(**config)\n\n    def delay(self, func, *args, **kwargs):\n        payload = {\n            \"func\": func,\n            \"args\": args,\n            \"kwargs\": kwargs,\n        }\n\n        with producers[self.connection].acquire(block=False) as producer:\n            for queue in task_queues:\n                maybe_declare(queue, producer.channel)\n  

Some examples of messages and corresponding changes:

In [79]:
from IPython.display import display, HTML
from difflib import HtmlDiff

# For each of the first ten training samples, print the commit message and
# render an HTML table comparing the file contents before and after the commit.
for i in range(10):
    sample = train_data["train"][i]
    # Split on "\n" explicitly so both versions are broken into lines the same way.
    before, after = (
        sample[field].split("\n") for field in ("old_contents", "new_contents")
    )
    print("Message:", sample["message"])
    print("Change:")
    display(HTML(HtmlDiff().make_table(before, after)))


Message: Declare queues when broker is instantiated

Change:


0,1,2,3,4,5
f,1.0,"""""""",f,1.0,""""""""
,2.0,sentry.queue.client,,2.0,sentry.queue.client
,3.0,~~~~~~~~~~~~~~~~~~~,,3.0,~~~~~~~~~~~~~~~~~~~
,4.0,,,4.0,
,5.0,":copyright: (c) 2010 by the Sentry Team, see AUTHORS for more details.",,5.0,":copyright: (c) 2010 by the Sentry Team, see AUTHORS for more details."
,6.0,":license: BSD, see LICENSE for more details.",,6.0,":license: BSD, see LICENSE for more details."
,7.0,"""""""",,7.0,""""""""
,8.0,from kombu import BrokerConnection,,8.0,from kombu import BrokerConnection
,9.0,from kombu.common import maybe_declare,,9.0,from kombu.common import maybe_declare
,10.0,from kombu.pools import producers,,10.0,from kombu.pools import producers


Message: Revert "Fix openweather unit tests"

This reverts commit 36e100e649f0a337228a6d7375358d23afd544ff.

Open Weather Map has reverted back to their old api or something like that...

Change:


0,1,2,3,4,5
f,1,# -*- coding: utf-8 -*-,f,1,# -*- coding: utf-8 -*-
,2,import bot_mock,,2,import bot_mock
,3,from pyfibot.modules import module_openweather,,3,from pyfibot.modules import module_openweather
,4,from utils import check_re,,4,from utils import check_re
,5,,,5,
,6,,,6,
,7,bot = bot_mock.BotMock(),,7,bot = bot_mock.BotMock()
,8,,,8,
,9,,,9,
,10,def test_weather():,,10,def test_weather():


Message: Fix % only showing 0 or 100%, everything between goes to 0%.


Autoconverted from SVN (revision:1548)

Change:


0,1,2,3,4,5
f,1,"from django.template import Node, Library",f,1,"from django.template import Node, Library"
,2,,,2,
,3,register = Library(),,3,register = Library()
,4,,,4,
,5,@register.filter('percentage'),,5,@register.filter('percentage')
,6,"def percentage(value, total):",,6,"def percentage(value, total):"
,7,try:,,7,try:
t,8,percentage = int(value) / int(total) * 100,t,8,percentage = float(value) / float(total) * 100
,9,except ZeroDivisionError:,,9,except ZeroDivisionError:
,10,percentage = 0,,10,percentage = 0


Message: Remove "validation" from RejectionException docstring

Change:


0,1,2,3,4,5
f,1,#!/usr/bin/env python3,f,1,#!/usr/bin/env python3
,2,"""""""Exception classes shared by all automata.""""""",,2,"""""""Exception classes shared by all automata."""""""
,3,,,3,
,4,,,4,
,5,class AutomatonException(Exception):,,5,class AutomatonException(Exception):
,6,"""""""The base class for all automaton-related errors.""""""",,6,"""""""The base class for all automaton-related errors."""""""
,7,,,7,
,8,pass,,8,pass
,9,,,9,
,10,,,10,


Message: Fix implied_group, it still refers to the old module name

Change:


0,1,2,3,4,5
f,1,# -*- coding: utf-8 -*-,f,1,# -*- coding: utf-8 -*-
,2,##############################################################################,,2,##############################################################################
,3,#,,3,#
,4,# Copyright (C) 2015 Agile Business Group sagl,,4,# Copyright (C) 2015 Agile Business Group sagl
,5,# (<http://www.agilebg.com>),,5,# (<http://www.agilebg.com>)
,6,#,,6,#
,7,# This program is free software: you can redistribute it and/or modify,,7,# This program is free software: you can redistribute it and/or modify
,8,# it under the terms of the GNU Affero General Public License as published,,8,# it under the terms of the GNU Affero General Public License as published
,9,"# by the Free Software Foundation, either version 3 of the License, or",,9,"# by the Free Software Foundation, either version 3 of the License, or"
,10,# (at your option) any later version.,,10,# (at your option) any later version.


Message: Fix interpretation of parameters for names list modification

Change:


0,1,2,3,4,5
f,1,from txircd.modbase import Mode,f,1,from txircd.modbase import Mode
,2,,,2,
,3,class InvisibleMode(Mode):,,3,class InvisibleMode(Mode):
,4,"def namesListEntry(self, recipient, channel, user, representation):",,4,"def namesListEntry(self, recipient, channel, user, representation):"
t,5,"if channel not in recipient.channels and ""i"" in user.mode:",t,5,"if channel.name not in recipient.channels and ""i"" in user.mode:"
,6,"return """"",,6,"return """""
,7,return representation,,7,return representation
,8,,,8,
,9,class Spawner(object):,,9,class Spawner(object):
,10,"def __init__(self, ircd):",,10,"def __init__(self, ircd):"


Message: Fix image path in manifest

Change:


0,1,2,3,4,5
f,1,# -*- coding: utf-8 -*-,f,1,# -*- coding: utf-8 -*-
,2,# © 2013-2016 Numérigraphe SARL,,2,# © 2013-2016 Numérigraphe SARL
,3,# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).,,3,# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
,4,,,4,
,5,{,,5,{
,6,"""name"": ""Hierarchical Inventory adjustments"",",,6,"""name"": ""Hierarchical Inventory adjustments"","
,7,"""summary"": ""Group several Inventory adjustments in a master inventory"",",,7,"""summary"": ""Group several Inventory adjustments in a master inventory"","
,8,"""version"": ""8.0.2.0.0"",",,8,"""version"": ""8.0.2.0.0"","
,9,"""depends"": [""stock""],",,9,"""depends"": [""stock""],"
,10,"""author"": u""Numérigraphe,Odoo Community Association (OCA)"",",,10,"""author"": u""Numérigraphe,Odoo Community Association (OCA)"","


Message:  [FIX][11.0] Make debugger record a debug message instead of error when importing validate_email in partner_email_check

Change:


0,1,2,3,4,5
f,1,# Copyright 2019 Komit <https://komit-consulting.com>,f,1,# Copyright 2019 Komit <https://komit-consulting.com>
,2,# License AGPL-3.0 or later (https://www.gnu.org/licenses/agpl).,,2,# License AGPL-3.0 or later (https://www.gnu.org/licenses/agpl).
,3,,,3,
,4,import logging,,4,import logging
,5,"from odoo import api, models, _",,5,"from odoo import api, models, _"
,6,from odoo.exceptions import UserError,,6,from odoo.exceptions import UserError
,7,,,7,
,8,_logger = logging.getLogger(__name__),,8,_logger = logging.getLogger(__name__)
,9,,,9,
,10,try:,,10,try:


Message: Modify the author email address

Change:


0,1,2,3,4,5
f,1,#!/usr/bin/env python,f,1,#!/usr/bin/env python
,2,from distutils.core import setup,,2,from distutils.core import setup
,3,,,3,
,4,packages = [,,4,packages = [
,5,"'upho',",,5,"'upho',"
,6,"'upho.phonon',",,6,"'upho.phonon',"
,7,"'upho.harmonic',",,7,"'upho.harmonic',"
,8,"'upho.analysis',",,8,"'upho.analysis',"
,9,"'upho.structure',",,9,"'upho.structure',"
,10,"'upho.irreps',",,10,"'upho.irreps',"


Message: Change the version of the package.
Change:


0,1,2,3,4,5
f,1,,f,1,
,2,"from setuptools import setup, find_packages",,2,"from setuptools import setup, find_packages"
,3,"import sys, os",,3,"import sys, os"
,4,,,4,
t,5,version = '1.1.1',t,5,version = '1.1.2'
,6,,,6,
,7,setup(,,7,setup(
,8,"name = 'daprot',",,8,"name = 'daprot',"
,9,"version = version,",,9,"version = version,"
,10,"description = ""daprot is a data prototyper and mapper library."",",,10,"description = ""daprot is a data prototyper and mapper library."","


As can be seen in these samples, commit messages almost never uniquely determine the change that was made, and they are not intended to do so: their main function is to differentiate commits inside one repository, which is a much easier task. Given training data of this quality, it is not very surprising that OctoCoder and other models using this dataset cannot beat even the models trained on the outputs of OpenAI models.

In fact, what a model can learn at all from messages such as "revert ... this reverts commit ..." is a more interesting question. It seems like a good example for [the claim](https://aclanthology.org/2022.naacl-main.167.pdf) that prompts in instruction tuning don't work the same way as instructions for humans.

# Evaluating `Refact-1.6B`

I'm following the instructions from the [Octopack repository](https://github.com/bigcode-project/octopack/tree/main#evaluation) here, though I changed the way `accelerate` is configured, because Colab doesn't allow console input.

In [None]:
# Fetch the evaluation harness and switch into its checkout so that the
# following commands (requirements install, main.py) run from that directory.
!git clone https://github.com/bigcode-project/bigcode-evaluation-harness
os.chdir("bigcode-evaluation-harness")
!pip install -q -r requirements.txt
# Write a basic `accelerate` config (fp16 mixed precision) from Python rather
# than through an interactive prompt, since Colab doesn't allow console input.
!python -c "from accelerate.utils import write_basic_config; write_basic_config(mixed_precision='fp16')"

The evaluation was very slow, so I reduced `n_samples` (the number of completions sampled per problem) from $20$ to $4$.

In [6]:
# Evaluate smallcloudai/Refact-1_6B-fim on the humanevalfixtests-python task
# using the octocoder prompt, sampling at temperature 0.2 with --n_samples 4.
# Generations and metrics are written to the two JSON paths given below.
# NOTE(review): --allow_code_execution presumably runs the generated code
# locally; confirm this is acceptable for the environment in use.
!accelerate launch main.py \
--model smallcloudai/Refact-1_6B-fim  \
--tasks humanevalfixtests-python \
--do_sample True \
--temperature 0.2 \
--n_samples 4 \
--batch_size 5 \
--allow_code_execution \
--save_generations \
--trust_remote_code \
--prompt octocoder \
--save_generations_path generations_humanevalfixpython_octocoder.json \
--metric_output_path evaluation_humanevalfixpython_octocoder.json \
--max_length_generation 2048 \
--precision bf16

2023-11-13 08:52:38.818514: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-13 08:52:38.818565: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-13 08:52:38.818615: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Selected Tasks: ['humanevalfixtests-python']
Loading model in bf16
number of problems for this task is 164
100% 164/164 [45:06<00:00, 16.50s/it]
generations were saved at generations_humanevalfixpython_octocoder.json
Evaluating generations...
Downloading builder script: 100% 7.92k/7.92k [00:00<00:00, 22.6MB/s]
Downloading extra modules: 100% 13.0k/13.0k [00:00

As we see, in this setting it achieved $9\%$ pass@1. [The authors](https://huggingface.co/smallcloudai/Refact-1_6B-fim) claim $18\%$ for "pass@1 (T=0.2) on HumanEvalFixTests Python", but maybe they used a different prompt or configuration. Let's inspect some generations. Most of them simply copy the buggy solution, which is not interesting, so I will filter those out and look only at the remaining ones.

In [65]:
from IPython.display import display, Code, HTML

@typed
def no_spaces(s: str) -> str:
    """Return `s` with every whitespace character removed.

    Used to compare two code snippets while ignoring differences in
    spacing, indentation and line breaks.
    """
    # Splitting on runs of whitespace and gluing the pieces back together
    # drops exactly the characters for which `str.isspace` is true.
    return "".join(s.split())

@typed
def interesting_range(lines: list[list[str]]) -> tuple[int, int]:
    """Return the half-open line range `[l, r)` in which the given texts differ.

    Args:
        lines: Several versions of a text, each given as a list of its lines.

    Returns:
        A pair `(l, r)` meant to be used as a slice `version[l:r]`.
        `l` is the index of the first line on which the versions disagree
        (or the length of the shortest version if their common prefix is
        identical). `r` is one past the last disagreeing line when all
        versions have the same length, and the length of the longest version
        otherwise, since every extra trailing line is a difference.
        For an empty `lines` the result is `(0, 0)`.
    """
    lengths = [len(version) for version in lines]
    min_len = min(lengths, default=0)
    max_len = max(lengths, default=0)
    # zip stops at the shortest version, so every index here is < min_len.
    changes: list[int] = [i for i, tp in enumerate(zip(*lines)) if len(set(tp)) > 1]
    l: int = min(changes, default=min_len)
    # The end bound is exclusive: add 1 so that slicing with [l:r] keeps the
    # last differing line instead of cutting it off.
    r: int = max(changes) + 1 if changes else min_len
    if min_len != max_len:
        r = max_len
    return l, r

# Go through the first 30 tasks and classify the model's output for each one:
# identical to the buggy code, identical to the reference fix, or something
# else. Only the last kind is printed in detail.
with open("generations_humanevalfixpython_octocoder.json", encoding="utf-8") as f:
    # Only the first generated sample of every task is inspected.
    solutions = [samples[0] for samples in json.load(f)]
    for i in range(30):
        print(f"Task #{i}:", end="")
        generated = solutions[i]
        task = eval_data["test"][i]
        buggy_baseline = task["declaration"] + task["buggy_solution"]
        correct_baseline = task["declaration"] + task["canonical_solution"]

        # Compare with whitespace stripped so formatting changes don't count.
        squeezed = no_spaces(generated)
        if squeezed == no_spaces(buggy_baseline):
            badge = '<span style="background-color:pink;">Same as buggy</span>'
        elif squeezed == no_spaces(correct_baseline):
            badge = '<span style="background-color:lime;">Same as correct</span>'
        else:
            badge = None
        if badge is not None:
            display(HTML(badge))
            continue

        display(HTML('<span style="background-color:aqua;">Interesting!</span>'))
        print("Generated solution:")
        display(Code(data=generated, language="python"))

        # Show only the part of the buggy/correct code where the three versions differ.
        versions = [
            text.split("\n") for text in (buggy_baseline, correct_baseline, generated)
        ]
        l, r = interesting_range(versions)
        # zip stops after the two titles, so the generated version is not shown again.
        for title, version in zip(("Buggy solution:", "Correct solution:"), versions):
            print(title)
            display(Code(data="\n".join(version[l:r]), language="python"))
        print()
        print()

Task #0:

Task #1:

Task #2:

Task #3:

Task #4:

Task #5:

Task #6:

Task #7:

Task #8:

Task #9:

Generated solution:


Buggy solution:


Correct solution:



Task #10:

Task #11:

Task #12:

Task #13:

Task #14:

Generated solution:


Buggy solution:


Correct solution:



Task #15:

Task #16:

Generated solution:


Buggy solution:


Correct solution:



Task #17:

Task #18:

Generated solution:


Buggy solution:


Correct solution:



Task #19:

Task #20:

Task #21:

Task #22:

Task #23:

Task #24:

Task #25:

Generated solution:


Buggy solution:


Correct solution:



Task #26:

Generated solution:


Buggy solution:


Correct solution:



Task #27:

Task #28:

Task #29:

So in a couple of tasks the model just added a docstring; maybe it was shown in the evaluation prompt, or maybe the model just imagined it. Also, in some cases it decided to rewrite the solution from scratch using a different approach, in one case successfully (switching from a buggy use of `Counter` to `set`) and in another not (the task was to find distinct characters, and instead of adding `lower()` as in the correct solution the model just expanded the generator expression into a `for` loop). But still, in all cases, the model generated syntactically correct code, and the problem was more on the level of ideas.

# Commits as a source of instruction-tuning data

As I mentioned above, naively using commits with their messages for instruction-tuning seems like a very noisy way to create a dataset, though it avoids some problems with licensing. But at least filtering commits in a stricter way should be helpful. For example, in [Textbooks Are All You Need](https://arxiv.org/pdf/2306.11644.pdf) they train a random forest based on the annotations provided by GPT-4 (which perhaps can be replaced by manual labeling) and then select data with high educational value using this random forest. Just this step alone increases pass@1 on HumanEval for a 1.3B-parameter model from $17\%$ to $29\%$ (note that `Refact-1.6B` gets $18\%$ according to the authors, so by simply using higher-quality data, even without instruction-tuning, they could have almost doubled this result).

And when open-source models are or will become good enough, extracting synthetic exercises from them (where the instruction completely describes what needs to be done) will be another huge boost for models with permissive licenses, again as shown in Textbooks Are All You Need ($29\% \to 51\%$ for the same 1.3B-parameter model).