In [3]:
# This notebook loads a model output from train-models and executes it on a real repository

import os

# A list of repo names, each a GitHub "owner/repo" slug
REPO_NAMES = ["apache/tomcat", "dbeaver/dbeaver", "hortonworks/cloudbreak",
              "kiegroup/kie-tools", "apereo/cas", "jenkinsci/jenkins",
              "bazelbuild/bazel", "quarkusio/quarkus", "apache/camel", "vespa-engine/vespa",
              "bakdata/conquery", "project-ncl/bacon"]
# Only files with this extension are parsed from each repository
FILE_EXTENSION = ".java"

# When true, wandb logging is switched off further down
DEBUG = True

MASTER_PROJECT_REPO_URL = 'https://github.com/pelmers/llms-for-code-comment-consistency.git'

USE_WANDB = True
# Credentials must never live in the notebook source: the key is taken from the
# standard WANDB_API_KEY environment variable and is None when that is unset.
# NOTE(review): any key that was previously committed in this file's history
# should be treated as leaked and rotated in the wandb account settings.
WANDB_KEY = os.environ.get('WANDB_API_KEY')

# Read from the wandb run page, e.g. codegen_sz350M_bs8_lr1e-5_epochs10_len128
# WANDB_RUN_NAME = ""
RUN_NAME = "codebert_szbase_bs6_lr1e-05_len512_langpy_tune_extras"
# Which "best" checkpoint to load; used to build the checkpoint file name
PREC_OR_F1 = "f1"
# None to load from file, run path format: <username>/<project-name>/<run-id>
# WANDB_RESTORE_RUN_PATH = "pelmers/codegen-model-master/4qrh993h"
BATCH_SIZE = 8

def parse_env_value(raw):
    """Convert an environment-variable string into a Python value.

    Recognised forms, checked in this order:
      * ``true`` / ``false`` (any letter case) -> bool
      * optionally signed ASCII integer        -> int
      * optionally signed dotted decimal       -> float
      * ``[a, b, c]``                          -> list of str (items are
        whitespace-stripped, one pair of surrounding quotes is removed,
        empty items are dropped, so ``[]`` gives an empty list)

    Anything else, including malformed numbers such as ``1.2.3``, is returned
    unchanged as a string instead of raising.

    Args:
        raw: The raw string value read from the environment.

    Returns:
        bool, int, float, list of str, or the original string.
    """
    import re  # local import: this cell only brings `os` into scope

    lowered = raw.lower()
    if lowered == 'true':
        return True
    if lowered == 'false':
        return False
    if re.fullmatch(r'[+-]?[0-9]+', raw):
        return int(raw)
    # Exactly one dot with digits on at least one side of it
    if re.fullmatch(r'[+-]?(?:[0-9]+\.[0-9]*|\.[0-9]+)', raw):
        return float(raw)
    if raw.startswith('[') and raw.endswith(']'):
        items = [item.strip() for item in raw[1:-1].split(',')]
        items = [item for item in items if item]
        return [
            item[1:-1] if len(item) >= 2 and item[0] == item[-1] and item[0] in '"\'' else item
            for item in items
        ]
    return raw


# If running as a script, allow os.environ to overwrite these options
if __name__ == '__main__':
    import os
    for k, v in os.environ.items():
        # Options are the UPPER_CASE globals. Requiring an upper-case name keeps
        # unrelated variables (for example the `_` that shells export) from
        # clobbering interpreter or helper names that happen to match.
        if k.isupper() and k in globals():
            globals()[k] = parse_env_value(v)

# Debug runs never log to wandb, whatever USE_WANDB was configured to
USE_WANDB = False if DEBUG else USE_WANDB

# Local checkpoint to load, derived from the run name and the chosen "best" metric
CHECKPOINT_FILE = "checkpoints/ckpt_{}_best_{}.pt".format(RUN_NAME, PREC_OR_F1)

import sys, subprocess, os

# Define function x that given a command string, runs it with subprocess and streams the output
def x(cmd):
    """Run a command line and return its exit code.

    Nothing is captured: the child inherits this process's stdout/stderr, so
    its output shows up as it is produced.

    Args:
        cmd: The command line as one string. It is tokenised with POSIX shell
            quoting rules (no shell is started), so a quoted argument may
            contain spaces.

    Returns:
        The integer exit status of the command.
    """
    import shlex  # local import: not part of the file's top-level import line

    # Unlike cmd.split(" "), shlex.split honours quotes and collapses repeated
    # whitespace, so no empty or half-quoted arguments reach the child process.
    return subprocess.run(shlex.split(cmd)).returncode

In [None]:
import shutil

# build-essential can only be installed through apt, so gate on apt-get being on
# PATH. (Probing with `uname` also succeeds on systems without apt, and raises
# FileNotFoundError where `uname` itself does not exist.)
if shutil.which('apt-get'):
    # Best effort: the exit code is deliberately ignored. sudo is only used when
    # it is installed, since launching a missing executable would raise.
    if shutil.which('sudo'):
        x('sudo apt-get install -y build-essential')
    else:
        x('apt-get install -y build-essential')

# Install pip packages
assert x('pip install -U tree_sitter pydriller wandb transformers tokenizers scikit-learn matplotlib') == 0
if os.getenv("COLAB_RELEASE_TAG"):
    # On Colab, matplotlib is pinned back to 3.1.3 after the upgrade above
    assert x('pip install -U matplotlib==3.1.3') == 0

In [4]:
# Make the project repository root the working directory. Three starting points are handled:
#   1. already at the repo root (typically this cell being re-run): nothing to do
#   2. inside the repo's 'rq4' folder: step up to its parent, the repo root
#   3. anywhere else (only the notebook file is present): update an existing clone,
#      or make a fresh one, then enter it
REPO_DIR = 'llms-for-code-comment-consistency'
start_dir = os.path.abspath(os.getcwd())

if os.path.basename(start_dir) == REPO_DIR:
    # Without this guard a second run would clone another checkout inside the first one
    pass
elif start_dir.endswith('rq4'):
    # cd to parent of this folder for the root of the repo
    os.chdir(os.path.dirname(start_dir))
elif os.path.exists(REPO_DIR):
    os.chdir(REPO_DIR)
    # Update if already exists
    if x('git pull origin main --ff-only') != 0:
        # old version of git doesn't support --ff-only
        assert x('git pull origin main') == 0
else:
    assert x(f'git clone {MASTER_PROJECT_REPO_URL}') == 0
    os.chdir(REPO_DIR)

# Absolute path so the entry stays valid if the working directory changes later,
# and no duplicate entry is added when the cell is executed again
lib_dir = os.path.abspath('lib')
if lib_dir not in sys.path:
    sys.path.append(lib_dir)

In [None]:
import wandb

# With logging off, a run is still initialised in "disabled" mode; otherwise
# authenticate first and keep the run handle in `run` for the later cells.
if not USE_WANDB:
    wandb.init(project="execute-model-master", mode="disabled")
else:
    wandb.login(key=WANDB_KEY)
    run = wandb.init(project="execute-model-master")

In [5]:
# Model loading
from models import get_model

import torch

print('Loading previous checkpoint...')

print(f'Loading from local file {CHECKPOINT_FILE}...')
# map_location='cpu' makes the load succeed on machines without a GPU even when
# the checkpoint was written from CUDA tensors; the model is moved to the GPU
# below when one is available.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
prev_state = torch.load(CHECKPOINT_FILE, map_location='cpu')

# Architecture settings stored in the checkpoint
model_type = prev_state['model_type']
model_size = prev_state['model_size']
max_length = prev_state['max_length']

print('Preparing model state...')
my_model, tokenizer, config = get_model(model_type, model_size)

my_model.load_state_dict(prev_state['model_state_dict'])
start_epoch = prev_state['epoch']
# NOTE: the misspelt name is kept on purpose, other cells read `positive_threhshold`
positive_threhshold = prev_state['positive_threshold']

use_cuda = torch.cuda.is_available()

if use_cuda:
    my_model.cuda()
    x('nvidia-smi')

print(f'Loaded model {model_type} {model_size} from epoch {start_epoch}, file {CHECKPOINT_FILE}')

if USE_WANDB:
    # Label the wandb run with the repo count, file type and model settings
    run.name = f'execute-{len(REPO_NAMES)}-{FILE_EXTENSION}-repos-model-{model_type}-{model_size}-{max_length}'
    run.notes = f'''
    Model path: {CHECKPOINT_FILE}
    Repo list: {str(REPO_NAMES)}
    '''

KeyboardInterrupt: 

In [None]:
def clone_repo(repo_name, repo_folder):
    """Shallow-clone a GitHub repository into ``repo_folder``, replacing what is there.

    Args:
        repo_name: GitHub ``owner/repo`` slug.
        repo_folder: Destination directory. Anything already at this path is
            deleted before cloning.

    Raises:
        AssertionError: if ``git clone`` exits with a non-zero status.
        OSError: if the destination cannot be created or removed.
    """
    import shutil  # local import: not part of the file's top-level import line

    # Create the whole path so the parents exist, then drop the leaf so that git
    # clones into a path that does not exist yet
    os.makedirs(repo_folder, exist_ok=True)
    shutil.rmtree(repo_folder)
    repo_url = f'https://github.com/{repo_name}.git'
    # An argument list (instead of a space-split command string) keeps folders
    # whose path contains spaces intact
    clone = subprocess.run(['git', 'clone', '--depth=1', repo_url, repo_folder])
    assert clone.returncode == 0, f'git clone failed for {repo_url}'

In [None]:
from tqdm import tqdm

import json

from parsers import parse_entire_directory, create_dataset_example_object
from data import dataset_from_file, get_collate_fn

# Build the batch collation function once from the tokenizer returned by get_model;
# execute_model_dataset hands it to its DataLoader.
collate_fn = get_collate_fn(tokenizer)

def parse_repo_to_dataset(repo_name, repo_folder, branch='HEAD'):
    """Parse a checked-out repository into examples and a model-ready dataset.

    Every example found in files ending with FILE_EXTENSION gets a
    ``github_url`` field, the examples are written to
    ``repos/<owner>_<repo>.examples.json`` and that file is loaded back through
    ``dataset_from_file``.

    Args:
        repo_name: GitHub ``owner/repo`` slug, used for links and the file name.
        repo_folder: Local directory holding the checkout.
        branch: Git ref placed in the generated links. The default ``HEAD``
            makes GitHub resolve the repository's default branch, so links work
            whether that branch is called ``master``, ``main`` or anything else.

    Returns:
        Tuple ``(repo_examples, dataset)``: the parsed examples (with
        ``github_url`` added) and the dataset built from them.
    """
    print(f'Parsing all {FILE_EXTENSION} files in {repo_folder}...')
    repo_examples = parse_entire_directory(repo_folder, FILE_EXTENSION)
    # Add a 'github_url' field to each example
    url_prefix = f'https://github.com/{repo_name}/blob/{branch}'
    for ex in repo_examples:
        # presumably ex["path"] is relative to the repo root and comment_start_line
        # is 0-based (hence the +1 for the line anchor) - TODO confirm in parsers
        ex['github_url'] = f'{url_prefix}/{ex["path"]}#L{ex["comment_start_line"] + 1}'
    # Every example gets the placeholder label 0, and is used as both example arguments
    dataset_examples = [create_dataset_example_object({'label': 0}, ex, ex) for ex in repo_examples]
    # Do not depend on another cell having created the output folder
    os.makedirs('repos', exist_ok=True)
    fname = f"repos/{repo_name.replace('/', '_')}.examples.json"
    with open(fname, 'w') as f:
        json.dump(dataset_examples, f, indent=2)

    print(f'Examples saved to {fname}, now loading them into a dataset...')

    _, dataset = dataset_from_file(fname, model_type, tokenizer, max_length)
    return repo_examples, dataset

In [None]:
import numpy as np
from torch.utils.data import Dataset, DataLoader, TensorDataset

def execute_model_dataset(dataset):
    """Run the loaded model over ``dataset`` and threshold the class-1 probability.

    Args:
        dataset: Dataset that ``DataLoader`` can batch with ``collate_fn``.

    Returns:
        Tuple ``(predictions, probs)`` of two lists in dataset order (the loader
        does not shuffle): whether the class-1 probability exceeds
        ``positive_threhshold``, and that probability. Both lists are empty
        when the dataset yields no batches.
    """
    # Put the dataset in a dataloader and execute the model, final statistics
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    print('Predicting which examples are positive...')

    my_model.eval()
    on_gpu = torch.cuda.is_available()
    predictions = []
    probs = []
    # Inference only, so gradient tracking is switched off for the whole loop
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, attention_mask, labels, _ = batch
            if on_gpu:
                input_ids = input_ids.cuda()
                attention_mask = attention_mask.cuda()
                labels = labels.cuda()
            # The loss returned by the model is not needed for prediction and is discarded
            prob, _, _ = my_model(input_ids, attention_mask, labels)
            # Predict based on positive_threshold
            pred = prob[:, 1] > positive_threhshold
            predictions.append(pred.cpu().numpy())
            probs.append(prob.cpu().numpy()[:, 1])

    if predictions:
        predictions = np.concatenate(predictions).tolist()
        probs = np.concatenate(probs).tolist()
    # else: np.concatenate raises ValueError on an empty list, so with no batches
    # the two (already empty) lists are returned as they are

    # Print final statistics
    print()
    print(f"Number of examples: {len(predictions)}")
    print(f"Number of positives: {sum(predictions)}")
    return predictions, probs


IndentationError: expected an indented block (620150245.py, line 5)

In [None]:
# Save the results to a file, and print the highest-probability examples

def process_results(repo_name, predictions, probs, repo_examples):
    """Order one repository's predictions, write them to JSON and print the top ten.

    Examples whose ``comment_summary`` is empty are left out. The rest are
    bucketed by identical comment text; buckets whose members carry more than
    one distinct simple name come first, followed by the remaining positives
    and then the remaining negatives, each ordered by ascending
    ``seconds_since_last_commit``.

    Args:
        repo_name: GitHub ``owner/repo`` slug; ``/`` becomes ``#`` in the file name.
        predictions: One prediction per example.
        probs: One probability per example.
        repo_examples: The example dicts the predictions belong to.

    Side effects:
        Writes ``results_<owner>#<repo>.json`` in the working directory, passes
        it to ``wandb.save`` when USE_WANDB is set, and prints to stdout.
    """
    import json
    from pprint import pprint

    def by_prob(entry):
        return entry['prob']

    def by_age(entry):
        return entry['example']['seconds_since_last_commit']

    all_results = [
        {'prediction': prediction, 'prob': prob, 'example': example}
        for prediction, prob, example in zip(predictions, probs, repo_examples)
    ]

    # Thesis: some comments might be copy-pasted, so group those together, maybe one of them is wrong
    by_comment = {}
    for entry in all_results:
        summary = entry['example']['comment_summary']
        if len(summary) > 0:
            by_comment.setdefault(summary, []).append(entry)

    grouped = []
    ungrouped = []
    for bucket in by_comment.values():
        ranked = sorted(bucket, key=by_prob, reverse=True)
        simple_names = {entry['example']['qualified_name'].split('.')[-1] for entry in ranked}
        # More than one distinct name already implies more than one entry in the bucket
        destination = grouped if len(simple_names) > 1 else ungrouped
        destination.extend(ranked)

    # Explicit comparisons are kept so that a prediction equal to neither True
    # nor False lands in neither list
    positives = [entry for entry in ungrouped if entry['prediction'] == True]
    negatives = [entry for entry in ungrouped if entry['prediction'] == False]
    # Sort by seconds_since_last_commit in ascending order, with positives first
    ordered = grouped + sorted(positives, key=by_age) + sorted(negatives, key=by_age)

    results_name = f'results_{repo_name.replace("/", "#")}.json'

    with open(results_name, 'w') as f:
        json.dump(ordered, f)
        print(f'Saved results to {results_name}...')

    if USE_WANDB:
        wandb.save(results_name)

    print('Printing the most positive examples:')
    to_print = 10
    most_positive = sorted(ordered, key=by_prob, reverse=True)[:to_print]
    for entry in most_positive:
        pprint(entry)

IndentationError: expected an indented block (2052471909.py, line 10)

In [None]:
import tempfile

# Output folder for the per-repo example files; created with the standard library
# rather than by launching an external `mkdir`
os.makedirs('repos', exist_ok=True)

for repo_name in REPO_NAMES:
    print(f'Processing repo {repo_name}...')
    # Clone into a temporary folder, which is removed again when the block exits
    with tempfile.TemporaryDirectory() as tmp:
        clone_repo(repo_name, tmp)
        examples, dataset = parse_repo_to_dataset(repo_name, tmp)
        if len(examples) == 0:
            print(f'No examples found for {repo_name}, skipping...')
            continue
        predictions, probs = execute_model_dataset(dataset)
        process_results(repo_name, predictions, probs, examples)

In [None]:
# Finish the wandb run started by the earlier wandb.init call (made in both enabled and disabled mode)
wandb.finish()
print('Done!')