In [1]:
import sys, subprocess, os

# Define function x that given a command string, runs it with subprocess and streams the output
def x(cmd):
    return subprocess.run(cmd.split(" ")).returncode

FOLDER_NAME = 'deep-jit-inconsistency-detection'
DATA_PATH = 'data/Java-22k'

ADDITIONAL_DATA_PATHS = ['data/benchmarks/java.json', 'data/Go-22k/test.json', 'data/Python-22k/test.json', 'data/JS-22k/test.json', 'data/Java-22k/test.json', 'data/benchmarks/python.json', 'data/benchmarks/go.json', 'data/benchmarks/js.json']

MY_REPO_URL = 'https://github.com/pelmers/llms-for-code-comment-consistency.git'

# If running as a script, allow os.environ to overwrite these options
if __name__ == '__main__':
    import os
    for k, v in os.environ.items():
        if k in globals():
            # First check if v is a boolean or a number and convert to the right type
            if v.lower() == 'true':
                v = True
            elif v.lower() == 'false':
                v = False
            elif v.isnumeric():
                v = int(v)
            # Or a float
            elif '.' in v and v.replace('.', '').isnumeric():
                v = float(v)
            # Or a list
            elif v.startswith('[') and v.endswith(']'):
                v = v[1:-1].split(',')
                v = [a.strip() for a in v]
            globals()[k] = v

# If we're not in my repo (../.git exists?), then clone it
if os.path.exists('../.git'):
    data_rel = '../'
else:
    if not os.path.exists('llms-for-code-comment-consistency'):
        assert x(f'git clone {MY_REPO_URL}') == 0
    data_rel = 'llms-for-code-comment-consistency/'

DATA_FOLDER = os.path.join(FOLDER_NAME, 'res', 'data')
if not os.path.exists(FOLDER_NAME):
    x('git clone -b pelmers-replication https://github.com/pelmers/deep-jit-inconsistency-detection.git')

DATA_ARCHIVE = os.path.join(data_rel, DATA_PATH)

def ensure_data_archive(data_path):
    # If the data folder does not exist, extract it from the .tar.gz file
    folder_path = os.path.join(data_rel, data_path)
    if not os.path.exists(folder_path):
        print(f'Ensuring data exists at {folder_path}.tar.gz')
        archive_path = folder_path + '.tar.gz'
        archive_data_path = data_path + '.tar.gz'
        if not os.path.exists(archive_path):
            print(f'Downloading data for {archive_data_path} from server')
            assert x(f'wget -O {archive_path} https://file2.pelmers.com/{archive_data_path}') == 0


def extract_archive(archive_path):
    print(f'Extracting data from {archive_path} to {DATA_FOLDER}...')
    x('mkdir -p {}'.format(DATA_FOLDER))
    assert x('tar -xvzf {} -C {}'.format(archive_path, DATA_FOLDER)) == 0


for data_path in [DATA_PATH] + [os.path.dirname(p) for p in ADDITIONAL_DATA_PATHS]:
    ensure_data_archive(data_path)

x('sudo apt-get install build-essential nvidia-cuda-dev --yes')
x('pip install dpu_utils pycocoevalcap torch_scatter nltk')
import nltk
nltk.download('stopwords')

RUN_LANGUAGE = 'java'
RUN_LANGUAGE = 'go' if 'go' in DATA_PATH.lower() else RUN_LANGUAGE
RUN_LANGUAGE = 'py' if 'python' in DATA_PATH.lower() else RUN_LANGUAGE
RUN_LANGUAGE = 'js' if 'javascript' in DATA_PATH.lower() or 'js' in DATA_PATH.lower() else RUN_LANGUAGE

Ensuring data exists at ../notebooks/data/Java-22k.tar.gz
Ensuring data exists at ../notebooks/data/benchmarks.tar.gz
Ensuring data exists at ../notebooks/data/Go-22k.tar.gz
Ensuring data exists at ../notebooks/data/Python-22k.tar.gz
Ensuring data exists at ../notebooks/data/Java-22k.tar.gz
Ensuring data exists at ../notebooks/data/benchmarks.tar.gz
Ensuring data exists at ../notebooks/data/benchmarks.tar.gz
Ensuring data exists at ../notebooks/data/benchmarks.tar.gz
Reading package lists...
Building dependency tree...
Reading state information...
build-essential is already the newest version (12.9ubuntu3).
nvidia-cuda-dev is already the newest version (11.5.1-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 90 not upgraded.


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
sys.path.append(FOLDER_NAME)
sys.path.append(os.path.join(FOLDER_NAME, 'comment_update'))

from run_comment_model import build_model, train, evaluate, ModuleManager
from data_loader import read_diff_examples_from_file

  from .autonotebook import tqdm as notebook_tqdm


Failed parsing: Summary


In [3]:
import json

task = 'detect'
manager = ModuleManager(
    attend_code_sequence_states=True,
    attend_code_graph_states=False,
    features=False,
    posthoc=False,
    task=task
)

def load_examples(data_path):
    data_foldername = os.path.dirname(data_path).split('/')[-1]
    data_filename = os.path.basename(data_path)
    examples = read_diff_examples_from_file(f'{DATA_FOLDER}/{data_foldername}/{data_filename}')
    import random
    random.shuffle(examples)
    return examples

extract_archive(os.path.join(data_rel, DATA_PATH + '.tar.gz'))
train_examples = load_examples(DATA_PATH + '/train.json')
valid_examples = load_examples(DATA_PATH + '/valid.json')
test_examples = load_examples(DATA_PATH + '/test.json')

import torch
if torch.cuda.is_available():
    x('nvidia-smi')

Attend code sequence states: True
Attend code graph states: False
Features: False
Task: detect
Extracting data from ../notebooks/data/Java-22k.tar.gz to deep-jit-inconsistency-detection/res/data...
Java-22k/
Java-22k/extras.json
Java-22k/test.json
Java-22k/train.json
Java-22k/valid.json
Java-22k/metadata.json


In [4]:
manager.initialize(train_examples)
model_save_path = f'{FOLDER_NAME}/detect-{RUN_LANGUAGE}-{DATA_PATH.split("/")[-1]}.pth'
print(f'Creating model, will save to {model_save_path}')
model = build_model(task, model_save_path, manager)

NL vocabulary size: 29
Code vocabulary size: 29
Using 28 pre-trained NL embeddings
Using 28 pre-trained code embeddings
Creating model, will save to deep-jit-inconsistency-detection/detect-java-data.pth


In [5]:
train(model, train_examples, valid_examples)

Training with 17600 examples (validation 2200)


In [None]:

NEGATIVE_TO_POSITIVE_RATIO = {
    'java': 19,
    'go': 19,
    'py': 19,
    'js': 19,
}

# this weight is used for the weighted f1 score
model.negative_class_weight = NEGATIVE_TO_POSITIVE_RATIO[RUN_LANGUAGE]
print(f'Using class weight {model.negative_class_weight}')
evaluate(task, model, test_examples, 'deepjit_detect_model', rerank=False)

Using class weight 17.35
Evaluating 20 examples
Testing batch 0/1
True positives: 20.0
False positives: 0.0
True negatives: 0.0
False negatives: 0.0
Precision: 1.0
Recall: 1.0
F1: 1.0
Weighted F1: 1.0
Accuracy: 1.0



In [None]:
print('Evaluating additional test paths')

for path in ADDITIONAL_DATA_PATHS:
    pfne = os.path.basename(path).split('.')[0].lower()
    print(f'Evaluating {path}')
    archive_path = os.path.join(data_rel, os.path.dirname(path) + '.tar.gz')
    extract_archive(archive_path)
    data_extracted_folder = os.path.join(DATA_FOLDER, os.path.dirname(path).split('/')[-1])
    test_examples = load_examples(path)
    test_language = 'py' if 'python' in pfne else 'go' if 'go' in pfne else 'js' if 'js' in pfne else 'java'

    model.negative_class_weight = NEGATIVE_TO_POSITIVE_RATIO[test_language]
    print(f'Using class weight {model.negative_class_weight} for {test_language}')
    evaluate(task, model, test_examples, 'deepjit_detect_model', rerank=False)

Evaluating additional test paths
Evaluating data/benchmarks/java.json
Extracting data from ../notebooks/data/benchmarks.tar.gz to deep-jit-inconsistency-detection/res/data...
benchmarks/
benchmarks/js.json
benchmarks/python.json
benchmarks/java.json
benchmarks/go.json
Using class weight 17.35
Evaluating 20 examples
Testing batch 0/1
True positives: 10.0
False positives: 10.0
True negatives: 0.0
False negatives: 0.0
Precision: 0.5
Recall: 1.0
F1: 0.6666666666666666
Weighted F1: 0.10335917312661501
Accuracy: 0.5

Evaluating data/Go-22k/test.json
Extracting data from ../notebooks/data/Go-22k.tar.gz to deep-jit-inconsistency-detection/res/data...
Go-22k/
Go-22k/train.json
Go-22k/metadata.json
Go-22k/extras.json
Go-22k/valid.json
Go-22k/test.json
Using class weight 17.35
Evaluating 20 examples
Testing batch 0/1
True positives: 20.0
False positives: 0.0
True negatives: 0.0
False negatives: 0.0
Precision: 1.0
Recall: 1.0
F1: 1.0
Weighted F1: 1.0
Accuracy: 1.0

Evaluating data/Python-22k/test.