In [4]:
import pickle
import gzip
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import sentencepiece as spm
import textdistance
import torch

from representjs import RUN_DIR, CSNJS_DIR
from data.precomputed_dataset import PrecomputedDataset

In [5]:
# Default CodeSearchNet-JavaScript artifacts, both resolved relative to CSNJS_DIR
# and stored as plain strings.
DEFAULT_CSNJS_TRAIN_FILEPATH, DEFAULT_SPM_UNIGRAM_FILEPATH = (
    str(CSNJS_DIR / filename)
    for filename in (
        "javascript_dedupe_definitions_nonoverlap_v2_train.jsonl.gz",
        "csnjs_8k_9995p_unigram_url.model",
    )
)

# NOTE(review): machine-specific absolute path; adjust it for your environment.
AUGMENTED_DATASET_PATH = (
    "/data/ajay/coderep/representjs/data/codesearchnet_javascript/"
    "javascript_augmented.pickle.gz"
)

In [6]:
# Settings for the tokenizer and the precomputed dataset.
# Combinations of (min_alternatives, subword_regularization_alpha) noted so far:
#   min_alternatives=2, alpha=0.1 -> used for BiLSTM
#   min_alternatives=1, alpha=0.1 -> contrastive Transformer  (TODO: try this)
#   min_alternatives=1, alpha=0.  -> hybrid MLM, contrastive Transformer
config = {
    "train_filepath": AUGMENTED_DATASET_PATH,
    "program_mode": "contrastive",
    "limit_dataset_size": -1,
    "subword_regularization_alpha": 0.1,
    "min_alternatives": 2,
    "max_length": 1024,
    "spm_filepath": DEFAULT_SPM_UNIGRAM_FILEPATH,
}

# Load the SentencePiece model and look up the ids of its special pieces.
sp = spm.SentencePieceProcessor()
sp.Load(config["spm_filepath"])
pad_id, mask_id = (sp.PieceToId(piece) for piece in ("[PAD]", "[MASK]"))

# Build the dataset from the file named in the config; every tunable comes from `config`.
train_dataset = PrecomputedDataset(
    config["train_filepath"],
    sp=sp,
    program_mode=config["program_mode"],
    min_alternatives=config["min_alternatives"],
    limit_size=config["limit_dataset_size"],
    subword_regularization_alpha=config["subword_regularization_alpha"],
    max_length=config["max_length"],
)

2020-11-14 14:36:56.641 | DEBUG    | data.precomputed_dataset:__init__:37 - Loading /data/ajay/coderep/representjs/data/codesearchnet_javascript/javascript_augmented.pickle.gz
2020-11-14 14:40:02.521 | DEBUG    | data.precomputed_dataset:__init__:45 - Loaded 1843099 examples in 185.877s
2020-11-14 14:40:15.088 | DEBUG    | data.precomputed_dataset:__init__:50 - Converted examples to lists of alternatives
2020-11-14 14:40:15.484 | DEBUG    | data.precomputed_dataset:__init__:53 - Filtered dataset to 1644353 examples with at least 2 alternatives


# Define the edit-distance dataset over negative (randomly paired) programs

In [27]:
class NegEDDataset(torch.utils.data.Dataset):
    """Similarity ratios between randomly sampled pairs of distinct programs.

    Each item is computed from a freshly drawn pair of distinct entries of
    ``precomputed_dataset``; the index passed to ``__getitem__`` is ignored.
    The value returned is ``(maxlen - distance) / maxlen`` where ``distance``
    is the edit distance between the two token sequences and ``maxlen`` is
    the length of the longer one.

    Indices are drawn with torch's RNG rather than NumPy's. DataLoader seeds
    torch separately in every worker process, whereas NumPy's global state can
    be copied unchanged into each worker, which makes all workers return the
    same "random" pairs. Use ``torch.manual_seed`` for reproducibility.

    Args:
        precomputed_dataset: Sized, indexable collection whose items unpack as
            ``(tokens, _)``; ``tokens`` must provide ``.tolist()``
            (assumed to be a 1-D tensor of token ids -- TODO confirm).
        sample_size: Value reported by ``len()``, i.e. how many random pairs
            one pass over the dataset produces.
        distance_fn: Optional ``f(a, b) -> number`` computing the distance
            between two lists. Defaults to ``textdistance.levenshtein.distance``.
    """

    def __init__(self, precomputed_dataset, sample_size=1644353, distance_fn=None):
        super().__init__()
        self.precomputed_dataset = precomputed_dataset
        self.n = len(self.precomputed_dataset)
        self.sample_size = sample_size
        # Kept as None and resolved lazily so the default metric is only looked
        # up when an item is actually requested.
        self.distance_fn = distance_fn

    def __len__(self):
        return self.sample_size

    def _sample_pair(self):
        """Draw two distinct indices uniformly at random using torch's RNG."""
        if self.n < 2:
            raise ValueError(
                f"NegEDDataset needs at least 2 examples to form a pair, got {self.n}"
            )
        idx_a = int(torch.randint(self.n, (1,)).item())
        # Draw from the n - 1 remaining slots and skip over idx_a so that every
        # ordered pair of distinct indices is equally likely.
        idx_b = int(torch.randint(self.n - 1, (1,)).item())
        if idx_b >= idx_a:
            idx_b += 1
        return idx_a, idx_b

    def __getitem__(self, idx):
        """Return the similarity ratio of a random pair; ``idx`` is unused.

        Raises:
            ValueError: if the underlying dataset has fewer than two examples.
        """
        idx_a, idx_b = self._sample_pair()

        a, _ = self.precomputed_dataset[idx_a]
        b, _ = self.precomputed_dataset[idx_b]

        a = a.tolist()
        b = b.tolist()

        maxlen = max(len(a), len(b))
        if maxlen == 0:
            # Two empty sequences are identical; avoid dividing by zero.
            return 1.0

        distance_fn = self.distance_fn
        if distance_fn is None:
            distance_fn = textdistance.levenshtein.distance
        distance = distance_fn(a, b)

        return (maxlen - distance) / float(maxlen)  # similarity ratio

In [28]:
# Size one pass from the data actually loaded instead of relying on
# NegEDDataset's hard-coded default, which only matches one particular
# filtering of the training set.
eddataset = NegEDDataset(train_dataset, sample_size=len(train_dataset))

# Define parallel dataloader

In [37]:
def _seed_numpy_in_worker(worker_id):
    """Reseed NumPy's global RNG inside a DataLoader worker process.

    torch assigns every worker its own seed, exposed through
    ``torch.initial_seed()``. NumPy's global state, by contrast, may be copied
    unchanged from the parent process, in which case any NumPy-based
    randomness in the dataset would be identical across workers.

    Args:
        worker_id: Index of the worker; unused, required by the DataLoader hook.
    """
    # np.random.seed only accepts 32-bit values.
    np.random.seed(torch.initial_seed() % 2**32)


dl = torch.utils.data.DataLoader(
    eddataset,
    batch_size=32,
    shuffle=True,
    num_workers=64,
    drop_last=False,
    worker_init_fn=_seed_numpy_in_worker,
)

In [38]:
# Accumulator for the similarity ratio of every sampled pair, filled batch by batch.
all_ratios = list()

In [None]:
print("text distance, similarities")

# Take the reporting cadence from the loader rather than repeating its batch size.
log_every = dl.batch_size * 10    # print running statistics every 10 full batches
save_every = dl.batch_size * 100  # checkpoint the ratios every 100 full batches

# The counter always mirrors the accumulator, so interrupting or re-running this
# cell cannot leave `i` out of sync with `all_ratios` (which would skew the
# standard error and the checkpoint file names).
i = len(all_ratios)
for batch in dl:
    all_ratios.extend(batch.numpy())
    i = len(all_ratios)
    if i % log_every == 0:
        average_ratio = np.mean(all_ratios)
        sdev = np.std(all_ratios)
        serr = sdev / np.sqrt(i)  # standard error of the mean
        print(f"Processed {i} pairs of programs ({i / len(eddataset) * 100} %). Average ratio: {average_ratio} += {serr}")
    if i % save_every == 0:
        print(f"Saving negatives_all_ratios_{i}.npy")
        np.save(f"negatives_all_ratios_{i}.npy", np.array(all_ratios))

# Final statistics over everything collected.
average_ratio = np.mean(all_ratios)
sdev = np.std(all_ratios)
serr = sdev / np.sqrt(i)
print(f"Processed {i} pairs of programs. Average ratio: {average_ratio} += {serr}")

text distance, similarities
Processed 320 pairs of programs (0.01946054162336189 %). Average ratio: 0.13756285512574812 += 0.003066001753680389
Processed 640 pairs of programs (0.03892108324672378 %). Average ratio: 0.13758622693569766 += 0.0021677638409255146
Processed 960 pairs of programs (0.05838162487008568 %). Average ratio: 0.1376094484446186 += 0.0017697874441497425
Processed 1280 pairs of programs (0.07784216649344756 %). Average ratio: 0.1376325210977131 += 0.0015325219944562678
Processed 1600 pairs of programs (0.09730270811680947 %). Average ratio: 0.13765544632171434 += 0.0013705878788569227
Processed 1920 pairs of programs (0.11676324974017135 %). Average ratio: 0.13767822552518055 += 0.001251041284922487
Processed 2240 pairs of programs (0.13622379136353327 %). Average ratio: 0.13765368299696915 += 0.0011578512856472937
Processed 2560 pairs of programs (0.15568433298689513 %). Average ratio: 0.1375979439297559 += 0.0010825415469923577
Processed 2880 pairs of programs (0.

In [41]:
# Summary statistics over every ratio collected so far: mean, standard
# deviation, and standard error of the mean (using the pair counter `i`).
ratios_arr = np.asarray(all_ratios)
average_ratio = ratios_arr.mean()
sdev = ratios_arr.std()
serr = sdev / np.sqrt(i)
print(f"Processed {i} pairs of programs. Average ratio: {average_ratio} += {serr}")

Processed 94304 pairs of programs. Average ratio: 0.13776112903879706 += 0.00017934406281853248


In [43]:
# Write all collected ratios to disk; the file name records how many pairs were processed.
ratios_path = f"negatives_all_ratios_{i}.npy"
np.save(ratios_path, np.array(all_ratios))