<a href="https://colab.research.google.com/github/ncoop57/t5_overlap/blob/main/overlap_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment setup: install the two helper packages this notebook needs.
# gdown is used right below to download from Google Drive; sentencepiece is
# imported later for tokenization.
! pip install gdown sentencepiece
# Download three files from Google Drive by file ID.
# NOTE(review): presumably these are datasets.zip, DataSnooping_Analysis_Data.zip
# and the dl4se_vocab.model tokenizer file read by later cells - which ID maps
# to which file is not visible here; confirm.
! gdown https://drive.google.com/uc?id=1zKW5bOMjKHfX75d_uz8OF2teTb5Dse_M
! gdown https://drive.google.com/uc?id=1EFPcdIpl-uez4e0918Yk4hQrLRyGczxf
! gdown https://drive.google.com/uc?id=1KBVkhFZ80i1PW80aZexFGvJjyEClelX1
# Unpack both archives into the working directory; later cells read from the
# relative paths datasets/... and DataSnooping_Analysis_Data/...
! unzip datasets.zip
! unzip DataSnooping_Analysis_Data.zip

Collecting gdown
  Downloading https://files.pythonhosted.org/packages/52/b9/d426f164f35bb50d512a77d6a7c5eb70b2bea3459dc10f73f130ba732810/gdown-3.13.0.tar.gz
  Installing build dependencies ... [?25ldone
[?25hCollecting filelock (from gdown)
  Downloading https://files.pythonhosted.org/packages/93/83/71a2ee6158bb9f39a90c0dea1637f81d5eef866e188e1971a1b1ab01a35a/filelock-3.0.12-py3-none-any.whl
Collecting tqdm (from gdown)
[?25l  Downloading https://files.pythonhosted.org/packages/72/8a/34efae5cf9924328a8f34eeb2fdaae14c011462d9f0e3fcded48e1266d1c/tqdm-4.60.0-py2.py3-none-any.whl (75kB)
[K    100% |████████████████████████████████| 81kB 7.0MB/s 
Building wheels for collected packages: gdown
  Running setup.py bdist_wheel for gdown ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/ba/fa/c5/12813d7496f34652c43a471e11a780e769889d06e34735c32e
Successfully built gdown
Installing collected packages: filelock, tqdm, gdown
Successfully installed filelock-3.0.12 gdown-

In [2]:
import pandas as pd

from pathlib import Path

# Root directory of the fine-tuning TSV files, relative to the working directory.
fine_tune_path = Path("datasets/tsv/fine-tuning")

def get_agab_dfs():
    """Load the AGabs training pairs and the AGabs analysis table.

    Returns:
        tuple: ``(train, test)`` DataFrames.

        * ``train`` is read from ``AGabs/training.tsv`` under
          ``fine_tune_path``: a tab-separated file with no header row, whose
          two columns are named ``input`` and ``target``.
        * ``test`` is read from ``DataSnooping_Analysis_Data/AGabs.csv`` with
          the first column as the index, sorted ascending by ``IS_Perfect``.
    """
    train_file = fine_tune_path/"AGabs/training.tsv"
    train = pd.read_csv(train_file, sep="\t", names=["input", "target"])

    analysis = pd.read_csv("DataSnooping_Analysis_Data/AGabs.csv", index_col=0)
    analysis_sorted = analysis.sort_values("IS_Perfect")

    return train, analysis_sorted

def get_agraw_dfs():
    """Load the AGraw training pairs and the AGraw analysis table.

    Returns:
        tuple: ``(train, test)`` DataFrames.

        * ``train`` is read from ``AGraw/training.tsv`` under
          ``fine_tune_path``: a tab-separated file with no header row, whose
          two columns are named ``input`` and ``target``.
        * ``test`` is read from ``DataSnooping_Analysis_Data/AGraw.csv`` with
          the first column as the index, sorted ascending by ``IS_Perfect``.
    """
    train_file = fine_tune_path/"AGraw/training.tsv"
    train = pd.read_csv(train_file, sep="\t", names=["input", "target"])

    analysis = pd.read_csv("DataSnooping_Analysis_Data/AGraw.csv", index_col=0)
    analysis_sorted = analysis.sort_values("IS_Perfect")

    return train, analysis_sorted

def get_bfsm_dfs():
    """Load the BFsmall training pairs and the BFsmall analysis table.

    The training file is located through the shared ``fine_tune_path`` root
    (rather than a repeated literal) so that relocating the data only
    requires changing that one constant.

    Returns:
        tuple: ``(train, test)`` DataFrames.

        * ``train`` is read from ``BFsmall/training.tsv`` under
          ``fine_tune_path``: a tab-separated file with no header row, whose
          two columns are named ``input`` and ``target``.
        * ``test`` is read from ``DataSnooping_Analysis_Data/BFsmall.csv``
          with the first column as the index, sorted ascending by
          ``IS_Perfect``.
    """
    trn_bfsm_df = pd.read_csv(
        fine_tune_path/"BFsmall/training.tsv", sep="\t",
        names=["input", "target"]
    )
    tst_bfsm_df = pd.read_csv(
        "DataSnooping_Analysis_Data/BFsmall.csv", index_col=0
    ).sort_values("IS_Perfect")

    return trn_bfsm_df, tst_bfsm_df

def get_bfmed_dfs():
    """Load the BFmedium training pairs and the BFmedium analysis table.

    The training file is located through the shared ``fine_tune_path`` root
    (rather than a repeated literal) so that relocating the data only
    requires changing that one constant.

    Returns:
        tuple: ``(train, test)`` DataFrames.

        * ``train`` is read from ``BFmedium/training.tsv`` under
          ``fine_tune_path``: a tab-separated file with no header row, whose
          two columns are named ``input`` and ``target``.
        * ``test`` is read from ``DataSnooping_Analysis_Data/BFmedium.csv``
          with the first column as the index, sorted ascending by
          ``IS_Perfect``.
    """
    trn_bfmed_df = pd.read_csv(
        fine_tune_path/"BFmedium/training.tsv", sep="\t",
        names=["input", "target"]
    )
    tst_bfmed_df = pd.read_csv(
        "DataSnooping_Analysis_Data/BFmedium.csv", index_col=0
    ).sort_values("IS_Perfect")

    return trn_bfmed_df, tst_bfmed_df

def get_codesum_dfs():
    """Load the CS training pairs and the CodeSummarization analysis table.

    The training file is located through the shared ``fine_tune_path`` root
    (rather than a repeated literal) so that relocating the data only
    requires changing that one constant.

    Returns:
        tuple: ``(train, test)`` DataFrames.

        * ``train`` is read from ``CS/training.tsv`` under
          ``fine_tune_path``: a tab-separated file with no header row, whose
          two columns are named ``input`` and ``target``.
        * ``test`` is read from
          ``DataSnooping_Analysis_Data/CodeSummarization.csv`` with the first
          column as the index, sorted ascending by ``BLEU``.
    """
    trn_codesum_df = pd.read_csv(
        fine_tune_path/"CS/training.tsv", sep="\t",
        names=["input", "target"]
    )
    tst_codesum_df = pd.read_csv(
        "DataSnooping_Analysis_Data/CodeSummarization.csv", index_col=0
    ).sort_values("BLEU")

    return trn_codesum_df, tst_codesum_df

def get_muts_dfs():
    """Load the MG training pairs and the Mutants analysis table.

    The training file is located through the shared ``fine_tune_path`` root
    (rather than a repeated literal) so that relocating the data only
    requires changing that one constant.

    Returns:
        tuple: ``(train, test)`` DataFrames.

        * ``train`` is read from ``MG/training.tsv`` under
          ``fine_tune_path``: a tab-separated file with no header row, whose
          two columns are named ``input`` and ``target``.
        * ``test`` is read from ``DataSnooping_Analysis_Data/Mutants.csv``
          with the first column as the index, sorted ascending by ``BLEU``.
    """
    trn_muts_df = pd.read_csv(
        fine_tune_path/"MG/training.tsv", sep="\t",
        names=["input", "target"]
    )
    tst_muts_df = pd.read_csv(
        "DataSnooping_Analysis_Data/Mutants.csv", index_col=0
    ).sort_values("BLEU")

    return trn_muts_df, tst_muts_df

def sample_bst_wrst(df, pop=1_000, n=100, random_state=None):
    """Draw random samples from the last and the first ``pop`` rows of ``df``.

    The function does not sort ``df``; "best" is simply the tail and "worst"
    the head, so the caller decides the ordering (e.g. ascending by a quality
    score).

    Args:
        df: DataFrame to sample from.
        pop: Number of rows at each end that form the candidate pools.
        n: Number of rows to draw (without replacement) from each pool.
        random_state: Optional seed / random state forwarded to
            ``DataFrame.sample`` for both draws. ``None`` (the default) keeps
            the previous unseeded behaviour.

    Returns:
        tuple: ``(best_sample, worst_sample)`` DataFrames of ``n`` rows each.

    Raises:
        ValueError: Raised by pandas if ``n`` exceeds the size of a pool.
    """
    bst = df.tail(pop)
    wrst = df.head(pop)

    return (
        bst.sample(n, random_state=random_state),
        wrst.sample(n, random_state=random_state),
    )

In [3]:
# This code was taken from https://gist.github.com/kylebgorman/1081951/bce3de986e4b05fc0b63d4d9e0cfa4bde6664365
def _dist(A, B, insertion, deletion, substitution):
    D = np.zeros((len(A) + 1, len(B) + 1))
    for i in range(len(A)):
        D[i + 1][0] = D[i][0] + deletion
    for j in range(len(B)):
        D[0][j + 1] = D[0][j] + insertion
    for i in range(len(A)): # fill out middle of matrix
        for j in range(len(B)):
            if A[i] == B[j]:
                D[i + 1][j + 1] = D[i][j] # aka, it's free.
            else:
                D[i + 1][j + 1] = min(D[i + 1][j] + insertion,
                                      D[i][j + 1] + deletion,
                                      D[i][j]     + substitution)
    return D

def levenshtein_distance(l1, l2, normalize=False):
    """Levenshtein distance between two sequences with unit edit costs.

    Args:
        l1: First sequence (string, list of tokens, ...).
        l2: Second sequence.
        normalize: If true, return ``1 - distance / max(len(l1), len(l2))``
            instead of the raw distance. Note that this is a similarity:
            1.0 means identical sequences, 0.0 means nothing is shared.

    Returns:
        The raw edit distance, or the normalized similarity when
        ``normalize`` is true. Two empty sequences are identical, so their
        normalized score is 1.0.
    """
    dist = _dist(l1, l2, 1, 1, 1)[-1][-1]
    if not normalize:
        return dist

    longest = max(len(l1), len(l2))
    if longest == 0:
        # Both sequences are empty: avoid the 0/0 division, which would
        # otherwise yield nan (or raise) instead of "identical".
        return 1.
    return 1. - dist / longest

In [5]:
import numpy as np

from joblib import Parallel, delayed
import scipy.stats as st
from statistics import mean, median, stdev

def get_dists(trn, tst):
    """Compute the Levenshtein distance for every (train, test) pair of texts.

    Each text is tokenized with the SentencePiece model stored in
    ``dl4se_vocab.model`` (looked up relative to the working directory) and
    the distance is taken over the resulting token sequences.

    Args:
        trn: Iterable of training-side strings.
        tst: Iterable of test-side strings.

    Returns:
        list: ``len(trn) * len(tst)`` distances, ordered with ``trn`` as the
        outer loop and ``tst`` as the inner loop.
    """
    # Kept as a local import, as in the original, so the notebook's import
    # cells do not require sentencepiece.
    import sentencepiece as spm
    s = spm.SentencePieceProcessor(model_file='dl4se_vocab.model')

    # Tokenize every string exactly once up front; encoding inside the
    # pairwise loop would repeat the same work for every pair.
    trn_tokens = [s.encode(i) for i in trn]
    tst_tokens = [s.encode(j) for j in tst]

    dists = Parallel(n_jobs=-1)(
        delayed(levenshtein_distance)(i, j)
        for i in trn_tokens for j in tst_tokens
    )

    return dists

# From https://stackoverflow.com/a/51288518/5768407 by Yetti
def ci_overlap(start1, end1, start2, end2):
    """Return how much the range (start1, end1) overlaps with (start2, end2).

    The result is 0 when the ranges do not overlap. Each range is assumed to
    be given with its start not greater than its end.
    """
    # Distance from the first start to the second end, clamped at zero ...
    reach = max((end2-start1), 0)
    # ... minus the part of range 2 sticking out past the end of range 1 ...
    right_excess = max((end2-end1), 0)
    # ... minus the part of range 1 lying before the start of range 2.
    left_gap = max((start2-start1), 0)
    return max(reach - right_excess - left_gap, 0)

def _summarize_dists(dists, confidence):
    """Return descriptive statistics and a Student-t interval for ``dists``."""
    # The confidence level is passed positionally: SciPy renamed this keyword
    # from ``alpha`` to ``confidence``, so the positional form is the only
    # spelling that works on both older and current releases.
    ci = st.t.interval(
        confidence, df=len(dists)-1,
        loc=np.mean(dists),
        scale=st.sem(dists)
    )
    return {
        "dists": dists,
        "mean": mean(dists),
        "median": median(dists),
        "stdev": stdev(dists),
        "ci": ci,
    }


def run_experiments(data_func, n_exp=100, pop=1_000, n_samp=100, alpha = 0.95):
    """Repeatedly sample train/test rows and compare their pairwise distances.

    For each of ``n_exp`` repetitions, ``n_samp`` rows are drawn from the
    tail ("best") and from the head ("worst") ``pop`` rows of both frames
    returned by ``data_func``. Distances are computed between the training
    ``target`` column and the test ``Groundtruth`` column, best against best
    and worst against worst, and pooled over all repetitions.

    NOTE(review): head/tail are taken from the frames as ``data_func``
    returns them. The loaders in this notebook sort only the test frame, so
    the training "best"/"worst" pools are the last/first rows in file order
    - confirm this is intended.

    Args:
        data_func: Zero-argument callable returning ``(train_df, test_df)``.
        n_exp: Number of sampling repetitions.
        pop: Size of the head/tail pools to sample from.
        n_samp: Rows drawn from each pool per repetition.
        alpha: Confidence level of the reported intervals (0.95 = 95 %).

    Returns:
        dict: ``"best"`` and ``"worst"`` each map to a dict with the pooled
        ``dists`` plus their ``mean``, ``median``, ``stdev`` and ``ci``;
        ``"overlap"`` is the overlap of the two intervals and ``"ci_alpha"``
        echoes ``alpha``.
    """
    bst_dists = []
    wrst_dists = []

    trn_df, tst_df = data_func()
    for _ in range(n_exp):
        # Same sampling order as before: training frame first, then test.
        trn_bst, trn_wrst = sample_bst_wrst(trn_df, pop, n_samp)
        tst_bst, tst_wrst = sample_bst_wrst(tst_df, pop, n_samp)
        bst_dists.extend(get_dists(trn_bst["target"].values, tst_bst["Groundtruth"].values))
        wrst_dists.extend(get_dists(trn_wrst["target"].values, tst_wrst["Groundtruth"].values))

    best = _summarize_dists(bst_dists, alpha)
    worst = _summarize_dists(wrst_dists, alpha)

    results = {
        "best": best,
        "worst": worst,
        "overlap": ci_overlap(*best["ci"], *worst["ci"]),
        "ci_alpha": alpha,
    }

    return results

In [9]:
import json

def data_snooping_analysis(output_path, n_exp=30, pop=1_000, n_samp=100):
    """Run the overlap experiments for every dataset and save each result as JSON.

    Args:
        output_path: Directory (``str`` or ``Path``) that receives one
            ``<name>_results.json`` file per dataset. It is created,
            including missing parents, if it does not exist yet.
        n_exp: Number of sampling repetitions per dataset.
        pop: Size of the head/tail pools to sample from.
        n_samp: Rows drawn from each pool per repetition.
    """
    datasets = [
        ("agab", get_agab_dfs), ("agraw", get_agraw_dfs),
        ("bfsm", get_bfsm_dfs), ("bfmed", get_bfmed_dfs),
        ("codesum", get_codesum_dfs), ("muts", get_muts_dfs)
    ]

    # Create the destination before any computation starts; otherwise a
    # missing directory only surfaces at the first write, after a full
    # dataset's worth of experiments has already been run.
    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    for name, ds in datasets:
        results = run_experiments(ds, n_exp=n_exp, pop=pop, n_samp=n_samp)
        with open(output_path/f"{name}_results.json", 'w') as json_file:
            json.dump(results, json_file)

In [10]:
%%time
# Run the analysis for all six datasets with 30 repetitions each, sampling
# 100 rows from the first/last 1,000 rows. One <name>_results.json file per
# dataset is written under overlap_results/ (relative to the working directory).
# The recorded run shown below took roughly 25 minutes of wall time.
output_path = Path("overlap_results")
data_snooping_analysis(output_path, n_exp=30, pop=1_000, n_samp=100)

CPU times: user 25min 16s, sys: 58.7 s, total: 26min 14s
Wall time: 25min 35s
