# Task Testsets

In [1]:
from argparse import Namespace
import os
import sys
from itertools import permutations, chain
from functools import partial
import json
from multiprocessing import Pool, cpu_count
import logging
from glob import glob
import re

from datasets import load_dataset
import numpy as np
from tqdm.auto import tqdm
import nltk
from nltk import ngrams
import pandas as pd

In [2]:
logging.getLogger("datasets.builder").setLevel(logging.ERROR)

In [None]:
nltk.download("punkt")

def tokenize(s):
    return [
        word.lower() for word in
        nltk.word_tokenize(s)
        if word.isalnum()
    ]

In [4]:
def get_n_gram_value(examples, example_to_tokens_f):
    n_tokens = []
    for _, example in tqdm(enumerate(examples), total=len(examples)):
        tokens = example_to_tokens_f(example)
        for ex in tokens:
            n_tokens.append(len(ex))
    return min(max(np.percentile(n_tokens, 5, interpolation="linear"), 8), 13)

In [5]:
def dataset_to_ngrams(n, examples, example_to_tokens_f):
    n_grams = set()
    # Sacrifice memory for speed
    # I could use OrderedSet but its 5x slower to search in!
    ngrams_in_example = dict()
    example_in_ngram = dict()
    for idx, example in tqdm(enumerate(examples), total=len(examples)):
        for tokens in example_to_tokens_f(example):
            for ngram in ngrams(tokens, n):
                if ngram not in example_in_ngram:
                    example_in_ngram[ngram] = []
                if idx not in example_in_ngram:
                    ngrams_in_example[idx] = []
                ngrams_in_example[idx].append(ngram)
                if idx not in example_in_ngram[ngram]:
                    example_in_ngram[ngram].append(idx)
                n_grams.add(ngram)
    return n_grams, ngrams_in_example, example_in_ngram

In [6]:
def arc_example_to_tokens(example):
    tokens = []
    for i in range(len(example["choices"]["label"])):
        tokens.append(tokenize(example["question"]) + tokenize(example["choices"]["text"][i]))
    return tokens

In [7]:
def csqa_example_to_tokens(example):
    tokens = []
    for i in range(len(example["choices"]["label"])):
        tokens.append(tokenize(example["question"]) + tokenize(example["choices"]["text"][i]))
    return tokens

In [8]:
def arct_example_to_tokens(example):
    tokens = []
    for i in range(2):
        for p_one, p_two, p_three in permutations(["reason", "claim", f"warrant{i}"]):
            tokens.append(tokenize(example[p_one]) + tokenize(example[p_two]) + tokenize(example[p_three]))
    return tokens

In [9]:
def piqa_example_to_tokens(example):
    tokens = []
    for i in range(1, 3):
        tokens.append(tokenize(example["goal"]) + tokenize(example[f"sol{i}"]))
    return tokens

In [10]:
arc = load_dataset("ai2_arc", "ARC-Challenge")
arc_n_gram = int(get_n_gram_value(arc["test"], arc_example_to_tokens))
print(f"N value: {arc_n_gram}")
arc_ngrams, arc_ngram_ex, arc_ex_ngram = dataset_to_ngrams(arc_n_gram, arc["test"], arc_example_to_tokens)
print(f"# ngrams in arc: {len(arc_ngrams)}")
arc_dirty_hits = pd.concat([
    pd.DataFrame(data={"example_id": [i], "BookCorpus_hits": [0], "ccnews_hits": [0], "openwebtext_hits": [0], "stories_hits": [0], "wikipedia_hits": [0], "atomic_hits": [0]})
    for i in range(len(arc["test"]))
]).reset_index(drop=True)
arc_df_and_lookup = (arc_dirty_hits, arc_ex_ngram)

HBox(children=(FloatProgress(value=0.0, max=1172.0), HTML(value='')))


N value: 10


HBox(children=(FloatProgress(value=0.0, max=1172.0), HTML(value='')))


# ngrams in arc: 35453


In [11]:
csqa = load_dataset("commonsense_qa")
csqa_n_gram = int(get_n_gram_value(csqa["validation"], csqa_example_to_tokens))
print(f"N value: {csqa_n_gram}")
csqa_ngrams, csqa_ngram_ex, csqa_ex_ngram = dataset_to_ngrams(csqa_n_gram, csqa["validation"], csqa_example_to_tokens)
print(f"# ngrams in csqa: {len(csqa_ngrams)}")
csqa_dirty_hits = pd.concat([
    pd.DataFrame(data={"example_id": [i], "BookCorpus_hits": [0], "ccnews_hits": [0], "openwebtext_hits": [0], "stories_hits": [0], "wikipedia_hits": [0], "atomic_hits": [0]})
    for i in range(len(csqa["validation"]))
]).reset_index(drop=True)
csqa_df_and_lookup = (csqa_dirty_hits, csqa_ex_ngram)

HBox(children=(FloatProgress(value=0.0, max=1221.0), HTML(value='')))


N value: 8


HBox(children=(FloatProgress(value=0.0, max=1221.0), HTML(value='')))


# ngrams in csqa: 16243


In [12]:
arct = list(map(lambda x: x[1], pd.read_csv(os.path.join("..", "data", "arct", "test.csv"), sep="\t").iterrows()))
arct_n_gram = int(get_n_gram_value(arct, arct_example_to_tokens))
print(f"N value: {arct_n_gram}")
arct_ngrams, arct_ngram_ex, arct_ex_ngram = dataset_to_ngrams(arct_n_gram, arct, arct_example_to_tokens)
print(f"# ngrams in arct: {len(arct_ngrams)}")
arct_dirty_hits = pd.concat([
    pd.DataFrame(data={"example_id": [i], "BookCorpus_hits": [0], "ccnews_hits": [0], "openwebtext_hits": [0], "stories_hits": [0], "wikipedia_hits": [0], "atomic_hits": [0]})
    for i in range(len(arct))
]).reset_index(drop=True)
arct_df_and_lookup = (arct_dirty_hits, arct_ex_ngram)

HBox(children=(FloatProgress(value=0.0, max=888.0), HTML(value='')))


N value: 13


HBox(children=(FloatProgress(value=0.0, max=888.0), HTML(value='')))


# ngrams in arct: 46341


In [13]:
with open(os.path.join("..", "data", "PIQA", "valid.jsonl") as fr:
    piqa = list(map(lambda line: json.loads(line), fr))
piqa_n_gram = int(get_n_gram_value(piqa, piqa_example_to_tokens))
print(f"N value: {piqa_n_gram}")
piqa_ngrams, piqa_ngram_ex, piqa_ex_ngram = dataset_to_ngrams(piqa_n_gram, piqa, piqa_example_to_tokens)
print(f"# ngrams in piqa: {len(piqa_ngrams)}")
piqa_dirty_hits = pd.concat([
    pd.DataFrame(data={"example_id": [i], "BookCorpus_hits": [0], "ccnews_hits": [0], "openwebtext_hits": [0], "stories_hits": [0], "wikipedia_hits": [0], "atomic_hits": [0]})
    for i in range(len(piqa))
]).reset_index(drop=True)
piqa_df_and_lookup = (piqa_dirty_hits, piqa_ex_ngram)

HBox(children=(FloatProgress(value=0.0, max=1838.0), HTML(value='')))


N value: 8


HBox(children=(FloatProgress(value=0.0, max=1838.0), HTML(value='')))


# ngrams in piqa: 47257


In [14]:
testset_ngrams = arc_ngrams.union(csqa_ngrams).union(arct_ngrams).union(piqa_ngrams)
len(testset_ngrams)

145294

In [15]:
ns = list(set([arc_n_gram, csqa_n_gram, arct_n_gram, piqa_n_gram]))
ns

[8, 10, 13]

In [16]:
tasks_df_and_lookup = [
    arc_df_and_lookup,
    csqa_df_and_lookup,
    arct_df_and_lookup,
    piqa_df_and_lookup,
]

# OpenWebText

In [None]:
ds = load_dataset("openwebtext")

In [17]:
def check_contamination_openwebtext(args, testset_ngrams=None, ns=None):
    dirty_hits = {ngram: 0 for ngram in testset_ngrams}
    ds = load_dataset("openwebtext")
    idxs, n_worker = args
    # cos jupyterlab is dumb
    print(' ', end='', flush=True)
    for idx, ds_idx in tqdm(enumerate(idxs), total=len(idxs), position=n_worker+1, leave=False, desc=f"Worker #{n_worker}"):
        text = tokenize(ds["train"][int(ds_idx)]["text"])
        for n in ns:
            for ngram in ngrams(text, n):
                if ngram in testset_ngrams:
                    dirty_hits[ngram] += 1
    print("Concating now")
    return pd.concat(
        [
            pd.DataFrame(data={"ngram": [k], "dirty_count": [v]})
            for k, v in dirty_hits.items()
        ]
    )

In [None]:
# 32 else race condition
n_cores = 32 #cpu_count()
pool = Pool()
results_df = pd.concat(
    tqdm(
        pool.imap_unordered(
            partial(
                check_contamination_openwebtext, testset_ngrams=testset_ngrams, ns=ns
            ),
            zip(np.array_split(range(len(ds["train"])), n_cores), list(range(n_cores)))
        ),
        total=n_cores,
        leave=False,
        position=0,
        desc="Global progress"
    ),
    ignore_index=True,
)
pool.close()
pool.join()

In [None]:
results_df

In [None]:
agg_results_df = results_df.groupby(["ngram"], as_index=False).agg({"dirty_count": "sum"})
agg_results_df

In [None]:
agg_results_df.to_csv("contamination_openwebtext.tsv", sep="\t", index=False)

In [19]:
agg_results_df = pd.read_csv("contamination_openwebtext.tsv", sep="\t")
agg_results_df["ngram"] = agg_results_df["ngram"].apply(lambda x: eval(x))

In [20]:
agg_results_df.sort_values("dirty_count", ascending=False)

Unnamed: 0,ngram,dirty_count
85437,"(oil, in, a, large, skillet, over, medium, heat)",69
59383,"(in, a, large, skillet, over, medium, heat, add)",62
123931,"(to, a, boil, then, reduce, the, heat, to)",46
21926,"(bring, to, a, boil, then, reduce, the, heat)",39
85566,"(olive, oil, in, a, large, skillet, over, medium)",37
...,...,...
48472,"(furniture, and, put, every, piece, together, ...",0
48471,"(fur, would, be, best, fit, for, which, type, ...",0
48470,"(fur, what, have, you, done, lots, of, attention)",0
48469,"(fur, this, is, an, example, of, an, animal, r...",0


In [21]:
(agg_results_df["dirty_count"] > 0).sum()

355

In [22]:
openwebtext_dirty_hits = agg_results_df[agg_results_df["dirty_count"] > 0].reset_index(drop=True)
openwebtext_dirty_hits

Unnamed: 0,ngram,dirty_count
0,"(10, to, 15, minutes, to, stiffen, it, up)",1
1,"(20, feet, away, for, 20, seconds, every, 20)",3
2,"(20, minutes, or, until, a, toothpick, inserte...",9
3,"(4, hours, or, overnight, when, ready, to, serve)",2
4,"(a, boil, reduce, the, heat, to, a, simmer)",4
...,...,...
350,"(you, can, use, a, hot, glue, gun, to)",3
351,"(you, can, use, a, pair, of, pliers, to)",1
352,"(you, want, to, learn, about, the, world, and)",1
353,"(your, computer, when, you, are, not, using, it)",1


In [23]:
for i, row in tqdm(openwebtext_dirty_hits.iterrows(), total=len(openwebtext_dirty_hits)):
    for dirty_hit_df, lookup in tasks_df_and_lookup:
        if row.ngram in lookup:
            for ex_num in lookup[row.ngram]:
                dirty_hit_df.at[ex_num, "openwebtext_hits"] += row.dirty_count

HBox(children=(FloatProgress(value=0.0, max=355.0), HTML(value='')))




In [24]:
arc_ex_dirty_num = (
    (arc_dirty_hits["BookCorpus_hits"] > 0) | (arc_dirty_hits["ccnews_hits"] > 0) | (arc_dirty_hits["openwebtext_hits"] > 0) | (arc_dirty_hits["stories_hits"] > 0) | (arc_dirty_hits["wikipedia_hits"] > 0)
).sum()
csqa_ex_dirty_num = (
    (csqa_dirty_hits["BookCorpus_hits"] > 0) | (csqa_dirty_hits["ccnews_hits"] > 0) | (csqa_dirty_hits["openwebtext_hits"] > 0) | (csqa_dirty_hits["stories_hits"] > 0) | (csqa_dirty_hits["wikipedia_hits"] > 0)
).sum()
arct_ex_dirty_num = (
    (arct_dirty_hits["BookCorpus_hits"] > 0) | (arct_dirty_hits["ccnews_hits"] > 0) | (arct_dirty_hits["openwebtext_hits"] > 0) | (arct_dirty_hits["stories_hits"] > 0) | (arct_dirty_hits["wikipedia_hits"] > 0)
).sum()
piqa_ex_dirty_num = (
    (piqa_dirty_hits["BookCorpus_hits"] > 0) | (piqa_dirty_hits["ccnews_hits"] > 0) | (piqa_dirty_hits["openwebtext_hits"] > 0) | (piqa_dirty_hits["stories_hits"] > 0) | (piqa_dirty_hits["wikipedia_hits"] > 0)
).sum()

print("Statistics after openwebtext")
print(f"ARC: {arc_ex_dirty_num}/{len(arc_dirty_hits)} ({(arc_ex_dirty_num/len(arc_dirty_hits)) * 100}%)")
print(f"CSQA: {csqa_ex_dirty_num}/{len(csqa_dirty_hits)} ({(csqa_ex_dirty_num/len(csqa_dirty_hits)) * 100}%)")
print(f"ARCT: {arct_ex_dirty_num}/{len(arct_dirty_hits)} ({(arct_ex_dirty_num/len(arct_dirty_hits)) * 100}%)")
print(f"PIQA: {piqa_ex_dirty_num}/{len(piqa_dirty_hits)} ({(piqa_ex_dirty_num/len(piqa_dirty_hits)) * 100}%)")

Statistics after openwebtext
ARC: 10/1172 (0.8532423208191127%)
CSQA: 32/1221 (2.620802620802621%)
ARCT: 0/888 (0.0%)
PIQA: 124/1838 (6.746463547334058%)


# STORIES

In [None]:
# downloaded from https://drive.google.com/drive/u/1/folders/1yZzwaV8LO1hK8ChIm0sxazXF8BSIZ683
PATH_TO_STORIES = ""
stories_files = glob(PATH_TO_STORIES)
len(stories_files)

In [25]:
def check_contamination_stories(args, testset_ngrams=None, ns=None):
    dirty_hits = {ngram: 0 for ngram in testset_ngrams}
    files, n_worker = args
    # cos jupyterlab is dumb
    print(' ', end='', flush=True)
    for idx, file_name in tqdm(enumerate(files), total=len(files), position=n_worker+1, leave=False, desc=f"Worker #{n_worker}"):
        with open(file_name, "rt") as f:
            text = tokenize(f.read())
        for n in ns:
            for ngram in ngrams(text, n):
                if ngram in testset_ngrams:
                    dirty_hits[ngram] += 1
    print("Concating now")
    return pd.concat(
        [
            pd.DataFrame(data={"ngram": [k], "dirty_count": [v]})
            for k, v in dirty_hits.items()
        ]
    )

In [None]:
n_cores = 56
pool = Pool()
results_df = pd.concat(
    tqdm(
        pool.imap_unordered(
            partial(
                check_contamination_stories, testset_ngrams=testset_ngrams, ns=ns
            ),
            zip(np.array_split(stories_files, n_cores), list(range(n_cores))),
        ),
        total=n_cores,
        leave=False,
        position=0,
        desc="Global progress",
    ),
    ignore_index=True,
)
pool.close()
pool.join()

In [None]:
agg_results_df = results_df.groupby(["ngram"], as_index=False).agg({"dirty_count": "sum"})
agg_results_df

In [None]:
agg_results_df.to_csv("contamination_stories.tsv", sep="\t", index=False)

In [26]:
agg_results_df = pd.read_csv("contamination_stories.tsv", sep="\t")
agg_results_df["ngram"] = agg_results_df["ngram"].apply(lambda x: eval(x))

In [27]:
agg_results_df.sort_values("dirty_count", ascending=False)

Unnamed: 0,ngram,dirty_count
134319,"(was, going, to, grow, up, to, be, a)",67
85437,"(oil, in, a, large, skillet, over, medium, heat)",58
59383,"(in, a, large, skillet, over, medium, heat, add)",52
136406,"(what, do, you, do, when, you, need, to)",45
85566,"(olive, oil, in, a, large, skillet, over, medium)",40
...,...,...
48462,"(fur, of, a, dog, with, light, colored, fur)",0
48461,"(fur, is, dominant, to, white, fur, in, guinea...",0
48460,"(fur, in, guinea, pigs, if, two, black, guinea...",0
48459,"(funnel, that, you, washed, out, til, you, need)",0


In [28]:
(agg_results_df["dirty_count"] > 0).sum()

297

In [29]:
stories_dirty_hits = agg_results_df[agg_results_df["dirty_count"] > 0].reset_index(drop=True)
stories_dirty_hits

Unnamed: 0,ngram,dirty_count
0,"(15, to, 20, minutes, or, until, a, toothpick)",3
1,"(2, to, 3, minutes, until, soft, add, the)",1
2,"(20, minutes, or, until, a, toothpick, inserte...",20
3,"(4, hours, or, overnight, when, ready, to, serve)",1
4,"(5, minutes, stirring, constantly, remove, fro...",5
...,...,...
292,"(you, have, to, do, in, order, to, finish)",1
293,"(you, have, to, read, a, book, that, is)",1
294,"(you, need, a, map, to, find, your, way)",5
295,"(you, want, to, kill, someone, you, can, do)",1


In [30]:
for i, row in tqdm(stories_dirty_hits.iterrows(), total=len(stories_dirty_hits)):
    for dirty_hit_df, lookup in tasks_df_and_lookup:
        if row.ngram in lookup:
            for ex_num in lookup[row.ngram]:
                dirty_hit_df.at[ex_num, "stories_hits"] += row.dirty_count

HBox(children=(FloatProgress(value=0.0, max=297.0), HTML(value='')))




In [31]:
arc_ex_dirty_num = (
    (arc_dirty_hits["BookCorpus_hits"] > 0) | (arc_dirty_hits["ccnews_hits"] > 0) | (arc_dirty_hits["openwebtext_hits"] > 0) | (arc_dirty_hits["stories_hits"] > 0) | (arc_dirty_hits["wikipedia_hits"] > 0)
).sum()
csqa_ex_dirty_num = (
    (csqa_dirty_hits["BookCorpus_hits"] > 0) | (csqa_dirty_hits["ccnews_hits"] > 0) | (csqa_dirty_hits["openwebtext_hits"] > 0) | (csqa_dirty_hits["stories_hits"] > 0) | (csqa_dirty_hits["wikipedia_hits"] > 0)
).sum()
arct_ex_dirty_num = (
    (arct_dirty_hits["BookCorpus_hits"] > 0) | (arct_dirty_hits["ccnews_hits"] > 0) | (arct_dirty_hits["openwebtext_hits"] > 0) | (arct_dirty_hits["stories_hits"] > 0) | (arct_dirty_hits["wikipedia_hits"] > 0)
).sum()
piqa_ex_dirty_num = (
    (piqa_dirty_hits["BookCorpus_hits"] > 0) | (piqa_dirty_hits["ccnews_hits"] > 0) | (piqa_dirty_hits["openwebtext_hits"] > 0) | (piqa_dirty_hits["stories_hits"] > 0) | (piqa_dirty_hits["wikipedia_hits"] > 0)
).sum()

print("Statistics after stories")
print(f"ARC: {arc_ex_dirty_num}/{len(arc_dirty_hits)} ({(arc_ex_dirty_num/len(arc_dirty_hits)) * 100}%)")
print(f"CSQA: {csqa_ex_dirty_num}/{len(csqa_dirty_hits)} ({(csqa_ex_dirty_num/len(csqa_dirty_hits)) * 100}%)")
print(f"ARCT: {arct_ex_dirty_num}/{len(arct_dirty_hits)} ({(arct_ex_dirty_num/len(arct_dirty_hits)) * 100}%)")
print(f"PIQA: {piqa_ex_dirty_num}/{len(piqa_dirty_hits)} ({(piqa_ex_dirty_num/len(piqa_dirty_hits)) * 100}%)")

Statistics after stories
ARC: 10/1172 (0.8532423208191127%)
CSQA: 53/1221 (4.340704340704341%)
ARCT: 0/888 (0.0%)
PIQA: 173/1838 (9.412404787812841%)


# BookCorpus

In [None]:
# Downloaded from https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz

PATH_TO_BOOKS = ""
book_files = glob(PATH_TO_BOOKS)
len(book_files)

In [32]:
def check_contamination_books(args, testset_ngrams=None, ns=None):
    dirty_hits = {ngram: 0 for ngram in testset_ngrams}
    files, n_worker = args
    # cos jupyterlab is dumb
    print(' ', end='', flush=True)
    for idx, file_name in tqdm(enumerate(files), total=len(files), position=n_worker+1, leave=False, desc=f"Worker #{n_worker}"):
        with open(file_name, "rt") as f:
            text = tokenize(f.read())
        for n in ns:
            for ngram in ngrams(text, n):
                if ngram in testset_ngrams:
                    dirty_hits[ngram] += 1
    print("Concating now")
    return pd.concat(
        [
            pd.DataFrame(data={"ngram": [k], "dirty_count": [v]})
            for k, v in dirty_hits.items()
        ]
    )

In [None]:
n_cores = 56
pool = Pool()
results_df = pd.concat(
    tqdm(
        pool.imap_unordered(
            partial(
                check_contamination_books, testset_ngrams=testset_ngrams, ns=ns
            ),
            zip(np.array_split(book_files, n_cores), list(range(n_cores))),
        ),
        total=n_cores,
        leave=False,
        position=0,
        desc="Global progress",
    ),
    ignore_index=True,
)
pool.close()
pool.join()

In [None]:
agg_results_df = results_df.groupby(["ngram"], as_index=False).agg({"dirty_count": "sum"})
agg_results_df

In [None]:
agg_results_df.to_csv("contamination_bookscorpus.tsv", sep="\t", index=False)

In [33]:
agg_results_df = pd.read_csv("contamination_bookscorpus.tsv", sep="\t")
agg_results_df["ngram"] = agg_results_df["ngram"].apply(lambda x: eval(x))

In [34]:
agg_results_df.sort_values("dirty_count", ascending=False)

Unnamed: 0,ngram,dirty_count
65174,"(is, the, best, way, to, get, rid, of)",9
112621,"(that, way, you, do, have, to, worry, about)",8
56706,"(how, do, you, say, i, love, you, in)",6
31879,"(could, hear, him, over, the, sound, of, the)",5
3171,"(a, hole, in, the, wall, the, size, of)",5
...,...,...
48432,"(funded, with, tax, money, this, education, sh...",0
48431,"(funded, with, tax, money, this, does, not, ha...",0
48430,"(funded, with, tax, money, this, does, goes, b...",0
48429,"(funded, with, tax, money, funding, with, tax,...",0


In [35]:
(agg_results_df["dirty_count"] > 0).sum()

72

In [36]:
books_dirty_hits = agg_results_df[agg_results_df["dirty_count"] > 0].reset_index(drop=True)
books_dirty_hits

Unnamed: 0,ngram,dirty_count
0,"(a, dream, of, hers, for, a, long, time)",1
1,"(a, hole, in, the, wall, the, size, of)",5
2,"(a, loaf, of, bread, with, a, jar, of)",1
3,"(about, the, same, thing, over, and, over, again)",4
4,"(along, the, seam, at, the, bottom, of, the)",1
...,...,...
67,"(what, will, you, do, if, you, do, not)",1
68,"(where, is, a, good, place, to, put, a)",1
69,"(where, will, you, go, when, you, die, heaven)",2
70,"(will, take, a, certain, amount, of, time, to)",1


In [37]:
for i, row in tqdm(books_dirty_hits.iterrows(), total=len(books_dirty_hits)):
    for dirty_hit_df, lookup in tasks_df_and_lookup:
        if row.ngram in lookup:
            for ex_num in lookup[row.ngram]:
                dirty_hit_df.at[ex_num, "BookCorpus_hits"] += row.dirty_count

HBox(children=(FloatProgress(value=0.0, max=72.0), HTML(value='')))




In [38]:
arc_ex_dirty_num = (
    (arc_dirty_hits["BookCorpus_hits"] > 0) | (arc_dirty_hits["ccnews_hits"] > 0) | (arc_dirty_hits["openwebtext_hits"] > 0) | (arc_dirty_hits["stories_hits"] > 0) | (arc_dirty_hits["wikipedia_hits"] > 0)
).sum()
csqa_ex_dirty_num = (
    (csqa_dirty_hits["BookCorpus_hits"] > 0) | (csqa_dirty_hits["ccnews_hits"] > 0) | (csqa_dirty_hits["openwebtext_hits"] > 0) | (csqa_dirty_hits["stories_hits"] > 0) | (csqa_dirty_hits["wikipedia_hits"] > 0)
).sum()
arct_ex_dirty_num = (
    (arct_dirty_hits["BookCorpus_hits"] > 0) | (arct_dirty_hits["ccnews_hits"] > 0) | (arct_dirty_hits["openwebtext_hits"] > 0) | (arct_dirty_hits["stories_hits"] > 0) | (arct_dirty_hits["wikipedia_hits"] > 0)
).sum()
piqa_ex_dirty_num = (
    (piqa_dirty_hits["BookCorpus_hits"] > 0) | (piqa_dirty_hits["ccnews_hits"] > 0) | (piqa_dirty_hits["openwebtext_hits"] > 0) | (piqa_dirty_hits["stories_hits"] > 0) | (piqa_dirty_hits["wikipedia_hits"] > 0)
).sum()

print("Statistics after BookCorpus")
print(f"ARC: {arc_ex_dirty_num}/{len(arc_dirty_hits)} ({(arc_ex_dirty_num/len(arc_dirty_hits)) * 100}%)")
print(f"CSQA: {csqa_ex_dirty_num}/{len(csqa_dirty_hits)} ({(csqa_ex_dirty_num/len(csqa_dirty_hits)) * 100}%)")
print(f"ARCT: {arct_ex_dirty_num}/{len(arct_dirty_hits)} ({(arct_ex_dirty_num/len(arct_dirty_hits)) * 100}%)")
print(f"PIQA: {piqa_ex_dirty_num}/{len(piqa_dirty_hits)} ({(piqa_ex_dirty_num/len(piqa_dirty_hits)) * 100}%)")

Statistics after BookCorpus
ARC: 10/1172 (0.8532423208191127%)
CSQA: 55/1221 (4.504504504504505%)
ARCT: 0/888 (0.0%)
PIQA: 177/1838 (9.630032644178455%)


# Wikipedia En

In [None]:
ds = load_dataset('wikipedia', '20200501.en', split="train")

In [39]:
def check_contamination_wikipedia(args, testset_ngrams=None, ns=None):
    dirty_hits = {ngram: 0 for ngram in testset_ngrams}
    ds = load_dataset('wikipedia', '20200501.en', split="train")
    idxs, n_worker = args
    # cos jupyterlab is dumb
    print(' ', end='', flush=True)
    for idx, ds_idx in tqdm(enumerate(idxs), total=len(idxs), position=n_worker+1, leave=False, desc=f"Worker #{n_worker}"):
        text = tokenize(ds[int(ds_idx)]["text"])
        for n in ns:
            for ngram in ngrams(text, n):
                if ngram in testset_ngrams:
                    dirty_hits[ngram] += 1
    print("Concating now")
    return pd.concat(
        [
            pd.DataFrame(data={"ngram": [k], "dirty_count": [v]})
            for k, v in dirty_hits.items()
        ]
    )

In [None]:
# 32 else race condition
n_cores = 32
pool = Pool()
results_df = pd.concat(
    tqdm(
        pool.imap_unordered(
            partial(
                check_contamination_wikipedia, testset_ngrams=testset_ngrams, ns=ns
            ),
            zip(np.array_split(range(len(ds)), n_cores), list(range(n_cores)))
        ),
        total=n_cores,
        leave=False,
        position=0,
        desc="Global progress"
    ),
    ignore_index=True,
)
pool.close()
pool.join()

In [None]:
results_df

In [None]:
agg_results_df = results_df.groupby(["ngram"], as_index=False).agg({"dirty_count": "sum"})
agg_results_df

In [None]:
agg_results_df.to_csv("contamination_wikipedia.tsv", sep="\t", index=False)

In [40]:
agg_results_df = pd.read_csv("contamination_wikipedia.tsv", sep="\t")
agg_results_df["ngram"] = agg_results_df["ngram"].apply(lambda x: eval(x))

In [41]:
agg_results_df.sort_values("dirty_count", ascending=False)

Unnamed: 0,ngram,dirty_count
43104,"(faint, constellation, in, the, southern, sky,...",2
63457,"(is, a, small, faint, constellation, in, the, ...",2
77237,"(mussels, and, other, shellfish, that, have, n...",2
30924,"(constellation, in, the, southern, sky, first,...",2
47228,"(french, astronomer, de, lacaille, its, name, ...",2
...,...,...
48433,"(funded, with, tax, money, this, education, sh...",0
48432,"(funded, with, tax, money, this, education, sh...",0
48431,"(funded, with, tax, money, this, does, not, ha...",0
48430,"(funded, with, tax, money, this, does, goes, b...",0


In [42]:
(agg_results_df["dirty_count"] > 0).sum()

46

In [43]:
wikipedia_dirty_hits = agg_results_df[agg_results_df["dirty_count"] > 0].reset_index(drop=True)
wikipedia_dirty_hits

Unnamed: 0,ngram,dirty_count
0,"(1756, by, the, french, astronomer, de, lacail...",1
1,"(a, hole, in, the, top, of, the, can)",1
2,"(a, magnet, when, the, magnet, is, moved, away...",1
3,"(a, small, faint, constellation, in, the, sout...",2
4,"(astronomer, de, lacaille, its, name, is, lati...",2
5,"(at, something, 20, feet, away, for, 20, seconds)",1
6,"(but, did, want, to, go, to, the, hospital)",1
7,"(by, the, french, astronomer, de, lacaille, it...",1
8,"(can, be, very, dangerous, and, can, lead, to)",1
9,"(circinus, is, a, small, faint, constellation,...",1


In [44]:
for i, row in tqdm(wikipedia_dirty_hits.iterrows(), total=len(wikipedia_dirty_hits)):
    for dirty_hit_df, lookup in tasks_df_and_lookup:
        if row.ngram in lookup:
            for ex_num in lookup[row.ngram]:
                dirty_hit_df.at[ex_num, "wikipedia_hits"] += row.dirty_count

HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))




In [45]:
arc_ex_dirty_num = (
    (arc_dirty_hits["BookCorpus_hits"] > 0) | (arc_dirty_hits["ccnews_hits"] > 0) | (arc_dirty_hits["openwebtext_hits"] > 0) | (arc_dirty_hits["stories_hits"] > 0) | (arc_dirty_hits["wikipedia_hits"] > 0)
).sum()
csqa_ex_dirty_num = (
    (csqa_dirty_hits["BookCorpus_hits"] > 0) | (csqa_dirty_hits["ccnews_hits"] > 0) | (csqa_dirty_hits["openwebtext_hits"] > 0) | (csqa_dirty_hits["stories_hits"] > 0) | (csqa_dirty_hits["wikipedia_hits"] > 0)
).sum()
arct_ex_dirty_num = (
    (arct_dirty_hits["BookCorpus_hits"] > 0) | (arct_dirty_hits["ccnews_hits"] > 0) | (arct_dirty_hits["openwebtext_hits"] > 0) | (arct_dirty_hits["stories_hits"] > 0) | (arct_dirty_hits["wikipedia_hits"] > 0)
).sum()
piqa_ex_dirty_num = (
    (piqa_dirty_hits["BookCorpus_hits"] > 0) | (piqa_dirty_hits["ccnews_hits"] > 0) | (piqa_dirty_hits["openwebtext_hits"] > 0) | (piqa_dirty_hits["stories_hits"] > 0) | (piqa_dirty_hits["wikipedia_hits"] > 0)
).sum()

print("Statistics after Wikipedia")
print(f"ARC: {arc_ex_dirty_num}/{len(arc_dirty_hits)} ({(arc_ex_dirty_num/len(arc_dirty_hits)) * 100}%)")
print(f"CSQA: {csqa_ex_dirty_num}/{len(csqa_dirty_hits)} ({(csqa_ex_dirty_num/len(csqa_dirty_hits)) * 100}%)")
print(f"ARCT: {arct_ex_dirty_num}/{len(arct_dirty_hits)} ({(arct_ex_dirty_num/len(arct_dirty_hits)) * 100}%)")
print(f"PIQA: {piqa_ex_dirty_num}/{len(piqa_dirty_hits)} ({(piqa_ex_dirty_num/len(piqa_dirty_hits)) * 100}%)")

Statistics after Wikipedia
ARC: 11/1172 (0.938566552901024%)
CSQA: 56/1221 (4.586404586404586%)
ARCT: 0/888 (0.0%)
PIQA: 180/1838 (9.793253536452665%)


# CC-News

In [None]:
# Use crawl_ccnews.py
PATH_TO_CCNEWS = ""

cc_files = glob(os.path.join(PATH_TO_CCNEWS,"*", "*.json"), recursive=True)
len(cc_files)

In [None]:
def chunkify(a, batch_size=2):
    for i in range(a.shape[0] // batch_size):
        yield a[batch_size * i: batch_size * (i + 1)]

In [None]:
def check_contamination_ccnews(args, testset_ngrams=None, ns=None):
    dirty_hits = {ngram: 0 for ngram in testset_ngrams}
    files, n_worker = args
    batch_size = 1500
    # cos jupyterlab is dumb
    log_file = None
    print(' ', end='', flush=True)
    for _files in tqdm(chunkify(files, batch_size), total=len(files) // batch_size, position=n_worker+1, leave=False, desc=f"Worker #{n_worker}"):
        js = []
        for file_name in _files:
            try:
                with open(file_name, "rt") as f:
                    js.append(json.load(f))
            except Exception as e:
                if log_file is None:
                    log_file = open(f"error_worker#{n_worker}.log", "wt")
                print(f"{file_name}\t{repr(e)}", file=log_file, end="\n")

        for j in js:
            if j["maintext"] is not None:
                text = tokenize(j["maintext"])
                for n in ns:
                    for ngram in ngrams(text, n):
                        if ngram in testset_ngrams:
                            dirty_hits[ngram] += 1

    if log_file is not None:
        log_file.close()

    print("Concating now")
    return pd.concat(
        [
            pd.DataFrame(data={"ngram": [k], "dirty_count": [v]})
            for k, v in dirty_hits.items()
        ]
    )


In [None]:
n_cores = 88
pool = Pool()
results_df = pd.concat(
    tqdm(
        pool.imap_unordered(
            partial(
                check_contamination_ccnews, testset_ngrams=testset_ngrams, ns=ns
            ),
            zip(np.array_split(cc_files, n_cores), list(range(n_cores))),
        ),
        total=n_cores,
        leave=False,
        position=0,
        desc="Global progress",
    ),
    ignore_index=True,
)
pool.close()
pool.join()

In [None]:
agg_results_df = results_df.groupby(["ngram"], as_index=False).agg({"dirty_count": "sum"})
agg_results_df

In [None]:
agg_results_df.to_csv("contamination_ccnews.tsv", sep="\t", index=False)

In [46]:
agg_results_df = pd.read_csv("contamination_ccnews.tsv", sep="\t")
agg_results_df["ngram"] = agg_results_df["ngram"].apply(lambda x: eval(x))

In [47]:
agg_results_df.sort_values("dirty_count", ascending=False)

Unnamed: 0,ngram,dirty_count
59383,"(in, a, large, skillet, over, medium, heat, add)",264
85437,"(oil, in, a, large, skillet, over, medium, heat)",181
123931,"(to, a, boil, then, reduce, the, heat, to)",120
12396,"(and, season, to, taste, with, salt, and, pepper)",114
11731,"(and, lemon, juice, season, with, salt, and, p...",112
...,...,...
48520,"(g, of, water, in, a, cup, another, group, of,...",0
48519,"(g, of, table, salt, in, a, glass, of, water, ...",0
48518,"(g, of, strawberries, cup, 67, g, of, molasses)",0
48517,"(g, of, strawberries, cup, 67, g, of, granulated)",0


In [48]:
(agg_results_df["dirty_count"] > 0).sum()

595

In [49]:
ccnews_dirty_hits = agg_results_df[agg_results_df["dirty_count"] > 0].reset_index(drop=True)
ccnews_dirty_hits

Unnamed: 0,ngram,dirty_count
0,"(15, to, 20, minutes, or, until, a, toothpick)",5
1,"(2, tablespoons, butter, season, with, salt, a...",1
2,"(2, tsp, onion, powder, 1, tsp, sea, salt)",1
3,"(20, feet, away, for, 20, seconds, every, 20)",15
4,"(20, minutes, or, until, a, toothpick, inserte...",12
...,...,...
590,"(your, lash, line, each, night, with, a, and)",1
591,"(your, name, in, the, search, bar, on, the)",1
592,"(your, profile, picture, at, the, bottom, righ...",1
593,"(your, username, at, the, top, of, the, page)",2


In [50]:
for i, row in tqdm(ccnews_dirty_hits.iterrows(), total=len(ccnews_dirty_hits)):
    for dirty_hit_df, lookup in tasks_df_and_lookup:
        if row.ngram in lookup:
            for ex_num in lookup[row.ngram]:
                dirty_hit_df.at[ex_num, "ccnews_hits"] += row.dirty_count

HBox(children=(FloatProgress(value=0.0, max=595.0), HTML(value='')))




In [51]:
arc_ex_dirty_num = (
    (arc_dirty_hits["BookCorpus_hits"] > 0) | (arc_dirty_hits["ccnews_hits"] > 0) | (arc_dirty_hits["openwebtext_hits"] > 0) | (arc_dirty_hits["stories_hits"] > 0) | (arc_dirty_hits["wikipedia_hits"] > 0)
).sum()
csqa_ex_dirty_num = (
    (csqa_dirty_hits["BookCorpus_hits"] > 0) | (csqa_dirty_hits["ccnews_hits"] > 0) | (csqa_dirty_hits["openwebtext_hits"] > 0) | (csqa_dirty_hits["stories_hits"] > 0) | (csqa_dirty_hits["wikipedia_hits"] > 0)
).sum()
arct_ex_dirty_num = (
    (arct_dirty_hits["BookCorpus_hits"] > 0) | (arct_dirty_hits["ccnews_hits"] > 0) | (arct_dirty_hits["openwebtext_hits"] > 0) | (arct_dirty_hits["stories_hits"] > 0) | (arct_dirty_hits["wikipedia_hits"] > 0)
).sum()
piqa_ex_dirty_num = (
    (piqa_dirty_hits["BookCorpus_hits"] > 0) | (piqa_dirty_hits["ccnews_hits"] > 0) | (piqa_dirty_hits["openwebtext_hits"] > 0) | (piqa_dirty_hits["stories_hits"] > 0) | (piqa_dirty_hits["wikipedia_hits"] > 0)
).sum()

print("Statistics after CCNews")
print(f"ARC: {arc_ex_dirty_num}/{len(arc_dirty_hits)} ({(arc_ex_dirty_num/len(arc_dirty_hits)) * 100}%)")
print(f"CSQA: {csqa_ex_dirty_num}/{len(csqa_dirty_hits)} ({(csqa_ex_dirty_num/len(csqa_dirty_hits)) * 100}%)")
print(f"ARCT: {arct_ex_dirty_num}/{len(arct_dirty_hits)} ({(arct_ex_dirty_num/len(arct_dirty_hits)) * 100}%)")
print(f"PIQA: {piqa_ex_dirty_num}/{len(piqa_dirty_hits)} ({(piqa_ex_dirty_num/len(piqa_dirty_hits)) * 100}%)")

Statistics after CCNews
ARC: 14/1172 (1.1945392491467577%)
CSQA: 62/1221 (5.077805077805078%)
ARCT: 0/888 (0.0%)
PIQA: 243/1838 (13.2208922742111%)


# ATOMIC2020

In [None]:
PATH_TO_ATOMIC = ""
df = pd.read_csv(os.path.join(PATH_TO_ATOMIC, "train.tsv"), sep="\t", header=None, names=["head", "rel", "tail"])
df

In [None]:
df["tail"] = df["tail"].apply(lambda x: "nan" if isinstance(x, float) else x)
df

In [52]:
def check_contamination_atomic(args, testset_ngrams=None, ns=None):
    dirty_hits = {ngram: 0 for ngram in testset_ngrams}
    df, n_worker = args
    # cos jupyterlab is dumb
    print(' ', end='', flush=True)
    for idx, row in tqdm(df.iterrows(), total=len(df), position=n_worker+1, leave=False, desc=f"Worker #{n_worker}"):
        text = tokenize(" ".join([row["tail"], row["rel"], row["tail"]]))
        for n in ns:
            for ngram in ngrams(text, n):
                if ngram in testset_ngrams:
                    dirty_hits[ngram] += 1
    print("Concating now")
    return pd.concat(
        [
            pd.DataFrame(data={"ngram": [k], "dirty_count": [v]})
            for k, v in dirty_hits.items()
        ]
    )

In [None]:
n_cores = 16
pool = Pool()
results_df = pd.concat(
    tqdm(
        pool.imap_unordered(
            partial(
                check_contamination_atomic, testset_ngrams=testset_ngrams, ns=ns
            ),
            zip(np.array_split(df, n_cores), list(range(n_cores)))
        ),
        total=n_cores,
        leave=False,
        position=0,
        desc="Global progress"
    ),
    ignore_index=True,
)
pool.close()
pool.join()

In [None]:
results_df

In [None]:
agg_results_df = results_df.groupby(["ngram"], as_index=False).agg({"dirty_count": "sum"})
agg_results_df

In [None]:
agg_results_df.to_csv("contamination_atomic.tsv", sep="\t", index=False)

In [53]:
agg_results_df = pd.read_csv("contamination_atomic.tsv", sep="\t")
agg_results_df["ngram"] = agg_results_df["ngram"].apply(lambda x: eval(x))

In [54]:
agg_results_df.sort_values("dirty_count", ascending=False)

Unnamed: 0,ngram,dirty_count
0,"(1, 16, ounce, can, chocolate, flavored, syrup...",0
96865,"(reading, them, other, people, often, think, s...",0
96859,"(reading, them, comment, sections, have, faile...",0
96860,"(reading, them, comment, sections, have, not, ...",0
96861,"(reading, them, comment, sections, have, not, ...",0
...,...,...
48425,"(funded, with, tax, money, college, should, no...",0
48424,"(funded, with, tax, money, college, should, no...",0
48423,"(funded, with, tax, money, college, should, no...",0
48422,"(funded, with, tax, money, college, should, be...",0


In [55]:
(agg_results_df["dirty_count"] > 0).sum()

0

In [56]:
atomic_dirty_hits = agg_results_df[agg_results_df["dirty_count"] > 0].reset_index(drop=True)
atomic_dirty_hits

Unnamed: 0,ngram,dirty_count


In [57]:
for i, row in tqdm(atomic_dirty_hits.iterrows(), total=len(atomic_dirty_hits)):
    for dirty_hit_df, lookup in tasks_df_and_lookup:
        if row.ngram in lookup:
            for ex_num in lookup[row.ngram]:
                dirty_hit_df.at[ex_num, "atomic_hits"] += row.dirty_count

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [58]:
arc_ex_dirty_num = (
    (arc_dirty_hits["BookCorpus_hits"] > 0) | (arc_dirty_hits["ccnews_hits"] > 0) | (arc_dirty_hits["openwebtext_hits"] > 0) | (arc_dirty_hits["stories_hits"] > 0) | (arc_dirty_hits["wikipedia_hits"] > 0) | (arc_dirty_hits["atomic_hits"] > 0)
).sum()
csqa_ex_dirty_num = (
    (csqa_dirty_hits["BookCorpus_hits"] > 0) | (csqa_dirty_hits["ccnews_hits"] > 0) | (csqa_dirty_hits["openwebtext_hits"] > 0) | (csqa_dirty_hits["stories_hits"] > 0) | (csqa_dirty_hits["wikipedia_hits"] > 0) | (csqa_dirty_hits["atomic_hits"] > 0)
).sum()
arct_ex_dirty_num = (
    (arct_dirty_hits["BookCorpus_hits"] > 0) | (arct_dirty_hits["ccnews_hits"] > 0) | (arct_dirty_hits["openwebtext_hits"] > 0) | (arct_dirty_hits["stories_hits"] > 0) | (arct_dirty_hits["wikipedia_hits"] > 0) | (arct_dirty_hits["atomic_hits"] > 0)
).sum()
piqa_ex_dirty_num = (
    (piqa_dirty_hits["BookCorpus_hits"] > 0) | (piqa_dirty_hits["ccnews_hits"] > 0) | (piqa_dirty_hits["openwebtext_hits"] > 0) | (piqa_dirty_hits["stories_hits"] > 0) | (piqa_dirty_hits["wikipedia_hits"] > 0) | (piqa_dirty_hits["atomic_hits"] > 0)
).sum()

print("Statistics after ATOMIC")
print(f"ARC: {arc_ex_dirty_num}/{len(arc_dirty_hits)} ({(arc_ex_dirty_num/len(arc_dirty_hits)) * 100}%)")
print(f"CSQA: {csqa_ex_dirty_num}/{len(csqa_dirty_hits)} ({(csqa_ex_dirty_num/len(csqa_dirty_hits)) * 100}%)")
print(f"ARCT: {arct_ex_dirty_num}/{len(arct_dirty_hits)} ({(arct_ex_dirty_num/len(arct_dirty_hits)) * 100}%)")
print(f"PIQA: {piqa_ex_dirty_num}/{len(piqa_dirty_hits)} ({(piqa_ex_dirty_num/len(piqa_dirty_hits)) * 100}%)")

Statistics after ATOMIC
ARC: 14/1172 (1.1945392491467577%)
CSQA: 62/1221 (5.077805077805078%)
ARCT: 0/888 (0.0%)
PIQA: 243/1838 (13.2208922742111%)


# RoBERTa PIQA Influence Test

In [59]:
sys.path.append(os.path.join("..", "RoBERTa"))

In [70]:
from models import RobertaForMultipleChoicePIQA
import torch
from sklearn.metrics import accuracy_score

In [61]:
hparams = Namespace(
    learning_rate=1e-3,
    gradient_accumulation_steps=0,
    seed=42,
    model_name="roberta-large",
    batch_size=32,
    max_seq_len=115,
    weight_decay=0.1,
    warmup_ratio=0.06,
)

In [62]:
PATH_TO_ROBERTA_PIQA_MODEL_CHECKPOINT = ""

roberta = RobertaForMultipleChoicePIQA.load_from_checkpoint(
    PATH_TO_ROBERTA_PIQA_MODEL_CHECKPOINT,
    hparams=hparams,
    data_path=os.path.join("..", "data", "PIQA"),
    epochs=10,
    lr_schedule=None,
    num_classes=2,
)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForMultipleChoice: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [63]:
roberta = roberta.cuda()

In [64]:
piqa_dirty = (piqa_dirty_hits["BookCorpus_hits"] > 0) | (piqa_dirty_hits["ccnews_hits"] > 0) | (piqa_dirty_hits["openwebtext_hits"] > 0) | (piqa_dirty_hits["stories_hits"] > 0) | (piqa_dirty_hits["wikipedia_hits"] > 0) | (piqa_dirty_hits["atomic_hits"] > 0)

In [65]:
piqa_dirty_idxs = np.nonzero(piqa_dirty.values)[0]
piqa_dirty_idxs

array([  18,   22,   29,   47,   68,   82,   85,   87,   96,  107,  135,
        139,  142,  147,  152,  157,  159,  164,  167,  199,  200,  216,
        220,  223,  227,  233,  237,  239,  242,  244,  247,  254,  264,
        267,  273,  274,  275,  276,  278,  282,  283,  292,  306,  313,
        314,  318,  337,  346,  354,  361,  362,  363,  373,  374,  393,
        414,  425,  430,  431,  437,  443,  470,  474,  480,  485,  513,
        514,  518,  522,  544,  546,  553,  562,  569,  581,  587,  590,
        596,  639,  641,  648,  650,  654,  679,  692,  702,  703,  705,
        710,  712,  717,  735,  738,  743,  745,  746,  747,  768,  772,
        782,  786,  800,  828,  831,  835,  849,  852,  858,  860,  861,
        870,  871,  880,  886,  891,  920,  922,  928,  936,  939,  944,
        946,  956,  962,  969,  976,  985,  987,  990,  997, 1010, 1013,
       1027, 1037, 1046, 1054, 1057, 1066, 1078, 1085, 1086, 1097, 1099,
       1105, 1112, 1119, 1127, 1138, 1157, 1161, 11

In [74]:
piqa_clean_idxs = np.nonzero(~piqa_dirty.values)[0]

In [66]:
roberta.prepare_data()

100%|██████████| 14501/14501 [00:09<00:00, 1565.07it/s]
100%|██████████| 1612/1612 [00:00<00:00, 1645.64it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1647.00it/s]


In [67]:
y_pred = []
y_true = []

b_idx = 0

with torch.no_grad():
    while b_idx * 32 < len(roberta.test_data):
        input_ids, attention_mask, y = roberta.test_data[b_idx * 32: b_idx * 32 + 32]
        input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()
        ys = roberta(input_ids, attention_mask)[0].cpu()
        input_ids, attention_mask = input_ids.cpu(), attention_mask.cpu()
        y_pred.extend(ys.cpu().tolist())
        y_true.extend(y.tolist())
        b_idx += 1

In [72]:
y_pred = np.array(y_pred)
y_true = np.array(y_true)

In [75]:
accuracy_score(y_true[piqa_dirty_idxs], y_pred[piqa_dirty_idxs])

0.8353909465020576

In [76]:
accuracy_score(y_true[piqa_clean_idxs], y_pred[piqa_clean_idxs])

0.7893416927899687

In [77]:
roberta = roberta.cpu()
del roberta

In [78]:
torch.cuda.empty_cache()

# ARC RoBERTa Influence Test

In [79]:
from models import RobertaForRankingARC

In [80]:
hparams = Namespace(
    learning_rate=1e-3,
    gradient_accumulation_steps=0,
    seed=42,
    model_name="roberta-large",
    batch_size=32,
    max_seq_len=90,
    weight_decay=0.1,
    warmup_ratio=0.06,
)

In [81]:
PATH_TO_ROBERTA_ARC_MODEL_CHECKPOINT = ""

roberta = RobertaForRankingARC.load_from_checkpoint(
    PATH_TO_ROBERTA_ARC_MODEL_CHECKPOINT,
    hparams=hparams,
    data_path=None,
    epochs=10,
    lr_schedule=None,
    num_classes=5,
)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForVariableMultipleChoice: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForVariableMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForVariableMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForVariableMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream t

In [82]:
roberta = roberta.cuda()

In [84]:
arc_dirty = (arc_dirty_hits["BookCorpus_hits"] > 0) | (arc_dirty_hits["ccnews_hits"] > 0) | (arc_dirty_hits["openwebtext_hits"] > 0) | (arc_dirty_hits["stories_hits"] > 0) | (arc_dirty_hits["wikipedia_hits"] > 0) | (arc_dirty_hits["atomic_hits"] > 0)

In [85]:
arc_dirty_idxs = np.nonzero(arc_dirty.values)[0]

In [86]:
arc_clean_idxs = np.nonzero(~arc_dirty.values)[0]

In [87]:
roberta.prepare_data()

100%|██████████| 1119/1119 [00:01<00:00, 707.71it/s]
100%|██████████| 299/299 [00:00<00:00, 719.67it/s]
100%|██████████| 1172/1172 [00:01<00:00, 759.23it/s]


In [88]:
y_pred = []
y_true = []

b_idx = 0

with torch.no_grad():
    while b_idx * 32 < len(roberta.test_data):
        input_ids, attention_mask, y = roberta.test_data[b_idx * 32: b_idx * 32 + 32]
        input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()
        ys = roberta(input_ids, attention_mask)[0].cpu()
        input_ids, attention_mask = input_ids.cpu(), attention_mask.cpu()
        y_pred.extend(ys.cpu().tolist())
        y_true.extend(y.tolist())
        b_idx += 1

In [89]:
y_pred = np.array(y_pred)
y_true = np.array(y_true)

In [90]:
accuracy_score(y_true[arc_dirty_idxs], y_pred[arc_dirty_idxs])

0.7142857142857143

In [91]:
accuracy_score(y_true[arc_clean_idxs], y_pred[arc_clean_idxs])

0.4317789291882556

In [92]:
roberta = roberta.cpu()
del roberta

In [93]:
torch.cuda.empty_cache()

# CSQA RoBERTa Influence Test

In [94]:
from models import RobertaForCSQA

In [95]:
hparams = Namespace(
    learning_rate=1e-3,
    gradient_accumulation_steps=0,
    seed=42,
    model_name="roberta-large",
    batch_size=32,
    max_seq_len=90,
    weight_decay=0.1,
    warmup_ratio=0.06,
)

In [96]:
PATH_TO_CSQA_MODEL_CHECKPOINT = ""

roberta = RobertaForCSQA.load_from_checkpoint(
    PATH_TO_CSQA_MODEL_CHECKPOINT,
    hparams=hparams,
    data_path=None,
    epochs=10,
    lr_schedule=None,
    num_classes=5,
)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForMultipleChoice: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [97]:
roberta = roberta.cuda()

In [98]:
csqa_dirty = (csqa_dirty_hits["BookCorpus_hits"] > 0) | (csqa_dirty_hits["ccnews_hits"] > 0) | (csqa_dirty_hits["openwebtext_hits"] > 0) | (csqa_dirty_hits["stories_hits"] > 0) | (csqa_dirty_hits["wikipedia_hits"] > 0) | (csqa_dirty_hits["atomic_hits"] > 0)

In [99]:
csqa_dirty_idxs = np.nonzero(csqa_dirty.values)[0]

In [100]:
csqa_clean_idxs = np.nonzero(~csqa_dirty.values)[0]

In [101]:
roberta.prepare_data()

100%|██████████| 8766/8766 [00:11<00:00, 794.53it/s]
100%|██████████| 975/975 [00:01<00:00, 811.79it/s]
100%|██████████| 1221/1221 [00:01<00:00, 782.96it/s]


In [102]:
y_pred = []
y_true = []

b_idx = 0

with torch.no_grad():
    while b_idx * 32 < len(roberta.test_data):
        input_ids, attention_mask, y = roberta.test_data[b_idx * 32: b_idx * 32 + 32]
        input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()
        ys = roberta(input_ids, attention_mask)[0].cpu()
        input_ids, attention_mask = input_ids.cpu(), attention_mask.cpu()
        y_pred.extend(ys.cpu().tolist())
        y_true.extend(y.tolist())
        b_idx += 1

In [103]:
y_pred = np.array(y_pred)
y_true = np.array(y_true)

In [104]:
accuracy_score(y_true[csqa_dirty_idxs], y_pred[csqa_dirty_idxs])

0.7258064516129032

In [105]:
accuracy_score(y_true[csqa_clean_idxs], y_pred[csqa_clean_idxs])

0.7385677308024159

In [106]:
roberta = roberta.cpu()
del roberta

In [107]:
torch.cuda.empty_cache()

# COMET(BART) PIQA Influence Test

Restart kernel here cos of import issues

In [60]:
sys.path.append(os.path.join("..", "BART_AND_COMET"))

In [61]:
from models import BARTForPIQA
import torch
from sklearn.metrics import accuracy_score

In [62]:
PATH_COMET_PRETRAIN_CHECKPOINT = ""

hparams = Namespace(
    learning_rate=1e-3,
    gradient_accumulation_steps=0,
    seed=42,
    model_name="facebook/bart-large",
    pretrained_weights=PATH_COMET_PRETRAIN_CHECKPOINT,
    batch_size=32,
    max_seq_len=101,
    weight_decay=0.1,
    warmup_ratio=0.06,
)

In [None]:
PATH_TO_COMET_PIQA_CHECKPOINT = ""

bart = BARTForPIQA.load_from_checkpoint(
    PATH_TO_COMET_PIQA_CHECKPOINT,
    hparams=hparams,
    data_path=os.path.join("..", "data", "PIQA"),
    epochs=10,
    lr_schedule=None,
    num_classes=2,
)

In [65]:
bart = bart.cuda()

In [66]:
piqa_dirty = (piqa_dirty_hits["BookCorpus_hits"] > 0) | (piqa_dirty_hits["ccnews_hits"] > 0) | (piqa_dirty_hits["openwebtext_hits"] > 0) | (piqa_dirty_hits["stories_hits"] > 0) | (piqa_dirty_hits["wikipedia_hits"] > 0) | (piqa_dirty_hits["atomic_hits"] > 0)

In [67]:
piqa_dirty_idxs = np.nonzero(piqa_dirty.values)[0]
piqa_dirty_idxs

array([  18,   22,   29,   47,   68,   82,   85,   87,   96,  107,  135,
        139,  142,  147,  152,  157,  159,  164,  167,  199,  200,  216,
        220,  223,  227,  233,  237,  239,  242,  244,  247,  254,  264,
        267,  273,  274,  275,  276,  278,  282,  283,  292,  306,  313,
        314,  318,  337,  346,  354,  361,  362,  363,  373,  374,  393,
        414,  425,  430,  431,  437,  443,  470,  474,  480,  485,  513,
        514,  518,  522,  544,  546,  553,  562,  569,  581,  587,  590,
        596,  639,  641,  648,  650,  654,  679,  692,  702,  703,  705,
        710,  712,  717,  735,  738,  743,  745,  746,  747,  768,  772,
        782,  786,  800,  828,  831,  835,  849,  852,  858,  860,  861,
        870,  871,  880,  886,  891,  920,  922,  928,  936,  939,  944,
        946,  956,  962,  969,  976,  985,  987,  990,  997, 1010, 1013,
       1027, 1037, 1046, 1054, 1057, 1066, 1078, 1085, 1086, 1097, 1099,
       1105, 1112, 1119, 1127, 1138, 1157, 1161, 11

In [68]:
piqa_clean_idxs = np.nonzero(~piqa_dirty.values)[0]

In [69]:
bart.prepare_data()

100%|██████████| 14501/14501 [00:10<00:00, 1330.97it/s]
100%|██████████| 1612/1612 [00:01<00:00, 1405.21it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1412.02it/s]


In [71]:
y_pred = []
y_true = []

b_idx = 0

with torch.no_grad():
    while b_idx * 32 < len(bart.test_data):
        input_ids, attention_mask, y = bart.test_data[b_idx * 32: b_idx * 32 + 32]
        input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()
        ys = bart(input_ids, attention_mask)[0].cpu()
        input_ids, attention_mask = input_ids.cpu(), attention_mask.cpu()
        y_pred.extend(ys.cpu().tolist())
        y_true.extend(y.tolist())
        b_idx += 1

In [72]:
y_pred = np.array(y_pred)
y_true = np.array(y_true)

In [73]:
accuracy_score(y_true[piqa_dirty_idxs], y_pred[piqa_dirty_idxs])

0.8189300411522634

In [74]:
accuracy_score(y_true[piqa_clean_idxs], y_pred[piqa_clean_idxs])

0.7899686520376176

In [76]:
bart = bart.cpu()
del bart

In [77]:
torch.cuda.empty_cache()

# COMET(BART) ARC Influence Test

In [78]:
from models import BARTForARC

In [79]:
hparams = Namespace(
    learning_rate=1e-3,
    gradient_accumulation_steps=0,
    seed=42,
    model_name="facebook/bart-large",
    pretrained_weights=PATH_COMET_PRETRAIN_CHECKPOINT,
    batch_size=32,
    max_seq_len=91,
    weight_decay=0.1,
    warmup_ratio=0.06,
)

In [None]:
PATH_TO_COMET_ARC_CHECKPOINT = ""

bart = BARTForARC.load_from_checkpoint(
    PATH_TO_COMET_ARC_CHECKPOINT,
    hparams=hparams,
    data_path=None,
    epochs=10,
    lr_schedule=None,
    num_classes=5,
)

In [81]:
bart = bart.cuda()

In [82]:
arc_dirty = (arc_dirty_hits["BookCorpus_hits"] > 0) | (arc_dirty_hits["ccnews_hits"] > 0) | (arc_dirty_hits["openwebtext_hits"] > 0) | (arc_dirty_hits["stories_hits"] > 0) | (arc_dirty_hits["wikipedia_hits"] > 0) | (arc_dirty_hits["atomic_hits"] > 0)

In [83]:
arc_dirty_idxs = np.nonzero(arc_dirty.values)[0]

In [84]:
arc_clean_idxs = np.nonzero(~arc_dirty.values)[0]

In [85]:
bart.prepare_data()

100%|██████████| 1119/1119 [00:01<00:00, 569.51it/s]
100%|██████████| 299/299 [00:00<00:00, 635.63it/s]
100%|██████████| 1172/1172 [00:01<00:00, 664.60it/s]


In [86]:
y_pred = []
y_true = []

b_idx = 0

with torch.no_grad():
    while b_idx * 32 < len(bart.test_data):
        input_ids, attention_mask, y = bart.test_data[b_idx * 32: b_idx * 32 + 32]
        input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()
        ys = bart(input_ids, attention_mask)[0].cpu()
        input_ids, attention_mask = input_ids.cpu(), attention_mask.cpu()
        y_pred.extend(ys.cpu().tolist())
        y_true.extend(y.tolist())
        b_idx += 1

In [87]:
y_pred = np.array(y_pred)
y_true = np.array(y_true)

In [88]:
accuracy_score(y_true[arc_dirty_idxs], y_pred[arc_dirty_idxs])

0.6428571428571429

In [89]:
accuracy_score(y_true[arc_clean_idxs], y_pred[arc_clean_idxs])

0.41968911917098445

In [90]:
bart = bart.cpu()
del bart

In [91]:
torch.cuda.empty_cache()

# COMET(BART) CSQA Influence Test

In [92]:
from models import BARTForCSQA

In [93]:
hparams = Namespace(
    learning_rate=1e-3,
    gradient_accumulation_steps=0,
    seed=42,
    model_name="facebook/bart-large",
    pretrained_weights=PATH_COMET_PRETRAIN_CHECKPOINT,
    batch_size=32,
    max_seq_len=88,
    weight_decay=0.1,
    warmup_ratio=0.06,
)

In [None]:
PATH_TO_COMET_CSQA_CHECKPOINT = ""

bart = BARTForCSQA.load_from_checkpoint(
    PATH_TO_COMET_CSQA_CHECKPOINT,
    hparams=hparams,
    data_path=None,
    epochs=10,
    lr_schedule=None,
    num_classes=5,
)

In [95]:
bart = bart.cuda()

In [96]:
csqa_dirty = (csqa_dirty_hits["BookCorpus_hits"] > 0) | (csqa_dirty_hits["ccnews_hits"] > 0) | (csqa_dirty_hits["openwebtext_hits"] > 0) | (csqa_dirty_hits["stories_hits"] > 0) | (csqa_dirty_hits["wikipedia_hits"] > 0) | (csqa_dirty_hits["atomic_hits"] > 0)

In [97]:
csqa_dirty_idxs = np.nonzero(csqa_dirty.values)[0]

In [98]:
csqa_clean_idxs = np.nonzero(~csqa_dirty.values)[0]

In [99]:
bart.prepare_data()

100%|██████████| 8766/8766 [00:11<00:00, 777.38it/s]
100%|██████████| 975/975 [00:01<00:00, 802.78it/s]
100%|██████████| 1221/1221 [00:01<00:00, 768.01it/s]


In [100]:
y_pred = []
y_true = []

b_idx = 0

with torch.no_grad():
    while b_idx * 32 < len(bart.test_data):
        input_ids, attention_mask, y = bart.test_data[b_idx * 32: b_idx * 32 + 32]
        input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()
        ys = bart(input_ids, attention_mask)[0].cpu()
        input_ids, attention_mask = input_ids.cpu(), attention_mask.cpu()
        y_pred.extend(ys.cpu().tolist())
        y_true.extend(y.tolist())
        b_idx += 1

In [101]:
y_pred = np.array(y_pred)
y_true = np.array(y_true)

In [102]:
accuracy_score(y_true[csqa_dirty_idxs], y_pred[csqa_dirty_idxs])

0.7096774193548387

In [103]:
accuracy_score(y_true[csqa_clean_idxs], y_pred[csqa_clean_idxs])

0.727351164797239

# Answer Balance

In [6]:
from collections import Counter

In [None]:
# ARC
ds = load_dataset("ai2_arc", "ARC-Challenge")

In [25]:
for split in ds:
    count = Counter()
    for ex in ds[split]:
        count[ex["choices"]["label"].index(ex['answerKey'])] += 1
    counts = sorted(count.most_common(), key=lambda x: x[0])
    print(split)
    for answ_n, occurrences in counts:
        print(f"{answ_n}:", f"Occurrences={occurrences}", f"Percentage={round(occurrences / len(ds[split]), 3)}")
    print()

train
0: Occurrences=239 Percentage=0.214
1: Occurrences=296 Percentage=0.265
2: Occurrences=291 Percentage=0.26
3: Occurrences=293 Percentage=0.262

test
0: Occurrences=266 Percentage=0.227
1: Occurrences=311 Percentage=0.265
2: Occurrences=310 Percentage=0.265
3: Occurrences=285 Percentage=0.243

validation
0: Occurrences=64 Percentage=0.214
1: Occurrences=73 Percentage=0.244
2: Occurrences=78 Percentage=0.261
3: Occurrences=83 Percentage=0.278
4: Occurrences=1 Percentage=0.003



In [None]:
# CSQA
ds = load_dataset("commonsense_qa")

In [27]:
for split in filter(lambda x: x != "test", ds):
    count = Counter()
    for ex in ds[split]:
        count[ex["choices"]["label"].index(ex['answerKey'])] += 1
    counts = sorted(count.most_common(), key=lambda x: x[0])
    print(split)
    for answ_n, occurrences in counts:
        print(f"{answ_n}:", f"Occurrences={occurrences}", f"Percentage={round(occurrences / len(ds[split]), 3)}")
    print()

train
0: Occurrences=1909 Percentage=0.196
1: Occurrences=1973 Percentage=0.203
2: Occurrences=1946 Percentage=0.2
3: Occurrences=1985 Percentage=0.204
4: Occurrences=1928 Percentage=0.198

validation
0: Occurrences=239 Percentage=0.196
1: Occurrences=255 Percentage=0.209
2: Occurrences=241 Percentage=0.197
3: Occurrences=251 Percentage=0.206
4: Occurrences=235 Percentage=0.192



In [34]:
# ARCT
train = list(map(lambda x: x[1], pd.read_csv(os.path.join("..", "data", "arct", "train.csv"), sep="\t").iterrows()))
dev = list(map(lambda x: x[1], pd.read_csv(os.path.join("..", "data", "arct", "dev.csv"), sep="\t").iterrows()))
test = list(map(lambda x: x[1], pd.read_csv(os.path.join("..", "data", "arct", "test.csv"), sep="\t").iterrows()))

In [39]:
for split, df in [("train", train), ("dev", dev), ("test", test)]:
    count = Counter()
    for ex in df:
        count[ex["correctLabelW0orW1"]] += 1
    counts = sorted(count.most_common(), key=lambda x: x[0])
    print(split)
    for answ_n, occurrences in counts:
        print(f"{answ_n}:", f"Occurrences={occurrences}", f"Percentage={round(occurrences / len(df), 3)}")
    print()

train
0: Occurrences=1210 Percentage=0.5
1: Occurrences=1210 Percentage=0.5

dev
0: Occurrences=316 Percentage=0.5
1: Occurrences=316 Percentage=0.5

test
0: Occurrences=444 Percentage=0.5
1: Occurrences=444 Percentage=0.5



In [44]:
# PIQA
with open(os.path.join("..", "data", "PIQA", "train-labels.lst")) as fr:
    train = list(map(lambda x: int(x), fr))

with open(os.path.join("..", "data", "PIQA", "valid-labels.lst")) as fr:
    dev = list(map(lambda x: int(x), fr))

In [47]:
for split, df in [("train", train), ("dev", dev)]:
    count = Counter()
    for ex in df:
        count[ex] += 1
    counts = sorted(count.most_common(), key=lambda x: x[0])
    print(split)
    for answ_n, occurrences in counts:
        print(f"{answ_n}:", f"Occurrences={occurrences}", f"Percentage={round(occurrences / len(df), 3)}")
    print()

train
0: Occurrences=8053 Percentage=0.5
1: Occurrences=8060 Percentage=0.5

dev
0: Occurrences=910 Percentage=0.495
1: Occurrences=928 Percentage=0.505



# Statistical Cue (Niven style)

In [66]:
from collections import defaultdict
from functools import reduce

In [123]:
def calculate_cue_metrics(exs, n):
    counters = defaultdict(Counter)
    
    for ex in exs:
        choices_ngrams = [
            set(ngrams(tokenize(choice), n))
            for choice in ex["choices"]
        ]
        for i, choice in enumerate(choices_ngrams):
            diff = choice.difference(reduce(lambda acc, x: acc.union(x), choices_ngrams[:i] + choices_ngrams[i + 1:], set()))
            for cue in diff:
                counters["applicability"][cue] += 1
                if i == ex["target"]:
                    counters["applicability_right"][cue] += 1
    
    for cue in counters["applicability"]:
        counters["productivity"][cue] =  counters["applicability_right"][cue] / counters["applicability"][cue]
        counters["coverage"][cue] = counters["applicability"][cue] / len(exs)
    
    return counters

In [None]:
# ARC

ds = load_dataset("ai2_arc", "ARC-Challenge")
metrics_uni = calculate_cue_metrics(
    [
        {"choices": ex["choices"]["text"], "target": ex["choices"]["label"].index(ex["answerKey"])}
        for split_ds in [ds["train"], ds["validation"], ds["test"]]
        for ex in split_ds
    ],
    1
)
metrics_bi = calculate_cue_metrics(
    [
        {"choices": ex["choices"]["text"], "target": ex["choices"]["label"].index(ex["answerKey"])}
        for split_ds in [ds["train"], ds["validation"], ds["test"]]
        for ex in split_ds
    ],
    2
)

In [126]:
for cue, coverage in metrics_uni["coverage"].most_common(25):
    print(cue, coverage, metrics_uni["productivity"][cue], f"useful={metrics_uni['productivity'][cue] > 1/4}")

('to',) 0.13281853281853281 0.25872093023255816 useful=True
('in',) 0.12664092664092663 0.25 useful=False
('of',) 0.12625482625482626 0.24770642201834864 useful=False
('a',) 0.10926640926640926 0.2226148409893993 useful=False
('the',) 0.09266409266409266 0.24583333333333332 useful=False
('water',) 0.0918918918918919 0.15126050420168066 useful=False
('from',) 0.0694980694980695 0.22777777777777777 useful=False
('and',) 0.06254826254826255 0.4074074074074074 useful=True
('on',) 0.062162162162162166 0.2608695652173913 useful=True
('for',) 0.05328185328185328 0.2898550724637681 useful=True
('an',) 0.04903474903474903 0.29133858267716534 useful=True
('more',) 0.04749034749034749 0.17073170731707318 useful=False
('is',) 0.044787644787644784 0.16379310344827586 useful=False
('food',) 0.04247104247104247 0.35454545454545455 useful=True
('are',) 0.040926640926640924 0.3018867924528302 useful=True
('energy',) 0.040926640926640924 0.2358490566037736 useful=False
('with',) 0.04054054054054054 0.20

In [127]:
for cue, coverage in metrics_bi["coverage"].most_common(25):
    print(cue, coverage, metrics_bi["productivity"][cue], f"useful={metrics_uni['productivity'][cue] > 1/4}")

('of', 'the') 0.06525096525096526 0.15384615384615385 useful=False
('in', 'the') 0.06332046332046332 0.24390243902439024 useful=False
('to', 'the') 0.03513513513513514 0.24175824175824176 useful=False
('amount', 'of') 0.029343629343629343 0.25 useful=False
('from', 'the') 0.027413127413127413 0.2676056338028169 useful=False
('in', 'a') 0.02702702702702703 0.3 useful=False
('on', 'the') 0.025096525096525095 0.2153846153846154 useful=False
('the', 'same') 0.021621621621621623 0.30357142857142855 useful=False
('number', 'of') 0.021235521235521235 0.16363636363636364 useful=False
('the', 'amount') 0.019691119691119693 0.23529411764705882 useful=False
('of', 'a') 0.018532818532818532 0.25 useful=False
('the', 'sun') 0.01776061776061776 0.391304347826087 useful=False
('carbon', 'dioxide') 0.015057915057915058 0.07692307692307693 useful=False
('into', 'the') 0.013513513513513514 0.2571428571428571 useful=False
('of', 'water') 0.013513513513513514 0.17142857142857143 useful=False
('type', 'of'

In [None]:
# CSQA

ds = load_dataset("commonsense_qa")
metrics_uni = calculate_cue_metrics(
    [
        {"choices": ex["choices"]["text"], "target": ex["choices"]["label"].index(ex["answerKey"])}
        for split_ds in [ds["train"], ds["validation"]]
        for ex in split_ds
    ],
    1
)
metrics_bi = calculate_cue_metrics(
    [
        {"choices": ex["choices"]["text"], "target": ex["choices"]["label"].index(ex["answerKey"])}
        for split_ds in [ds["train"], ds["validation"]]
        for ex in split_ds
    ],
    2
)

In [130]:
for cue, coverage in metrics_uni["coverage"].most_common(25):
    print(cue, coverage, metrics_uni["productivity"][cue], f"useful={metrics_uni['productivity'][cue] > 1/5}")

('store',) 0.0652253238460135 0.23636363636363636 useful=True
('house',) 0.06139390622149243 0.1887072808320951 useful=False
('to',) 0.05966064586754242 0.16666666666666666 useful=False
('of',) 0.059295748950921365 0.19384615384615383 useful=False
('in',) 0.045064769202700236 0.1214574898785425 useful=False
('office',) 0.034026637474913336 0.225201072386059 useful=True
('city',) 0.03357051632913702 0.22010869565217392 useful=True
('room',) 0.03247582557927386 0.22752808988764045 useful=True
('school',) 0.03229337712096333 0.1864406779661017 useful=False
('get',) 0.031472359058565956 0.22608695652173913 useful=True
('park',) 0.029556650246305417 0.16049382716049382 useful=False
('home',) 0.028188286808976466 0.20064724919093851 useful=True
('building',) 0.0280970625798212 0.22727272727272727 useful=True
('the',) 0.026637474913336984 0.0 useful=False
('go',) 0.026455026455026454 0.2 useful=False
('have',) 0.025725232621784347 0.24113475177304963 useful=True
('new',) 0.025451559934318555 

In [131]:
for cue, coverage in metrics_bi["coverage"].most_common(25):
    print(cue, coverage, metrics_bi["productivity"][cue], f"useful={metrics_bi['productivity'][cue] > 1/5}")

('go', 'to') 0.012406495165115855 0.19852941176470587 useful=False
('new', 'york') 0.01076445904032111 0.211864406779661 useful=True
('grocery', 'store') 0.010490786352855318 0.2 useful=False
('have', 'fun') 0.00875752599890531 0.25 useful=True
('talk', 'to') 0.007754059478197409 0.058823529411764705 useful=False
('office', 'building') 0.007662835249042145 0.25 useful=True
('friend', 'house') 0.007571611019886882 0.26506024096385544 useful=True
('each', 'other') 0.007571611019886882 0.0963855421686747 useful=False
('neighbor', 'house') 0.0074803867907316185 0.2804878048780488 useful=True
('living', 'room') 0.007297938332421091 0.1875 useful=False
('music', 'store') 0.007297938332421091 0.3375 useful=True
('of', 'earth') 0.007297938332421091 0.2125 useful=True
('surface', 'of') 0.007297938332421091 0.2125 useful=True
('united', 'states') 0.006933041415800037 0.2631578947368421 useful=True
('own', 'home') 0.006933041415800037 0.25 useful=True
('train', 'station') 0.006750592957489509 0.2

In [132]:
# ARCT

ds = list(map(lambda x: x[1], pd.read_csv(os.path.join("..", "data", "arct", "train.csv"), sep="\t").iterrows())) + \
    list(map(lambda x: x[1], pd.read_csv(os.path.join("..", "data", "arct", "dev.csv"), sep="\t").iterrows())) + \
    list(map(lambda x: x[1], pd.read_csv(os.path.join("..", "data", "arct", "test.csv"), sep="\t").iterrows()))

metrics_uni = calculate_cue_metrics(
    [
        {"choices": [ex["warrant0"], ex["warrant1"]], "target": ex["correctLabelW0orW1"]}
        for ex in ds
    ],
    1
)
metrics_bi = calculate_cue_metrics(
    [
        {"choices": [ex["warrant0"], ex["warrant1"]], "target": ex["correctLabelW0orW1"]}
        for ex in ds
    ],
    2
)

In [133]:
for cue, coverage in metrics_uni["coverage"].most_common(25):
    print(cue, coverage, metrics_uni["productivity"][cue], f"useful={metrics_uni['productivity'][cue] > 1/2}")

('not',) 0.3766497461928934 0.5 useful=False
('do',) 0.11776649746192894 0.5 useful=False
('does',) 0.062436548223350256 0.5 useful=False
('can',) 0.06142131979695432 0.5 useful=False
('to',) 0.05583756345177665 0.5 useful=False
('and',) 0.04517766497461929 0.5 useful=False
('no',) 0.04314720812182741 0.5 useful=False
('a',) 0.04010152284263959 0.5 useful=False
('ca',) 0.03654822335025381 0.5 useful=False
('be',) 0.0350253807106599 0.5 useful=False
('more',) 0.03451776649746193 0.5 useful=False
('is',) 0.03299492385786802 0.5 useful=False
('are',) 0.03299492385786802 0.5 useful=False
('will',) 0.029949238578680204 0.5 useful=False
('of',) 0.028426395939086295 0.5 useful=False
('have',) 0.027918781725888325 0.5 useful=False
('should',) 0.027411167512690356 0.5 useful=False
('the',) 0.026903553299492386 0.5 useful=False
('wo',) 0.024873096446700507 0.5 useful=False
('only',) 0.023857868020304568 0.5 useful=False
('still',) 0.02182741116751269 0.5 useful=False
('even',) 0.0213197969543147

In [134]:
for cue, coverage in metrics_bi["coverage"].most_common(25):
    print(cue, coverage, metrics_bi["productivity"][cue], f"useful={metrics_bi['productivity'][cue] > 1/2}")

('is', 'not') 0.09035532994923857 0.5 useful=False
('are', 'not') 0.06802030456852792 0.5 useful=False
('do', 'not') 0.04365482233502538 0.5 useful=False
('can', 'not') 0.03248730964467005 0.5 useful=False
('does', 'not') 0.030456852791878174 0.5 useful=False
('not', 'be') 0.028934010152284265 0.5 useful=False
('is', 'a') 0.025888324873096447 0.5 useful=False
('can', 'be') 0.024365482233502538 0.5 useful=False
('will', 'not') 0.023350253807106598 0.5 useful=False
('not', 'a') 0.02081218274111675 0.5 useful=False
('to', 'be') 0.016751269035532996 0.5 useful=False
('should', 'be') 0.016243654822335026 0.5 useful=False
('not', 'have') 0.015228426395939087 0.5 useful=False
('people', 'do') 0.014720812182741117 0.5 useful=False
('should', 'not') 0.014720812182741117 0.5 useful=False
('they', 'do') 0.013197969543147208 0.5 useful=False
('need', 'to') 0.012690355329949238 0.5 useful=False
('do', 'need') 0.012690355329949238 0.5 useful=False
('not', 'the') 0.011675126903553299 0.5 useful=False

In [135]:
# PIQA

with open(os.path.join("..", "data", "PIQA", "train-labels.lst")) as fr:
    train_y = list(map(lambda x: int(x), fr))

with open(os.path.join("..", "data", "PIQA", "valid-labels.lst")) as fr:
    dev_y = list(map(lambda x: int(x), fr))
    
with open(os.path.join("..", "data", "PIQA", "valid.jsonl")) as fr:
    dev_x = list(map(lambda line: json.loads(line), fr))

with open(os.path.join("..", "data", "PIQA", "train.jsonl")) as fr:
    train_x = list(map(lambda line: json.loads(line), fr))
    
ds = list(zip(train_x, train_y)) + list(zip(dev_x, dev_y))

metrics_uni = calculate_cue_metrics(
    [
        {"choices": [ex[0]["sol1"], ex[0]["sol2"]], "target": ex[1]}
        for ex in ds
    ],
    1
)
metrics_bi = calculate_cue_metrics(
    [
        {"choices": [ex[0]["sol1"], ex[0]["sol2"]], "target": ex[1]}
        for ex in ds
    ],
    2
)

In [136]:
for cue, coverage in metrics_uni["coverage"].most_common(25):
    print(cue, coverage, metrics_uni["productivity"][cue], f"useful={metrics_uni['productivity'][cue] > 1/2}")

('a',) 0.09587209626204668 0.5235328297501453 useful=True
('of',) 0.07275360704139046 0.49617151607963245 useful=False
('to',) 0.06545596345607486 0.49361702127659574 useful=False
('and',) 0.06540025625313353 0.524701873935264 useful=True
('in',) 0.06428611219430673 0.46707105719237435 useful=False
('on',) 0.061389337641357025 0.5254083484573503 useful=True
('the',) 0.05453735167957217 0.39938712972420837 useful=False
('with',) 0.05197482034427051 0.4705251875669882 useful=False
('it',) 0.04746253690602195 0.4753521126760563 useful=False
('water',) 0.04194752381482926 0.5232403718459495 useful=True
('your',) 0.03737953317363935 0.45007451564828616 useful=False
('for',) 0.03153027686479862 0.450530035335689 useful=False
('you',) 0.028689209514790263 0.45048543689320386 useful=False
('paper',) 0.027129407832432735 0.46611909650924027 useful=False
('use',) 0.02612667817948861 0.5501066098081023 useful=True
('an',) 0.02612667817948861 0.5543710021321961 useful=True
('up',) 0.02540248454125

In [137]:
for cue, coverage in metrics_bi["coverage"].most_common(25):
    print(cue, coverage, metrics_bi["productivity"][cue], f"useful={metrics_bi['productivity'][cue] > 1/2}")

('in', 'the') 0.03197593448832934 0.4146341463414634 useful=False
('on', 'the') 0.028912038326555624 0.5375722543352601 useful=True
('of', 'the') 0.02835496629714222 0.5029469548133595 useful=True
('with', 'a') 0.025681020555957886 0.46637744034707157 useful=False
('use', 'a') 0.022505709988301488 0.5123762376237624 useful=True
('to', 'the') 0.020110300261823855 0.46814404432132967 useful=False
('in', 'a') 0.0175477689265222 0.5015873015873016 useful=True
('and', 'then') 0.01604367444710601 0.4270833333333333 useful=False
('into', 'the') 0.013648264720628377 0.5224489795918368 useful=True
('top', 'of') 0.013314021502980335 0.4476987447698745 useful=False
('the', 'top') 0.013258314300038996 0.47478991596638653 useful=False
('on', 'a') 0.01075149016767868 0.5699481865284974 useful=True
('the', 'bottom') 0.010472954152971978 0.526595744680851 useful=True
('from', 'the') 0.010417246950030639 0.4385026737967914 useful=False
('water', 'and') 0.009525931702969195 0.5497076023391813 useful=Tru