In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, roc_auc_score

## Pile-RS-Truncated

In [9]:
seq_lens = [64, 128, 256, 512, 1024, 2048]
base_path = "../data/pile/pile_rs_truncated"

In [2]:
def evaluate_bow_shift(df, n_runs=5):
    texts = df['truncated_text'].values
    labels = df['label'].values
    aucs = []

    sss = StratifiedShuffleSplit(n_splits=n_runs, test_size=0.2, random_state=42)

    for train_idx, test_idx in sss.split(texts, labels):
        X_train_texts, X_test_texts = texts[train_idx], texts[test_idx]
        y_train, y_test = labels[train_idx], labels[test_idx]

        # BoW vectorizer: keep 1-grams in ≥5% of training sequences
        vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=0.05)
        X_train = vectorizer.fit_transform(X_train_texts)
        X_test = vectorizer.transform(X_test_texts)

        clf = RandomForestClassifier(
            n_estimators=500,
            max_depth=2,
            min_samples_leaf=10,
            random_state=42
        )
        clf.fit(X_train, y_train)
        # y_pred = clf.predict(X_test)

        # acc = accuracy_score(y_test, y_pred)
        # accs.append(acc)
        y_scores = clf.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_scores)
        aucs.append(auc)

    mean_auc = np.mean(aucs)
    std_auc = np.std(aucs)
    print(f"Average AUC over {n_runs} runs: {mean_auc:.4f} ± {std_auc:.4f}")
    return aucs

### Wikipedia

In [None]:
wiki_data = {}
wiki_accs = {}

for seqlen in seq_lens:
    with open(f"{base_path}/wiki/pile_reservoirv2_wiki_seqlen{seqlen}.pkl", "rb") as f:
        wiki_data[seqlen] = pd.read_pickle(f)

    print('=====================')
    print(f'seq_len = {seqlen}')
    wiki_accs[seqlen] = evaluate_bow_shift(wiki_data[seqlen])
    print(f"Accuracy for seqlen {seqlen}:", wiki_accs[seqlen])
    print()


seq_len = 64
Average AUC over 5 runs: 0.5011 ± 0.0213
Accuracy for seqlen 64: [np.float64(0.512575), np.float64(0.46648750000000005), np.float64(0.487525), np.float64(0.5257375), np.float64(0.513075)]

seq_len = 128
Average AUC over 5 runs: 0.4992 ± 0.0192
Accuracy for seqlen 128: [np.float64(0.5222249999999999), np.float64(0.47157499999999997), np.float64(0.496225), np.float64(0.51915), np.float64(0.48692500000000005)]

seq_len = 256
Average AUC over 5 runs: 0.5472 ± 0.0252
Accuracy for seqlen 256: [np.float64(0.5465), np.float64(0.5254249999999999), np.float64(0.51815), np.float64(0.55665), np.float64(0.5892999999999999)]

seq_len = 512
Average AUC over 5 runs: 0.4917 ± 0.0265
Accuracy for seqlen 512: [np.float64(0.46072500000000005), np.float64(0.4678500000000001), np.float64(0.504575), np.float64(0.4913), np.float64(0.534275)]

seq_len = 1024
Average AUC over 5 runs: 0.4904 ± 0.0141
Accuracy for seqlen 1024: [np.float64(0.51095), np.float64(0.46797500000000003), np.float64(0.485200

### PubMed Central

In [10]:
pubmedc_data = {}
pubmedc_accs = {}

for seqlen in seq_lens:
    with open(f"{base_path}/pubmedc/pile_reservoirv2_pubmedc_seqlen{seqlen}.pkl", "rb") as f:
        pubmedc_data[seqlen] = pd.read_pickle(f)

    print('=====================')
    print(f'seq_len = {seqlen}')
    pubmedc_accs[seqlen] = evaluate_bow_shift(pubmedc_data[seqlen])
    print(f"Accuracy for seqlen {seqlen}:", pubmedc_accs[seqlen])
    print()

seq_len = 64
Average AUC over 5 runs: 0.4807 ± 0.0209
Accuracy for seqlen 64: [np.float64(0.48316250000000005), np.float64(0.4769), np.float64(0.45883749999999995), np.float64(0.51895), np.float64(0.46577500000000005)]

seq_len = 128
Average AUC over 5 runs: 0.5053 ± 0.0164
Accuracy for seqlen 128: [np.float64(0.5225), np.float64(0.514525), np.float64(0.474475), np.float64(0.50995), np.float64(0.5050749999999999)]

seq_len = 256
Average AUC over 5 runs: 0.5126 ± 0.0256
Accuracy for seqlen 256: [np.float64(0.545575), np.float64(0.50975), np.float64(0.467325), np.float64(0.5179), np.float64(0.5223249999999999)]

seq_len = 512
Average AUC over 5 runs: 0.5130 ± 0.0227
Accuracy for seqlen 512: [np.float64(0.5486499999999999), np.float64(0.4955), np.float64(0.498375), np.float64(0.49151249999999996), np.float64(0.5308)]

seq_len = 1024
Average AUC over 5 runs: 0.5043 ± 0.0286
Accuracy for seqlen 1024: [np.float64(0.50925), np.float64(0.505225), np.float64(0.501725), np.float64(0.457625000000

### USPTO

In [11]:
uspto_data = {}
uspto_accs = {}

for seqlen in seq_lens:
    with open(f"{base_path}/uspto/pile_reservoirv2_uspto_seqlen{seqlen}.pkl", "rb") as f:
        uspto_data[seqlen] = pd.read_pickle(f)

    print('=====================')
    print(f'seq_len = {seqlen}')
    uspto_accs[seqlen] = evaluate_bow_shift(uspto_data[seqlen])
    print(f"Accuracy for seqlen {seqlen}:", uspto_accs[seqlen])
    print()

seq_len = 64
Average AUC over 5 runs: 0.4956 ± 0.0323
Accuracy for seqlen 64: [np.float64(0.501325), np.float64(0.54805), np.float64(0.5043375), np.float64(0.45515000000000005), np.float64(0.46904999999999997)]

seq_len = 128
Average AUC over 5 runs: 0.4803 ± 0.0192
Accuracy for seqlen 128: [np.float64(0.48744999999999994), np.float64(0.498375), np.float64(0.4951625), np.float64(0.4454), np.float64(0.47507499999999997)]

seq_len = 256
Average AUC over 5 runs: 0.5011 ± 0.0158
Accuracy for seqlen 256: [np.float64(0.4716), np.float64(0.510375), np.float64(0.49806249999999996), np.float64(0.5106999999999999), np.float64(0.5146999999999999)]

seq_len = 512
Average AUC over 5 runs: 0.5282 ± 0.0170
Accuracy for seqlen 512: [np.float64(0.5115500000000001), np.float64(0.5583), np.float64(0.5296125), np.float64(0.51205), np.float64(0.529525)]

seq_len = 1024
Average AUC over 5 runs: 0.5193 ± 0.0121
Accuracy for seqlen 1024: [np.float64(0.52115), np.float64(0.513275), np.float64(0.518275), np.flo

### Pile-CC

In [None]:
pilecc_data = {}
pilecc_accs = {}

for seqlen in seq_lens:
    with open(f"{base_path}/pilecc/pile_reservoirv2_pilecc_seqlen{seqlen}.pkl", "rb") as f:
        pilecc_data[seqlen] = pd.read_pickle(f)

    print('=====================')
    print(f'seq_len = {seqlen}')
    pilecc_accs[seqlen] = evaluate_bow_shift(pilecc_data[seqlen])
    print(f"Accuracy for seqlen {seqlen}:", pilecc_accs[seqlen])
    print()


seq_len = 64
Average AUC over 5 runs: 0.5112 ± 0.0171
Accuracy for seqlen 64: [np.float64(0.49369999999999997), np.float64(0.5180750000000001), np.float64(0.5386375), np.float64(0.5131), np.float64(0.49242500000000006)]

seq_len = 128
Average AUC over 5 runs: 0.5139 ± 0.0272
Accuracy for seqlen 128: [np.float64(0.4635250000000001), np.float64(0.521725), np.float64(0.54445), np.float64(0.51325), np.float64(0.526675)]

seq_len = 256
Average AUC over 5 runs: 0.4829 ± 0.0181
Accuracy for seqlen 256: [np.float64(0.45714999999999995), np.float64(0.48607500000000003), np.float64(0.5093000000000001), np.float64(0.49249999999999994), np.float64(0.46952499999999997)]

seq_len = 512
Average AUC over 5 runs: 0.4625 ± 0.0265
Accuracy for seqlen 512: [np.float64(0.49085), np.float64(0.47330000000000005), np.float64(0.41275000000000006), np.float64(0.47364999999999996), np.float64(0.461725)]

seq_len = 1024
Average AUC over 5 runs: 0.4864 ± 0.0147
Accuracy for seqlen 1024: [np.float64(0.484025), np.f

### Github

In [None]:
github_data = {}
github_accs = {}

for seqlen in seq_lens:
    with open(f"{base_path}/github/pile_reservoirv2_github_seqlen{seqlen}.pkl", "rb") as f:
        github_data[seqlen] = pd.read_pickle(f)

    print('=====================')
    print(f'seq_len = {seqlen}')
    github_accs[seqlen] = evaluate_bow_shift(github_data[seqlen])
    print(f"Accuracy for seqlen {seqlen}:", github_accs[seqlen])
    print()

seq_len = 64
Average AUC over 5 runs: 0.5299 ± 0.0348
Accuracy for seqlen 64: [np.float64(0.472), np.float64(0.5692), np.float64(0.5114500000000001), np.float64(0.5401875), np.float64(0.5568750000000001)]

seq_len = 128
Average AUC over 5 runs: 0.5106 ± 0.0455
Accuracy for seqlen 128: [np.float64(0.4564375), np.float64(0.5491125), np.float64(0.528175), np.float64(0.45643750000000005), np.float64(0.562625)]

seq_len = 256
Average AUC over 5 runs: 0.4827 ± 0.0383
Accuracy for seqlen 256: [np.float64(0.4560125), np.float64(0.5425875), np.float64(0.464575), np.float64(0.4390875), np.float64(0.5111375)]

seq_len = 512
Average AUC over 5 runs: 0.4805 ± 0.0390
Accuracy for seqlen 512: [np.float64(0.44196250000000004), np.float64(0.54525), np.float64(0.4649375), np.float64(0.446725), np.float64(0.503775)]

seq_len = 1024
Average AUC over 5 runs: 0.4832 ± 0.0380
Accuracy for seqlen 1024: [np.float64(0.4266), np.float64(0.539375), np.float64(0.4936125), np.float64(0.4594875), np.float64(0.496712