In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("../")

In [3]:
import pickle
import numpy as np
from tqdm.auto import trange, tqdm
from features import get_transformer, merge_entries
import json
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from utills import chunker, get_num_chunks, cartesian_product
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import RandomizedSearchCV


In [4]:
import matplotlib.pyplot as plt
import matplotlib.style as style
import tikzplotlib
%matplotlib inline

In [5]:
from plotly.offline import init_notebook_mode
import plotly.offline as py
import plotly.graph_objs as go
init_notebook_mode(connected=True)

In [6]:
DATA_DIR = '../data/pan/'
GROUND_TRUTH_PATH = DATA_DIR + 'pan20-authorship-verification-training-large-truth.jsonl'
DATA_PATH = DATA_DIR + 'pan20-authorship-verification-training-large.jsonl'
PREPROCESSED_DATA_PATH = '../temp_data/pan/'
TEMP_DATA_PATH = '../temp_data/pan/split_models/'
FIGURES_PATH = '../figures/'

In [7]:
def fit_transformers(chunk_sz, data_fraction=0.01):
    docs_1 = []
    docs_2 = []
    author_bounds_1 = {}
    author_bounds_2 = {}
    i = 0
    with open(PREPROCESSED_DATA_PATH + 'preprocessed_train.jsonl', 'r') as f:
        for l in tqdm(f):
            if np.random.rand() < data_fraction:
                d = json.loads(l)
                docs_merged = [merge_entries(c) for c in list(chunker(d['pair'][0], chunk_sz))]
                author_bounds_1[i] = (len(docs_1), len(docs_1) + len(docs_merged))
                docs_1.extend(docs_merged)
                
                docs_merged = [merge_entries(c) for c in list(chunker(d['pair'][1], chunk_sz))]
                author_bounds_2[i] = (len(docs_2), len(docs_2) + len(docs_merged))
                docs_2.extend(docs_merged)
                i += 1

    num_recs = i            
    transformer = get_transformer()
    scaler = StandardScaler()
    secondary_scaler = StandardScaler()

    X = transformer.fit_transform(docs_1 + docs_2).todense()
    X = scaler.fit_transform(X)
    X1 = X[:len(docs_1)]
    X2 = X[len(docs_1):]
    
    XX = []
    for i in trange(num_recs):
        s_1, e_1 = author_bounds_1[i]
        s_2, e_2 = author_bounds_2[i]
        
        idxs = cartesian_product(range(s_1, e_1), range(s_2, e_2))
        XX.extend(np.abs(X1[idxs[:, 0], :] - X2[idxs[:, 1], :]))
    
    secondary_scaler.fit(XX)
    
    return transformer, scaler, secondary_scaler

def vectorize(vectorized_x_path, vectorized_y_path, ordered_idxs, ground_truth, transformer, scaler, secondary_scaler, preprocessed_path, vector_sz, chunk_sz):
    author_bounds = {}
    total_recs = 0
    with open(preprocessed_path, 'r') as f:
        i = 0;
        for l in tqdm(f, total=vector_sz):
            d = json.loads(l)
            n_doc1_chunks = get_num_chunks(d['pair'][0], chunk_sz)
            n_doc2_chunks = get_num_chunks(d['pair'][1], chunk_sz)
            author_bounds[i] = (total_recs, total_recs + n_doc1_chunks * n_doc2_chunks)
            
            total_recs += n_doc1_chunks * n_doc2_chunks
            i += 1
    
    # Shuffle author_bounds mapping
    shuffled_author_bounds = {}
    temp = {}
    b = 0
    for i in range(len(ordered_idxs)):
        s, e = author_bounds[ordered_idxs[i]]
        temp[i] = (b, b + (e - s))
        b += (e - s)

    for i in range(len(ordered_idxs)):
        shuffled_author_bounds[ordered_idxs[i]] = temp[i]
    
    x_shape = (total_recs, len(transformer.get_feature_names()))    
    XX = np.memmap(vectorized_x_path, dtype='float32', mode='w+', shape=x_shape)
    Y = np.memmap(vectorized_y_path, dtype='int32', mode='w+', shape=(total_recs))
    
    with open(preprocessed_path, 'r') as f:
        i = 0;
        for l in tqdm(f, total=vector_sz):
            try:
                d = json.loads(l)
            except:
                break


            s, e = shuffled_author_bounds[i]
            docs_merged_1 = [merge_entries(c) for c in list(chunker(d['pair'][0], chunk_sz))]
            docs_merged_2 = [merge_entries(c) for c in list(chunker(d['pair'][1], chunk_sz))]

            X_1 = scaler.transform(transformer.transform(docs_merged_1).todense())
            X_2 = scaler.transform(transformer.transform(docs_merged_2).todense())
            c_idxs = cartesian_product(range(len(X_1)), range(len(X_2)))

            XX[s:e, :] = secondary_scaler.transform(np.abs(X_1[c_idxs[:, 0], :] - X_2[c_idxs[:, 1], :]))
            Y[s:e] = ground_truth[d['id']]

            i += 1
    
    XX.flush()
    Y.flush()

In [8]:
ground_truth = {}
with open(GROUND_TRUTH_PATH, 'r') as f:
    for l in f:
        d = json.loads(l)
        ground_truth[d['id']] = d['same']

In [9]:
train_sz = 193536
test_sz = 81963

# with open(PREPROCESSED_DATA_PATH + 'preprocessed_train.jsonl', 'r') as f:
#     for l in f:
#         train_sz += 1

# with open(PREPROCESSED_DATA_PATH + 'preprocessed_test.jsonl', 'r') as f:
#     for l in f:
#         test_sz += 1

print('Train Sz:', train_sz, flush=True)
print('Test Sz:', test_sz, flush=True)

Train Sz: 193536
Test Sz: 81963


Visualize Data Stats
===

In [15]:
docs = []
lines_per_docs = []
data_fraction = 0.001
with open(PREPROCESSED_DATA_PATH + 'preprocessed_train.jsonl', 'r') as f:
    for l in tqdm(f):
        if np.random.rand() < data_fraction:
            d = json.loads(l)
            docs.extend(d['pair'][0])
            docs.extend(d['pair'][1])
            lines_per_docs.append(len(d['pair'][0]))
            lines_per_docs.append(len(d['pair'][1]))




In [16]:
# Lines per doc
fig = go.Figure()
fig.add_trace(go.Histogram(x=lines_per_docs))

In [17]:
# Tokens per line
fig = go.Figure()
fig.add_trace(go.Histogram(x=[len(l['tokens']) for l in docs]))

Fit Transofrmers
===

In [10]:
print('Fitting transformer...', flush=True)
chunk_sz = 5 # Gives on avg the same number of tokens as Reddit experiments
transformer, scaler, secondary_scaler = fit_transformers(data_fraction=0.1, chunk_sz=chunk_sz)
feature_sz = len(transformer.get_feature_names())

Fitting transformer...






The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



MemoryError: 

In [11]:
with open(TEMP_DATA_PATH + 'model_' + str(chunk_sz) + '.p', 'wb') as f:
    pickle.dump((transformer, scaler, secondary_scaler, chunk_sz), f)

In [None]:
print('Vectorizing train set...', flush=True)
train_idxs = np.array(range(train_sz))
np.random.shuffle(train_idxs)

vectorize(
    TEMP_DATA_PATH + 'split_vectorized_XX_train_' + str(chunk_sz) + '.npy', 
    TEMP_DATA_PATH + 'split_Y_train_' + str(chunk_sz) + '.npy', 
    train_idxs, 
    ground_truth,
    transformer, 
    scaler, 
    secondary_scaler, 
    PREPROCESSED_DATA_PATH + 'preprocessed_train.jsonl',
    train_sz,
    chunk_sz
)

In [None]:
print('Vectorizing test set...', flush=True)

test_idxs = np.array(range(test_sz))
np.random.shuffle(test_idxs)

vectorize(
    TEMP_DATA_PATH + 'split_vectorized_XX_test_' + str(chunk_sz) + '.npy', 
    TEMP_DATA_PATH + 'split_Y_test_' + str(chunk_sz) + '.npy', 
    test_idxs, 
    ground_truth,
    transformer, 
    scaler, 
    secondary_scaler, 
    PREPROCESSED_DATA_PATH + 'preprocessed_test.jsonl',
    test_sz,
    chunk_sz
)


In [None]:
with open(TEMP_DATA_PATH + 'experiment_data_' + str(chunk_sz) + '.p', 'wb') as f:
    pickle.dump((
        transformer, 
        scaler,
        secondary_scaler,
        feature_sz,
        train_sz,
        train_idxs,
        test_sz,
        test_idxs
    ), f)

In [15]:
XX_train = np.memmap(TEMP_DATA_PATH + 'split_vectorized_XX_train_' + str(chunk_sz) + '.npy', dtype='float32', mode='r', shape=(train_sz, feature_sz))
Y_train = np.memmap(TEMP_DATA_PATH + 'split_Y_train_' + str(chunk_sz) + '.npy', dtype='int32', mode='r', shape=(train_sz))

XX_test = np.memmap(TEMP_DATA_PATH + 'split_vectorized_XX_test_' + str(chunk_sz) + '.npy', dtype='float32', mode='r', shape=(test_sz, feature_sz))
Y_test = np.memmap(TEMP_DATA_PATH + 'split_Y_test_' + str(chunk_sz) + '.npy', dtype='int32', mode='r', shape=(test_sz))

In [16]:
print('Training classifier...', flush=True)
clf = SGDClassifier(loss='log', alpha=0.01)
batch_size=50000
num_epochs = 50
aucs = []
for i in trange(num_epochs):
    print('Epoch - ', i)
    print('-' * 30)
    for idxs in chunker(range(train_sz), batch_size):
        clf.partial_fit(XX_train[idxs, :], Y_train[idxs], classes=[0, 1])

    probs = clf.predict_proba(XX_test)[:, 1]
    fpr, tpr, thresh = roc_curve(Y_test, probs)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    print('AUC: ', roc_auc)
    with open(TEMP_DATA_PATH + 'experiment_data_' + str(chunk_sz) + '.p', 'wb') as f:
        pickle.dump((
            aucs,
            clf,
            roc_auc,
            transformer, 
            scaler,
            secondary_scaler,
            feature_sz,
            train_sz,
            train_idxs,
            test_sz,
            test_idxs
        ), f)

Training classifier...


Epoch -  0
------------------------------
AUC:  0.7403380192915994
Epoch -  1
------------------------------
AUC:  0.7600267969726189
Epoch -  2
------------------------------
AUC:  0.769572942321436
Epoch -  3
------------------------------
AUC:  0.7738850796145547
Epoch -  4
------------------------------
AUC:  0.7781455536576457
Epoch -  5
------------------------------
AUC:  0.7802499332824864
Epoch -  6
------------------------------
AUC:  0.7854119759891844
Epoch -  7
------------------------------
AUC:  0.785655318287598
Epoch -  8
------------------------------
AUC:  0.787784972749062
Epoch -  9
------------------------------
AUC:  0.7895081015623644
Epoch -  10
------------------------------
AUC:  0.790290801598309
Epoch -  11
------------------------------
AUC:  0.7903444282915744
Epoch -  12
------------------------------
AUC:  0.7914382723485067
Epoch -  13
------------------------------
AUC:  0.7922568645583628
Epoch -  14
------------------------------
AUC:  0.79211909808

In [18]:
# Chunk_sz=5
go.Figure(go.Scatter(
    x=np.arange(len(aucs)),
    y=aucs
))

In [19]:
go.Figure(go.Scatter(
    x=np.arange(len(aucs)),
    y=aucs
))

In [37]:
go.Figure(go.Scatter(
    x=np.arange(len(aucs)),
    y=aucs
))

In [18]:
chunk_sz

15

In [19]:
with open(TEMP_DATA_PATH + 'model_' + str(chunk_sz) + '.p', 'wb') as f:
    pickle.dump((clf, transformer, scaler, secondary_scaler), f)

In [53]:
labels = []

inter_probs_mean = []
inter_probs_std = []

intraA_probs_mean = []
intraA_probs_std = []

intraB_probs_mean = []
intraB_probs_std = []
pred_lengths = []

with open(PREPROCESSED_DATA_PATH + 'preprocessed_test.jsonl', 'r') as f:
    for l in tqdm(f, total=test_sz):
        d = json.loads(l)
        l = []
        
        docs_merged_1 = [merge_entries(c) for c in list(chunker(d['pair'][0], chunk_sz))]
        docs_merged_2 = [merge_entries(c) for c in list(chunker(d['pair'][1], chunk_sz))]

        X_1 = scaler.transform(transformer.transform(docs_merged_1).todense())
        X_2 = scaler.transform(transformer.transform(docs_merged_2).todense())

        idxs = cartesian_product(range(len(X_1)), range(len(X_2)))
        
        x_diff = secondary_scaler.transform(np.abs(X_1[idxs[:, 0]] - X_2[idxs[:, 1]]))
        x_diff[np.isnan(x_diff)]=0
        p = clf.predict_proba(x_diff)[:, 1]
        
        inter_probs_mean.append(p.mean())
        inter_probs_std.append(p.std())
        l.append(len(p))
        
        
        idxs = cartesian_product(range(len(X_1)), range(len(X_1)))
        idxs = np.array([(i, j) for i, j in idxs if i != j])
        if len(idxs) > 0:
            x_diff = secondary_scaler.transform(np.abs(X_1[idxs[:, 0]] - X_1[idxs[:, 1]]))
            x_diff[np.isnan(x_diff)]=0
            p = clf.predict_proba(x_diff)[:, 1]
            intraA_probs_mean.append(p.mean())
            intraA_probs_std.append(p.std())
            l.append(len(p))
        else:
            intraA_probs_mean.append(np.nan)
            intraA_probs_std.append(np.nan)
            l.append(0)
        
        idxs = cartesian_product(range(len(X_2)), range(len(X_2)))
        idxs = np.array([(i, j) for i, j in idxs if i != j])
        if len(idxs) > 0:
            x_diff = secondary_scaler.transform(np.abs(X_2[idxs[:, 0]] - X_2[idxs[:, 1]]))
            x_diff[np.isnan(x_diff)]=0
            p = clf.predict_proba(x_diff)[:, 1]
            intraB_probs_mean.append(p.mean())
            intraB_probs_std.append(p.std())
            l.append(len(p))
        else:
            intraB_probs_mean.append(np.nan)
            intraB_probs_std.append(np.nan)
            l.append(0)
            
        pred_lengths.append(l)
        labels.append(ground_truth[d['id']])


            




In [49]:
inter_probs_mean

array([0.99952666, 0.99952666, 0.99952666, ..., 0.99952666, 0.99952666,
       0.99952666])

In [54]:
inter_probs_mean = np.array(inter_probs_mean)
intraA_probs_mean = np.array(intraA_probs_mean)
intraB_probs_mean = np.array(intraB_probs_mean)
inter_probs_std = np.array(inter_probs_std)
intraA_probs_std = np.array(intraA_probs_std)
intraB_probs_std = np.array(intraB_probs_std)
labels = np.array(labels)
pred_lengths = np.array(pred_lengths)
intraA_probs_mean[np.isnan(intraA_probs_mean)] = 0.5
intraA_probs_std[np.isnan(intraA_probs_std)] = 0.5

intraB_probs_mean[np.isnan(intraB_probs_mean)] = 0.5
intraB_probs_std[np.isnan(intraB_probs_std)] = 0.5




n_a = pred_lengths[:, 0]
n_b = pred_lengths[:, 1]
n_ab = pred_lengths[:, 2]

intra_probs_mean = (intraA_probs_mean * n_a + intraB_probs_mean * n_b)/ (n_a + n_b)
intra_probs_std = (
        n_a * (intraA_probs_std ** 2 + (intraA_probs_mean - intra_probs_mean)**2) + 
        n_b * (intraB_probs_std ** 2 + (intraB_probs_mean - intra_probs_mean)**2)
    ) / (n_a + n_b)


pooled_mean = (intra_probs_mean * (n_a + n_b) + inter_probs_mean * n_ab)/ (n_a + n_b + n_ab)
pooled_std = (
        (n_a + n_b) * (intra_probs_mean ** 2 + (intra_probs_mean - pooled_mean)**2) + 
        n_ab * (inter_probs_mean ** 2 + (inter_probs_mean - pooled_mean)**2)
    ) / (n_a + n_b + n_ab)

In [55]:
# Chunk Size = 30
fpr, tpr, thresh = roc_curve(labels, inter_probs_mean)
roc_auc = auc(fpr, tpr)
print('Inter prob mean diff AUC:', round(roc_auc, 6))

pp = np.abs(inter_probs_mean - intra_probs_mean)/np.sqrt(inter_probs_std**2/n_ab + intra_probs_std**2/(n_a+n_b))
pp[np.isnan(pp)] = 0.5
fpr, tpr, thresh = roc_curve(labels, pp)
roc_auc = 1 - auc(fpr, tpr)
print('Z Score diff', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, (1 - np.abs(inter_probs_mean - intra_probs_mean)))
roc_auc = auc(fpr, tpr)
print('Abs mean diff AUC', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, (1 - 2 * np.abs(inter_probs_mean - intra_probs_mean)/(pooled_std)))
roc_auc = auc(fpr, tpr)
print('Abs mean diff / pooled std AUC', round(roc_auc, 3))

Inter prob mean diff AUC: 0.956919
Z Score diff 0.918
Abs mean diff AUC 0.954
Abs mean diff / pooled std AUC 0.954



divide by zero encountered in true_divide


invalid value encountered in true_divide

