In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../")

In [2]:
import pickle
import json
import numpy as np
from features import merge_entries
from utills import chunker, cartesian_product
from tqdm.auto import trange, tqdm
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc

In [3]:
TEMP_DATA_PATH = '../temp_data/pan/'
RESULTS_PATH = '../temp_data/pan/combined_results/'
PREPROCESSED_DATA_PATH = '../temp_data/pan/'
DATA_DIR = '../data/pan/'
GROUND_TRUTH_PATH = DATA_DIR + 'pan20-authorship-verification-training-large-truth.jsonl'
# chunk_sz=30
chunk_sz=5
test_sz = 81963

In [4]:
# Full, unchunked model
with open(TEMP_DATA_PATH + 'model.p', 'rb') as f:
    (clf_nc, transformer_nc, scaler_nc, secondary_scaler_nc) = pickle.load(f)
    
# Limitted, unchunked model
# with open(TEMP_DATA_PATH + 'model_limitted_data.p', 'rb') as f:
#     (clf_nc, transformer_nc, scaler_nc, secondary_scaler_nc) = pickle.load(f)

In [5]:
# # Chunked model
# with open(TEMP_DATA_PATH + 'split_models/model.p', 'rb') as f:
#     (clf, transformer, scaler, secondary_scaler) = pickle.load(f)
    
    
# # Chunked model
# with open(TEMP_DATA_PATH + 'split_models/model_15.p', 'rb') as f:
#     (clf, transformer, scaler, secondary_scaler) = pickle.load(f)

# Chunked model
with open(TEMP_DATA_PATH + 'split_models/model_5.p', 'rb') as f:
    (clf, transformer, scaler, secondary_scaler) = pickle.load(f)
    
# Chunked model, limitted data
# with open(TEMP_DATA_PATH + 'split_models/model_5_limitted_data.p', 'rb') as f:
#     (clf, transformer, scaler, secondary_scaler) = pickle.load(f)
    
# Chunked model, limitted data
# with open(TEMP_DATA_PATH + 'split_models/model_10_limitted_data.p', 'rb') as f:
#     (clf, transformer, scaler, secondary_scaler) = pickle.load(f)

In [6]:
ground_truth = {}
with open(GROUND_TRUTH_PATH, 'r') as f:
    for l in f:
        d = json.loads(l)
        ground_truth[d['id']] = d['same']

In [7]:
labels = []

probs_nc = []

inter_probs_mean = []
inter_probs_std = []

intraA_probs_mean = []
intraA_probs_std = []

intraB_probs_mean = []
intraB_probs_std = []
pred_lengths = []

with open(PREPROCESSED_DATA_PATH + 'preprocessed_test.jsonl', 'r') as f:
    for l in tqdm(f, total=test_sz):
#         if np.random.rand() > 0.1:
#             continue
        
        d = json.loads(l)
        l = []
        
        docs_merged_1 = [merge_entries(c) for c in list(chunker(d['pair'][0], chunk_sz))]
        docs_merged_2 = [merge_entries(c) for c in list(chunker(d['pair'][1], chunk_sz))]

        X_1 = scaler.transform(transformer.transform(docs_merged_1).todense())
        X_2 = scaler.transform(transformer.transform(docs_merged_2).todense())

        idxs = cartesian_product(range(len(X_1)), range(len(X_2)))
        
        x_diff = secondary_scaler.transform(np.abs(X_1[idxs[:, 0]] - X_2[idxs[:, 1]]))
        x_diff[np.isnan(x_diff)]=0
        p = clf.predict_proba(x_diff)[:, 1]
        
        inter_probs_mean.append(p.mean())
        inter_probs_std.append(p.std())
        l.append(len(p))
        
        
        idxs = cartesian_product(range(len(X_1)), range(len(X_1)))
        idxs = np.array([(i, j) for i, j in idxs if i != j])
        if len(idxs) > 0:
            x_diff = secondary_scaler.transform(np.abs(X_1[idxs[:, 0]] - X_1[idxs[:, 1]]))
            x_diff[np.isnan(x_diff)]=0
            p = clf.predict_proba(x_diff)[:, 1]
            intraA_probs_mean.append(p.mean())
            intraA_probs_std.append(p.std())
            l.append(len(p))
        else:
            intraA_probs_mean.append(np.nan)
            intraA_probs_std.append(np.nan)
            l.append(0)
        
        idxs = cartesian_product(range(len(X_2)), range(len(X_2)))
        idxs = np.array([(i, j) for i, j in idxs if i != j])
        if len(idxs) > 0:
            x_diff = secondary_scaler.transform(np.abs(X_2[idxs[:, 0]] - X_2[idxs[:, 1]]))
            x_diff[np.isnan(x_diff)]=0
            p = clf.predict_proba(x_diff)[:, 1]
            intraB_probs_mean.append(p.mean())
            intraB_probs_std.append(p.std())
            l.append(len(p))
        else:
            intraB_probs_mean.append(np.nan)
            intraB_probs_std.append(np.nan)
            l.append(0)
            
        pred_lengths.append(l)
        
        
        
        labels.append(ground_truth[d['id']])

        X_1 = scaler_nc.transform(transformer_nc.transform([merge_entries(d['pair'][0])]).todense())
        X_2 = scaler_nc.transform(transformer_nc.transform([merge_entries(d['pair'][1])]).todense())
        p = clf_nc.predict_proba(secondary_scaler_nc.transform(np.abs(X_1 - X_2)))[0, 1]
        probs_nc.append(p)
            




In [9]:

inter_probs_mean = np.array(inter_probs_mean)
intraA_probs_mean = np.array(intraA_probs_mean)
intraB_probs_mean = np.array(intraB_probs_mean)
inter_probs_std = np.array(inter_probs_std)
intraA_probs_std = np.array(intraA_probs_std)
intraB_probs_std = np.array(intraB_probs_std)
labels = np.array(labels)
pred_lengths = np.array(pred_lengths)


probs_nc = np.array(probs_nc)

n_a = pred_lengths[:, 0]
n_b = pred_lengths[:, 1]
n_ab = pred_lengths[:, 2]

intra_probs_mean = (intraA_probs_mean * n_a + intraB_probs_mean * n_b)/ (n_a + n_b)
intra_probs_std = (
        n_a * (intraA_probs_std ** 2 + (intraA_probs_mean - intra_probs_mean)**2) + 
        n_b * (intraB_probs_std ** 2 + (intraB_probs_mean - intra_probs_mean)**2)
    ) / (n_a + n_b)


pooled_mean = (intra_probs_mean * (n_a + n_b) + inter_probs_mean * n_ab)/ (n_a + n_b + n_ab)
pooled_std = (
        (n_a + n_b) * (intra_probs_mean ** 2 + (intra_probs_mean - pooled_mean)**2) + 
        n_ab * (inter_probs_mean ** 2 + (inter_probs_mean - pooled_mean)**2)
    ) / (n_a + n_b + n_ab)

In [10]:
intra_probs_mean[np.isnan(intra_probs_mean)] = 0.5
pooled_std[np.isnan(pooled_std)] = 0.5
pooled_mean[np.isnan(pooled_mean)] = 0.5

In [11]:
chunk_sz

5

In [14]:
# Chunk Sz 5, full data

10

In [16]:
# Chunk Sz 10, full data
fpr, tpr, thresh = roc_curve(labels, inter_probs_mean)
roc_auc = auc(fpr, tpr)
print('Inter prob mean AUC:', round(roc_auc, 6))

pp = np.abs(inter_probs_mean - intra_probs_mean)/np.sqrt(inter_probs_std**2/n_ab + intra_probs_std**2/(n_a+n_b))
pp[np.isnan(pp)] = 0.5
fpr, tpr, thresh = roc_curve(labels, pp)
roc_auc = 1 - auc(fpr, tpr)
print('Z Score diff', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, (1 - np.abs(inter_probs_mean - intra_probs_mean)))
roc_auc = auc(fpr, tpr)
print('Abs mean diff AUC', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, (1 - 2 * np.abs(inter_probs_mean - intra_probs_mean)/(pooled_std)))
roc_auc = auc(fpr, tpr)
print('Abs mean diff / pooled std AUC', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, probs_nc)
roc_auc = auc(fpr, tpr)
print('Unchunked AUC:', roc_auc)

fpr, tpr, thresh = roc_curve(labels, 0.5 * (probs_nc + (1 - np.abs(inter_probs_mean - intra_probs_mean))))
roc_auc = auc(fpr, tpr)
print('Unchunked + Inter-Intra diff:', roc_auc)


fpr, tpr, thresh = roc_curve(labels, (probs_nc * (1 - np.abs(inter_probs_mean - intra_probs_mean))))
roc_auc = auc(fpr, tpr)
print('Unchunked * Inter-Intra diff:', roc_auc)

with open(RESULTS_PATH + 'results_' + str(chunk_sz) + '.p', 'wb') as f:
    pickle.dump((
        inter_probs_mean,
        intraA_probs_mean,
        intraB_probs_mean,
        inter_probs_std,
        intraA_probs_std,
        intraB_probs_std,
        labels,
        pred_lengths,
        probs_nc
    ), f)

Inter prob mean AUC: 0.900089
Z Score diff 0.877
Abs mean diff AUC 0.918
Abs mean diff / pooled std AUC 0.917
Unchunked AUC: 0.9703022168840594
Unchunked + Inter-Intra diff: 0.9684649932488104
Unchunked * Inter-Intra diff: 0.9703482092725797


  


In [23]:
# Chunk Sz 10, limited data 
fpr, tpr, thresh = roc_curve(labels, inter_probs_mean)
roc_auc = auc(fpr, tpr)
print('Inter prob mean AUC:', round(roc_auc, 6))

pp = np.abs(inter_probs_mean - intra_probs_mean)/np.sqrt(inter_probs_std**2/n_ab + intra_probs_std**2/(n_a+n_b))
pp[np.isnan(pp)] = 0.5
fpr, tpr, thresh = roc_curve(labels, pp)
roc_auc = 1 - auc(fpr, tpr)
print('Z Score diff', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, (1 - np.abs(inter_probs_mean - intra_probs_mean)))
roc_auc = auc(fpr, tpr)
print('Abs mean diff AUC', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, (1 - 2 * np.abs(inter_probs_mean - intra_probs_mean)/(pooled_std)))
roc_auc = auc(fpr, tpr)
print('Abs mean diff / pooled std AUC', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, probs_nc)
roc_auc = auc(fpr, tpr)
print('Unchunked AUC:', roc_auc)

fpr, tpr, thresh = roc_curve(labels, 0.5 * (probs_nc + (1 - np.abs(inter_probs_mean - intra_probs_mean))))
roc_auc = auc(fpr, tpr)
print('Unchunked + Inter-Intra diff:', roc_auc)


fpr, tpr, thresh = roc_curve(labels, (probs_nc * (1 - np.abs(inter_probs_mean - intra_probs_mean))))
roc_auc = auc(fpr, tpr)
print('Unchunked * Inter-Intra diff:', roc_auc)

with open(RESULTS_PATH + 'results_' + str(chunk_sz) + '_limited_data.p', 'wb') as f:
    pickle.dump((
        inter_probs_mean,
        intraA_probs_mean,
        intraB_probs_mean,
        inter_probs_std,
        intraA_probs_std,
        intraB_probs_std,
        labels,
        pred_lengths,
        probs_nc
    ), f)

Inter prob mean AUC: 0.951128
Z Score diff 0.935
Abs mean diff AUC 0.952
Abs mean diff / pooled std AUC 0.953
Unchunked AUC: 0.9497490308055819
Unchunked + Inter-Intra diff: 0.9534875111906185
Unchunked * Inter-Intra diff: 0.9530706122116563


  


In [16]:
# Chunk Sz 5, limited data
fpr, tpr, thresh = roc_curve(labels, inter_probs_mean)
roc_auc = auc(fpr, tpr)
print('Inter prob mean AUC:', round(roc_auc, 6))

pp = np.abs(inter_probs_mean - intra_probs_mean)/np.sqrt(inter_probs_std**2/n_ab + intra_probs_std**2/(n_a+n_b))
pp[np.isnan(pp)] = 0.5
fpr, tpr, thresh = roc_curve(labels, pp)
roc_auc = 1 - auc(fpr, tpr)
print('Z Score diff', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, (1 - np.abs(inter_probs_mean - intra_probs_mean)))
roc_auc = auc(fpr, tpr)
print('Abs mean diff AUC', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, (1 - 2 * np.abs(inter_probs_mean - intra_probs_mean)/(pooled_std)))
roc_auc = auc(fpr, tpr)
print('Abs mean diff / pooled std AUC', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, probs_nc)
roc_auc = auc(fpr, tpr)
print('Unchunked AUC:', roc_auc)

fpr, tpr, thresh = roc_curve(labels, 0.5 * (probs_nc + (1 - np.abs(inter_probs_mean - intra_probs_mean))))
roc_auc = auc(fpr, tpr)
print('Unchunked + Inter-Intra diff:', roc_auc)


fpr, tpr, thresh = roc_curve(labels, (probs_nc * (1 - np.abs(inter_probs_mean - intra_probs_mean))))
roc_auc = auc(fpr, tpr)
print('Unchunked * Inter-Intra diff:', roc_auc)

with open(RESULTS_PATH + 'results_' + str(chunk_sz) + '_limited_data.p', 'wb') as f:
    pickle.dump((
        inter_probs_mean,
        intraA_probs_mean,
        intraB_probs_mean,
        inter_probs_std,
        intraA_probs_std,
        intraB_probs_std,
        labels,
        pred_lengths,
        probs_nc
    ), f)

Inter prob mean AUC: 0.952075
Z Score diff 0.948
Abs mean diff AUC 0.949
Abs mean diff / pooled std AUC 0.955
Unchunked AUC: 0.9497490308055819
Unchunked + Inter-Intra diff: 0.9513507595545881
Unchunked * Inter-Intra diff: 0.9508999975616909


  


In [49]:
# Chunk Sz 30, full training data
fpr, tpr, thresh = roc_curve(labels, inter_probs_mean)
roc_auc = auc(fpr, tpr)
print('Inter prob mean AUC:', round(roc_auc, 6))

pp = np.abs(inter_probs_mean - intra_probs_mean)/np.sqrt(inter_probs_std**2/n_ab + intra_probs_std**2/(n_a+n_b))
pp[np.isnan(pp)] = 0.5
fpr, tpr, thresh = roc_curve(labels, pp)
roc_auc = 1 - auc(fpr, tpr)
print('Z Score diff', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, (1 - np.abs(inter_probs_mean - intra_probs_mean)))
roc_auc = auc(fpr, tpr)
print('Abs mean diff AUC', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, (1 - 2 * np.abs(inter_probs_mean - intra_probs_mean)/(pooled_std)))
roc_auc = auc(fpr, tpr)
print('Abs mean diff / pooled std AUC', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, probs_nc)
roc_auc = auc(fpr, tpr)
print('Unchunked AUC:', roc_auc)

fpr, tpr, thresh = roc_curve(labels, 0.5 * (probs_nc + (1 - np.abs(inter_probs_mean - intra_probs_mean))))
roc_auc = auc(fpr, tpr)
print('Unchunked + Inter-Intra diff:', roc_auc)


fpr, tpr, thresh = roc_curve(labels, (probs_nc * (1 - np.abs(inter_probs_mean - intra_probs_mean))))
roc_auc = auc(fpr, tpr)
print('Unchunked * Inter-Intra diff:', roc_auc)

with open(RESULTS_PATH + 'results_' + str(chunk_sz) + '.p', 'wb') as f:
    pickle.dump((
        inter_probs_mean,
        intraA_probs_mean,
        intraB_probs_mean,
        inter_probs_std,
        intraA_probs_std,
        intraB_probs_std,
        labels,
        pred_lengths,
        probs_nc
    ), f)

Inter prob mean AUC: 0.956919
Z Score diff 0.918
Abs mean diff AUC 0.954
Abs mean diff / pooled std AUC 0.954
Unchunked AUC: 0.9703022168840594
Unchunked + Inter-Intra diff: 0.9710349130076832
Unchunked * Inter-Intra diff: 0.9717032780131589


  
  


In [14]:
# Chunk Sz 15, full training data
fpr, tpr, thresh = roc_curve(labels, inter_probs_mean)
roc_auc = auc(fpr, tpr)
print('Inter prob mean AUC:', round(roc_auc, 6))

pp = np.abs(inter_probs_mean - intra_probs_mean)/np.sqrt(inter_probs_std**2/n_ab + intra_probs_std**2/(n_a+n_b))
pp[np.isnan(pp)] = 0.5
fpr, tpr, thresh = roc_curve(labels, pp)
roc_auc = 1 - auc(fpr, tpr)
print('Z Score diff', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, (1 - np.abs(inter_probs_mean - intra_probs_mean)))
roc_auc = auc(fpr, tpr)
print('Abs mean diff AUC', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, (1 - 2 * np.abs(inter_probs_mean - intra_probs_mean)/(pooled_std)))
roc_auc = auc(fpr, tpr)
print('Abs mean diff / pooled std AUC', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, probs_nc)
roc_auc = auc(fpr, tpr)
print('Unchunked AUC:', roc_auc)

fpr, tpr, thresh = roc_curve(labels, 0.5 * (probs_nc + (1 - np.abs(inter_probs_mean - intra_probs_mean))))
roc_auc = auc(fpr, tpr)
print('Unchunked + Inter-Intra diff:', roc_auc)


fpr, tpr, thresh = roc_curve(labels, (probs_nc * (1 - np.abs(inter_probs_mean - intra_probs_mean))))
roc_auc = auc(fpr, tpr)
print('Unchunked * Inter-Intra diff:', roc_auc)

with open(RESULTS_PATH + 'results_' + str(chunk_sz) + '.p', 'wb') as f:
    pickle.dump((
        inter_probs_mean,
        intraA_probs_mean,
        intraB_probs_mean,
        inter_probs_std,
        intraA_probs_std,
        intraB_probs_std,
        labels,
        pred_lengths,
        probs_nc
    ), f)

Inter prob mean AUC: 0.95748
Z Score diff 0.946
Abs mean diff AUC 0.957
Abs mean diff / pooled std AUC 0.957
Unchunked AUC: 0.9703022168840594
Unchunked + Inter-Intra diff: 0.9719958386477112
Unchunked * Inter-Intra diff: 0.9722288960019245


  


In [14]:
# Chunk Sz 5, full training data
fpr, tpr, thresh = roc_curve(labels, inter_probs_mean)
roc_auc = auc(fpr, tpr)
print('Inter prob mean AUC:', round(roc_auc, 6))

pp = np.abs(inter_probs_mean - intra_probs_mean)/np.sqrt(inter_probs_std**2/n_ab + intra_probs_std**2/(n_a+n_b))
pp[np.isnan(pp)] = 0.5
fpr, tpr, thresh = roc_curve(labels, pp)
roc_auc = 1 - auc(fpr, tpr)
print('Z Score diff', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, (1 - np.abs(inter_probs_mean - intra_probs_mean)))
roc_auc = auc(fpr, tpr)
print('Abs mean diff AUC', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, (1 - 2 * np.abs(inter_probs_mean - intra_probs_mean)/(pooled_std)))
roc_auc = auc(fpr, tpr)
print('Abs mean diff / pooled std AUC', round(roc_auc, 3))

fpr, tpr, thresh = roc_curve(labels, probs_nc)
roc_auc = auc(fpr, tpr)
print('Unchunked AUC:', roc_auc)

fpr, tpr, thresh = roc_curve(labels, 0.5 * (probs_nc + (1 - np.abs(inter_probs_mean - intra_probs_mean))))
roc_auc = auc(fpr, tpr)
print('Unchunked + Inter-Intra diff:', roc_auc)


fpr, tpr, thresh = roc_curve(labels, (probs_nc * (1 - np.abs(inter_probs_mean - intra_probs_mean))))
roc_auc = auc(fpr, tpr)
print('Unchunked * Inter-Intra diff:', roc_auc)

with open(RESULTS_PATH + 'results_' + str(chunk_sz) + '.p', 'wb') as f:
    pickle.dump((
        inter_probs_mean,
        intraA_probs_mean,
        intraB_probs_mean,
        inter_probs_std,
        intraA_probs_std,
        intraB_probs_std,
        labels,
        pred_lengths,
        probs_nc
    ), f)

Inter prob mean AUC: 0.918796
Z Score diff 0.92
Abs mean diff AUC 0.924
Abs mean diff / pooled std AUC 0.926
Unchunked AUC: 0.9703022168840594
Unchunked + Inter-Intra diff: 0.9683624895720556
Unchunked * Inter-Intra diff: 0.9701088768052554


  
