In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("../")

In [3]:
import pickle
import numpy as np
from tqdm.auto import trange, tqdm
from features import get_transformer, merge_entries
import json
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from utills import chunker, get_num_chunks, cartesian_product
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import RandomizedSearchCV


In [4]:
import matplotlib.pyplot as plt
import matplotlib.style as style
import tikzplotlib
%matplotlib inline

In [5]:
from plotly.offline import init_notebook_mode
import plotly.offline as py
import plotly.graph_objs as go
init_notebook_mode(connected=True)

In [6]:
DATA_DIR = '../data/pan/'
GROUND_TRUTH_PATH = DATA_DIR + 'pan20-authorship-verification-training-large-truth.jsonl'
DATA_PATH = DATA_DIR + 'pan20-authorship-verification-training-large.jsonl'
PREPROCESSED_DATA_PATH = '../temp_data/pan/'
TEMP_DATA_PATH = '../temp_data/pan/split_models/'
FIGURES_PATH = '../figures/'


MAX_CHUNKS = 15

In [7]:
def fit_transformers(chunk_sz, data_fraction=0.01, max_chunks=MAX_CHUNKS):
    docs_1 = []
    docs_2 = []
    author_bounds_1 = {}
    author_bounds_2 = {}
    i = 0
    with open(PREPROCESSED_DATA_PATH + 'preprocessed_train.jsonl', 'r') as f:
        for l in tqdm(f):
            if np.random.rand() < data_fraction:
                d = json.loads(l)
                docs_merged = [merge_entries(c) for c in list(chunker(d['pair'][0][:max_chunks], chunk_sz))]
                author_bounds_1[i] = (len(docs_1), len(docs_1) + len(docs_merged))
                docs_1.extend(docs_merged)
                
                docs_merged = [merge_entries(c) for c in list(chunker(d['pair'][1][:max_chunks], chunk_sz))]
                author_bounds_2[i] = (len(docs_2), len(docs_2) + len(docs_merged))
                docs_2.extend(docs_merged)
                i += 1

    num_recs = i            
    transformer = get_transformer()
    scaler = StandardScaler()
    secondary_scaler = StandardScaler()

    X = transformer.fit_transform(docs_1 + docs_2).todense()
    X = scaler.fit_transform(X)
    X1 = X[:len(docs_1)]
    X2 = X[len(docs_1):]
    
    XX = []
    for i in trange(num_recs):
        s_1, e_1 = author_bounds_1[i]
        s_2, e_2 = author_bounds_2[i]
        
        idxs = cartesian_product(range(s_1, e_1), range(s_2, e_2))
        XX.extend(np.abs(X1[idxs[:, 0], :] - X2[idxs[:, 1], :]))
    
    secondary_scaler.fit(XX)
    
    return transformer, scaler, secondary_scaler

def vectorize(vectorized_x_path, vectorized_y_path, ground_truth, transformer, scaler, secondary_scaler, preprocessed_path, vector_sz, chunk_sz, data_fraction=0.25, max_chunks=MAX_CHUNKS):
    author_bounds = {}
    total_recs = 0
    sampled_ids = []
    with open(preprocessed_path, 'r') as f:
        i = 0;
        for l in tqdm(f, total=vector_sz):
            if np.random.rand() > data_fraction:
                continue
            d = json.loads(l)
            sampled_ids.append(d['id'])
            n_doc1_chunks = get_num_chunks(d['pair'][0][:max_chunks], chunk_sz)
            n_doc2_chunks = get_num_chunks(d['pair'][1][:max_chunks], chunk_sz)
            author_bounds[i] = (total_recs, total_recs + n_doc1_chunks * n_doc2_chunks)
            
            total_recs += n_doc1_chunks * n_doc2_chunks
            i += 1
    ordered_idxs = np.array(range(len(sampled_ids)))
    np.random.shuffle(ordered_idxs)
    # Shuffle author_bounds mapping
    shuffled_author_bounds = {}
    temp = {}
    b = 0
    for i in range(len(ordered_idxs)):
        s, e = author_bounds[ordered_idxs[i]]
        temp[i] = (b, b + (e - s))
        b += (e - s)

    for i in range(len(ordered_idxs)):
        shuffled_author_bounds[ordered_idxs[i]] = temp[i]
    assert len(shuffled_author_bounds) == len(author_bounds) and len(author_bounds) == len(sampled_ids)
    x_shape = (total_recs, len(transformer.get_feature_names()))    
    XX = np.memmap(vectorized_x_path, dtype='float32', mode='w+', shape=x_shape)
    Y = np.memmap(vectorized_y_path, dtype='int32', mode='w+', shape=(total_recs))
    
    with open(preprocessed_path, 'r') as f:
        i = 0;
        for l in tqdm(f, total=vector_sz):
            d = json.loads(l)
            if d['id'] not in sampled_ids:
                continue
            try:
                s, e = shuffled_author_bounds[i]
            except:
                print('Error:', i, d['id'])
                i +=1
                continue
            docs_merged_1 = [merge_entries(c) for c in list(chunker(d['pair'][0][:max_chunks], chunk_sz))]
            docs_merged_2 = [merge_entries(c) for c in list(chunker(d['pair'][1][:max_chunks], chunk_sz))]

            X_1 = scaler.transform(transformer.transform(docs_merged_1).todense())
            X_2 = scaler.transform(transformer.transform(docs_merged_2).todense())
            c_idxs = cartesian_product(range(len(X_1)), range(len(X_2)))

            XX[s:e, :] = secondary_scaler.transform(np.abs(X_1[c_idxs[:, 0], :] - X_2[c_idxs[:, 1], :]))
            Y[s:e] = ground_truth[d['id']]

            i += 1
    
    XX.flush()
    Y.flush()
    return ordered_idxs, total_recs

In [8]:
ground_truth = {}
with open(GROUND_TRUTH_PATH, 'r') as f:
    for l in f:
        d = json.loads(l)
        ground_truth[d['id']] = d['same']

In [9]:
train_sz = 193536
test_sz = 81963

# with open(PREPROCESSED_DATA_PATH + 'preprocessed_train.jsonl', 'r') as f:
#     for l in f:
#         train_sz += 1

# with open(PREPROCESSED_DATA_PATH + 'preprocessed_test.jsonl', 'r') as f:
#     for l in f:
#         test_sz += 1

print('Train Sz:', train_sz, flush=True)
print('Test Sz:', test_sz, flush=True)

Train Sz: 193536
Test Sz: 81963


Fit Transofrmers
===

In [10]:
print('Fitting transformer...', flush=True)
chunk_sz = 10
transformer, scaler, secondary_scaler = fit_transformers(data_fraction=0.01, chunk_sz=chunk_sz, max_chunks=MAX_CHUNKS)
feature_sz = len(transformer.get_feature_names())

Fitting transformer...






The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'






In [11]:
with open(TEMP_DATA_PATH + 'model_' + str(chunk_sz) + '_limitted_data.p', 'wb') as f:
    pickle.dump((transformer, scaler, secondary_scaler, chunk_sz), f)

In [None]:
print('Vectorizing train set...', flush=True)
# train_idxs = np.array(range(train_sz))
# np.random.shuffle(train_idxs)

train_idxs, total_recs_train = vectorize(
    TEMP_DATA_PATH + 'split_vectorized_XX_train_' + str(chunk_sz) + '_limitted_data.npy', 
    TEMP_DATA_PATH + 'split_Y_train_' + str(chunk_sz) + '_limitted_data.npy', 
#     train_idxs, 
    ground_truth,
    transformer, 
    scaler, 
    secondary_scaler, 
    PREPROCESSED_DATA_PATH + 'preprocessed_train.jsonl',
    train_sz,
    chunk_sz,
    data_fraction=0.25,
    max_chunks=MAX_CHUNKS
)

Vectorizing train set...





In [27]:
# vectorized_x_path, vectorized_y_path
vectorized_x_path = TEMP_DATA_PATH + 'split_vectorized_XX_train_' + str(chunk_sz) + '_limitted_data.npy'
vectorized_y_path = TEMP_DATA_PATH + 'split_Y_train_' + str(chunk_sz) + '_limitted_data.npy'
preprocessed_path = PREPROCESSED_DATA_PATH + 'preprocessed_train.jsonl'
max_chunks=MAX_CHUNKS
data_fraction=0.25

In [28]:
author_bounds = {}
total_recs = 0
sampled_ids = []
with open(preprocessed_path, 'r') as f:
    i = 0;
    for l in tqdm(f, total=train_sz):
        if np.random.rand() > data_fraction:
            continue
        d = json.loads(l)
        sampled_ids.append(d['id'])
        n_doc1_chunks = get_num_chunks(d['pair'][0][:max_chunks], chunk_sz)
        n_doc2_chunks = get_num_chunks(d['pair'][1][:max_chunks], chunk_sz)
        author_bounds[i] = (total_recs, total_recs + n_doc1_chunks * n_doc2_chunks)

        total_recs += n_doc1_chunks * n_doc2_chunks
        i += 1
ordered_idxs = np.array(range(len(sampled_ids)))
np.random.shuffle(ordered_idxs)
# Shuffle author_bounds mapping
shuffled_author_bounds = {}
temp = {}
b = 0
for i in range(len(ordered_idxs)):
    s, e = author_bounds[ordered_idxs[i]]
    temp[i] = (b, b + (e - s))
    b += (e - s)

for i in range(len(ordered_idxs)):
    shuffled_author_bounds[ordered_idxs[i]] = temp[i]




In [29]:
ordered_idxs = np.array(range(len(sampled_ids)))
np.random.shuffle(ordered_idxs)
# Shuffle author_bounds mapping
shuffled_author_bounds = {}
temp = {}
b = 0
for i in range(len(ordered_idxs)):
    s, e = author_bounds[ordered_idxs[i]]
    temp[i] = (b, b + (e - s))
    b += (e - s)

for i in range(len(ordered_idxs)):
    shuffled_author_bounds[ordered_idxs[i]] = temp[i]

In [32]:
len(sampled_ids), len(author_bounds), len(shuffled_author_bounds)

(48533, 48533, 48533)

In [44]:
vectorized_x_path

'../temp_data/pan/split_models/split_vectorized_XX_train_10_limitted_data.npy'

In [None]:
#^^ stopped here

In [33]:
x_shape = (total_recs, len(transformer.get_feature_names()))    
XX = np.memmap(vectorized_x_path, dtype='float32', mode='w+', shape=x_shape)
Y = np.memmap(vectorized_y_path, dtype='int32', mode='w+', shape=(total_recs))

In [21]:
len(set(shuffled_author_bounds.keys()))

20542

In [35]:
train_idxs = ordered_idxs
total_recs_train = total_recs

In [41]:

with open(preprocessed_path, 'r') as f:
    i = 0;
    for l in tqdm(f, total=train_sz):
        d = json.loads(l)
        if d['id'] not in sampled_ids:
            continue
        s, e = shuffled_author_bounds[i]
        docs_merged_1 = [merge_entries(c) for c in list(chunker(d['pair'][0][:max_chunks], chunk_sz))]
        docs_merged_2 = [merge_entries(c) for c in list(chunker(d['pair'][1][:max_chunks], chunk_sz))]

        X_1 = scaler.transform(transformer.transform(docs_merged_1).todense())
        X_2 = scaler.transform(transformer.transform(docs_merged_2).todense())
        c_idxs = cartesian_product(range(len(X_1)), range(len(X_2)))

        XX[s:e, :] = secondary_scaler.transform(np.abs(X_1[c_idxs[:, 0], :] - X_2[c_idxs[:, 1], :]))
        Y[s:e] = ground_truth[d['id']]
        i += 1

XX.flush()
Y.flush()




KeyError: 48533

In [42]:
XX.flush()
Y.flush()

In [50]:
XX.shape, len(shuffled_author_bounds)

((194132, 8885), 48533)

In [31]:
print('Vectorizing test set...', flush=True)

# test_idxs = np.array(range(test_sz))
# np.random.shuffle(test_idxs)

test_idxs, total_recs_test = vectorize(
    TEMP_DATA_PATH + 'split_vectorized_XX_test_' + str(chunk_sz) + '_limitted_data.npy', 
    TEMP_DATA_PATH + 'split_Y_test_' + str(chunk_sz) + '_limitted_data.npy', 
#     test_idxs, 
    ground_truth,
    transformer, 
    scaler, 
    secondary_scaler, 
    PREPROCESSED_DATA_PATH + 'preprocessed_test.jsonl',
    test_sz,
    chunk_sz,
    data_fraction=0.25,
    max_chunks=MAX_CHUNKS
)


Vectorizing test set...








ValueError: could not broadcast input array from shape (9,7248) into shape (6,7248)

In [45]:
total_recs_train

194132

In [46]:
with open(TEMP_DATA_PATH + 'experiment_data_' + str(chunk_sz) + '_limitted_data.p', 'wb') as f:
    pickle.dump((
        transformer, 
        scaler,
        secondary_scaler,
        feature_sz,
        train_sz,
        train_idxs,
        test_sz,
        test_idxs,
        total_recs_train,
        total_recs_test
    ), f)

In [47]:
XX_train = np.memmap(TEMP_DATA_PATH + 'split_vectorized_XX_train_' + str(chunk_sz) + '_limitted_data.npy', dtype='float32', mode='r', shape=(total_recs_train, feature_sz))
Y_train = np.memmap(TEMP_DATA_PATH + 'split_Y_train_' + str(chunk_sz) + '_limitted_data.npy', dtype='int32', mode='r', shape=(total_recs_train))

XX_test = np.memmap(TEMP_DATA_PATH + 'split_vectorized_XX_test_' + str(chunk_sz) + '_limitted_data.npy', dtype='float32', mode='r', shape=(total_recs_test, feature_sz))
Y_test = np.memmap(TEMP_DATA_PATH + 'split_Y_test_' + str(chunk_sz) + '_limitted_data.npy', dtype='int32', mode='r', shape=(total_recs_test))

In [49]:
print('Training classifier...', flush=True)
clf = SGDClassifier(loss='log', alpha=0.01)
batch_size=10000
num_epochs = 50
aucs = []
for i in trange(num_epochs):
    print('Epoch - ', i)
    print('-' * 30)
    for idxs in chunker(range(total_recs_train), batch_size):
        clf.partial_fit(XX_train[idxs, :], Y_train[idxs], classes=[0, 1])

    probs = clf.predict_proba(XX_test)[:, 1]
    fpr, tpr, thresh = roc_curve(Y_test, probs)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    print('AUC: ', roc_auc)
    with open(TEMP_DATA_PATH + 'experiment_data_' + str(chunk_sz) + '_limitted_data.p', 'wb') as f:
        pickle.dump((
            aucs,
            clf,
            roc_auc,
            transformer, 
            scaler,
            secondary_scaler,
            feature_sz,
            train_sz,
            train_idxs,
            test_sz,
            test_idxs,
            total_recs_train,
            total_recs_test
        ), f)

Training classifier...


Epoch -  0
------------------------------
AUC:  0.8370158365884414
Epoch -  1
------------------------------
AUC:  0.8509624042927079
Epoch -  2
------------------------------
AUC:  0.8648538949144418
Epoch -  3
------------------------------
AUC:  0.8693963771327675
Epoch -  4
------------------------------
AUC:  0.874498552867186
Epoch -  5
------------------------------
AUC:  0.877090151760655
Epoch -  6
------------------------------
AUC:  0.8794289351351223
Epoch -  7
------------------------------
AUC:  0.8804566622310492
Epoch -  8
------------------------------
AUC:  0.8814697905607122
Epoch -  9
------------------------------
AUC:  0.8826867544691316
Epoch -  10
------------------------------
AUC:  0.8832392809288736
Epoch -  11
------------------------------
AUC:  0.8836481534351401
Epoch -  12
------------------------------
AUC:  0.8841378404341655
Epoch -  13
------------------------------
AUC:  0.884711360166012
Epoch -  14
------------------------------
AUC:  0.8848079563

In [None]:
# Run stops here

In [51]:
go.Figure(go.Scatter(
    x=np.arange(len(aucs)),
    y=aucs
))

Exception ignored in: <object repr() failed>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tqdm/std.py", line 1084, in __del__
    self.close()
  File "/usr/local/lib/python3.6/dist-packages/tqdm/notebook.py", line 241, in close
    super(tqdm_notebook, self).close(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tqdm/std.py", line 1260, in close
    if self.disable:
AttributeError: 'tqdm_notebook' object has no attribute 'disable'


In [52]:
chunk_sz

10

In [53]:
with open(TEMP_DATA_PATH + 'model_' + str(chunk_sz) + '_limitted_data.p', 'wb') as f:
    pickle.dump((clf, transformer, scaler, secondary_scaler), f)