In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("../")

In [3]:
import pickle
import numpy as np
from tqdm.auto import trange, tqdm
from features import get_transformer, merge_entries
import json
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from utills import chunker
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import RandomizedSearchCV


In [4]:
import matplotlib.pyplot as plt
import matplotlib.style as style
import tikzplotlib
%matplotlib inline

In [5]:
from plotly.offline import init_notebook_mode
import plotly.offline as py
import plotly.graph_objs as go
init_notebook_mode(connected=True)

In [6]:
DATA_DIR = '../data/pan/'
GROUND_TRUTH_PATH = DATA_DIR + 'pan20-authorship-verification-training-large-truth.jsonl'
DATA_PATH = DATA_DIR + 'pan20-authorship-verification-training-large.jsonl'
PREPROCESSED_DATA_PATH = '../temp_data/pan/'
TEMP_DATA_PATH = '../temp_data/pan/extremely_limitted_data/'
FIGURES_PATH = '../figures/'

MAX_CHUNKS = 15

In [12]:
def fit_transformers(sampled_idxs, max_chunks=MAX_CHUNKS):
    docs_1 = []
    docs_2 = []
    i = 0
    with open(PREPROCESSED_DATA_PATH + 'preprocessed_train.jsonl', 'r') as f:
        for l in tqdm(f):
            if i not in sampled_idxs:
                i += 1
                continue
            d = json.loads(l)    
            docs_1.append(merge_entries(d['pair'][0][:max_chunks]))
            docs_2.append(merge_entries(d['pair'][1][:max_chunks]))
            i += 1
    
    print('Num docs:', len(docs_1))
    transformer = get_transformer()
    scaler = StandardScaler()
    secondary_scaler = StandardScaler()

    X = transformer.fit_transform(docs_1 + docs_2).todense()
    X = scaler.fit_transform(X)
    X1 = X[:len(docs_1)]
    X2 = X[len(docs_1):]
    secondary_scaler.fit(np.abs(X1 - X2))
    
    return transformer, scaler, secondary_scaler


def vectorize(vectorized_x_path, vectorized_y_path, transformer, scaler, secondary_scaler, preprocessed_path, vector_Sz, sampled_idxs, max_chunks=MAX_CHUNKS):

    total_recs = len(sampled_idxs)
    x_shape = (total_recs, len(transformer.get_feature_names()))    
    XX = np.memmap(vectorized_x_path, dtype='float32', mode='w+', shape=x_shape)
    Y = np.memmap(vectorized_y_path, dtype='int32', mode='w+', shape=(total_recs))
    print('Total recs:', total_recs)
    ordered_idxs = np.array(range(len(sampled_idxs)))
    np.random.shuffle(ordered_idxs)
    
    with open(preprocessed_path, 'r') as f:
        batch_size = 10000
        i = 0;
        j = 0;
        docs1 = []
        docs2 = []
        idxs = []
        labels = []
        for l in tqdm(f, total=vector_Sz):
            if i not in sampled_idxs:
                i += 1
                continue
            d = json.loads(l)
            
            docs1.append(merge_entries(d['pair'][0][:max_chunks]))
            docs2.append(merge_entries(d['pair'][1][:max_chunks]))

            labels.append(ground_truth[d['id']])
            idxs.append(ordered_idxs[j])
            i += 1
            j += 1
            if len(labels) >= batch_size:
                x1 = scaler.transform(transformer.transform(docs1).todense())
                x2 = scaler.transform(transformer.transform(docs2).todense())
                XX[idxs, :] = secondary_scaler.transform(np.abs(x1-x2))
                Y[idxs] = labels

                docs1 = []
                docs2 = []
                idxs = []
                labels = []

        x1 = scaler.transform(transformer.transform(docs1).todense())
        x2 = scaler.transform(transformer.transform(docs2).todense())
        XX[idxs, :] = secondary_scaler.transform(np.abs(x1-x2))
        Y[idxs] = labels
        XX.flush()
        Y.flush()
        
    return ordered_idxs, total_recs

In [8]:
ground_truth = {}
with open(GROUND_TRUTH_PATH, 'r') as f:
    for l in f:
        d = json.loads(l)
        ground_truth[d['id']] = d['same']

In [9]:
train_sz = 193536
test_sz = 81963

# with open(PREPROCESSED_DATA_PATH + 'preprocessed_train.jsonl', 'r') as f:
#     for l in f:
#         train_sz += 1

# with open(PREPROCESSED_DATA_PATH + 'preprocessed_test.jsonl', 'r') as f:
#     for l in f:
#         test_sz += 1

print('Train Sz:', train_sz, flush=True)
print('Test Sz:', test_sz, flush=True)

Train Sz: 193536
Test Sz: 81963


In [10]:
with open('../temp_data/pan/split_models/extremely_limitted_data/experiment_data_5_limitted_data.p', 'rb') as f:
    (
        _, 
        _,
        _,
        _,
        _,
        _,
        _,
        _,
        _,
        _,
        sampled_idxs_train,
        sampled_idxs_test
    ) = pickle.load(f)
data_fraction=0.006

In [11]:
train_sz * data_fraction

1161.2160000000001

In [13]:
print('Fitting transformer...', flush=True)
transformer, scaler, secondary_scaler = fit_transformers(sampled_idxs_train, max_chunks=MAX_CHUNKS)
feature_sz = len(transformer.get_feature_names())

Fitting transformer...



Num docs: 1161



The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [14]:
with open(TEMP_DATA_PATH + 'model_limitted_data.p', 'wb') as f:
    pickle.dump((transformer, scaler, secondary_scaler), f)

In [15]:
print('Vectorizing train set...', flush=True)
# XX_train = np.memmap(TEMP_DATA_PATH + 'vectorized_XX_train_limitted_data.npy', dtype='float32', mode='w+', shape=(train_sz, feature_sz))
# Y_train = np.memmap(TEMP_DATA_PATH + 'Y_train_limitted_data.npy', dtype='int32', mode='w+', shape=(train_sz))
# train_idxs = np.array(range(train_sz))
# np.random.shuffle(train_idxs)

train_idxs, total_train_recs = vectorize(
    TEMP_DATA_PATH + 'vectorized_XX_train_limitted_data.npy', 
    TEMP_DATA_PATH + 'Y_train_limitted_data.npy', 
    transformer, 
    scaler, 
    secondary_scaler, 
    PREPROCESSED_DATA_PATH + 'preprocessed_train.jsonl',
    train_sz,
    sampled_idxs_train,
    max_chunks=MAX_CHUNKS
)

Vectorizing train set...
Total recs: 1161





In [16]:
print('Vectorizing test set...', flush=True)
# XX_test = np.memmap(TEMP_DATA_PATH + 'vectorized_XX_test_limitted_data.npy', dtype='float32', mode='w+', shape=(test_sz, feature_sz))
# Y_test = np.memmap(TEMP_DATA_PATH + 'Y_test_limitted_data.npy', dtype='int32', mode='w+', shape=(test_sz))
# test_idxs = np.array(range(test_sz))
# np.random.shuffle(test_idxs)

test_idxs, total_test_recs = vectorize(
    TEMP_DATA_PATH + 'vectorized_XX_test_limitted_data.npy', 
    TEMP_DATA_PATH + 'Y_test_limitted_data.npy', 
    transformer, 
    scaler, 
    secondary_scaler, 
    PREPROCESSED_DATA_PATH + 'preprocessed_test.jsonl',
    test_sz,
    sampled_idxs_test,
    max_chunks=MAX_CHUNKS
)

Vectorizing test set...
Total recs: 491





In [17]:
XX_train = np.memmap(TEMP_DATA_PATH + 'vectorized_XX_train_limitted_data.npy', dtype='float32', mode='r', shape=(total_train_recs, feature_sz))
Y_train = np.memmap(TEMP_DATA_PATH + 'Y_train_limitted_data.npy', dtype='int32', mode='r', shape=(total_train_recs))

XX_test = np.memmap(TEMP_DATA_PATH + 'vectorized_XX_test_limitted_data.npy', dtype='float32', mode='r', shape=(total_test_recs, feature_sz))
Y_test = np.memmap(TEMP_DATA_PATH + 'Y_test_limitted_data.npy', dtype='int32', mode='r', shape=(total_test_recs))

In [18]:
print('Training classifier...', flush=True)
clf = SGDClassifier(loss='log', alpha=0.001)
batch_size=1000
num_epochs = 50
aucs = []
for i in trange(num_epochs):
    print('Epoch - ', i)
    print('-' * 30)
    for idxs in chunker(range(total_test_recs), batch_size):
        clf.partial_fit(XX_train[idxs, :], Y_train[idxs], classes=[0, 1])

    probs = clf.predict_proba(XX_test)[:, 1]
    fpr, tpr, thresh = roc_curve(Y_test, probs)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    print('AUC: ', roc_auc)
    with open(TEMP_DATA_PATH + 'experiment_data_limitted_data.p', 'wb') as f:
        pickle.dump((
            aucs,
            clf,
            roc_auc,
            transformer, 
            scaler,
            secondary_scaler,
            feature_sz,
            train_sz,
            train_idxs,
            test_sz,
            test_idxs
        ), f)

Training classifier...


Epoch -  0
------------------------------
AUC:  0.727609830722671
Epoch -  1
------------------------------
AUC:  0.7545229305929696
Epoch -  2
------------------------------
AUC:  0.7704030729322559
Epoch -  3
------------------------------
AUC:  0.7697961355548906
Epoch -  4
------------------------------
AUC:  0.7771292693471681
Epoch -  5
------------------------------
AUC:  0.7793159067478133
Epoch -  6
------------------------------
AUC:  0.779806445176095
Epoch -  7
------------------------------
AUC:  0.7804050683428115
Epoch -  8
------------------------------
AUC:  0.7833399847018525
Epoch -  9
------------------------------
AUC:  0.783364927333799
Epoch -  10
------------------------------
AUC:  0.7835644683893711
Epoch -  11
------------------------------
AUC:  0.783938607868569
Epoch -  12
------------------------------
AUC:  0.7846619441950181
Epoch -  13
------------------------------
AUC:  0.7848697994612391
Epoch -  14
------------------------------
AUC:  0.78463700156

In [19]:
idxs

range(0, 1000)

In [19]:
go.Figure(go.Scatter(
    x=np.arange(len(aucs)),
    y=aucs
))

In [21]:
go.Figure(go.Scatter(
    x=np.arange(len(aucs)),
    y=aucs
))

In [23]:
aucs[-1]

0.952142633488032

In [20]:
with open(TEMP_DATA_PATH + 'model_limitted_data.p', 'wb') as f:
    pickle.dump((clf, transformer, scaler, secondary_scaler), f)