In [1]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict
import pickle
from plotly.offline import init_notebook_mode
import plotly.offline as py
import plotly.graph_objs as go
# Initialise plotly's offline notebook mode before any figure is created below.
init_notebook_mode(connected=True)

In [19]:
# Base directories: DATA_DIR holds the PAN20 authorship-verification inputs,
# TEMP_DATA_PATH is where derived artefacts are written.
DATA_DIR = '../data/pan/'
TEMP_DATA_PATH = '../temp_data/pan/'

# The problem file and its ground-truth companion both live in DATA_DIR.
DATA_PATH = f'{DATA_DIR}pan20-authorship-verification-training-large.jsonl'
GROUND_TRUTH_PATH = f'{DATA_DIR}pan20-authorship-verification-training-large-truth.jsonl'

In [6]:
# Map every record id to the list of author ids given by the ground-truth file.
# Both files are opened as UTF-8 explicitly: without `encoding`, open() falls
# back to the platform's locale encoding, which can fail on non-ASCII content.
id_to_authors = {}
with open(GROUND_TRUTH_PATH, 'r', encoding='utf-8') as truth_file:
    for line in truth_file:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        record = json.loads(line)
        id_to_authors[record['id']] = record['authors']

# One row per record: id, the two fandoms, the two authors and whether both
# authors are the same (the label).
data = []
with open(DATA_PATH, 'r', encoding='utf-8') as problems_file:
    for line in problems_file:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        record = json.loads(line)
        fandoms = record['fandoms']
        authors = id_to_authors[record['id']]  # looked up once per record
        data.append([
            record['id'],
            fandoms[0],
            fandoms[1],
            authors[0],
            authors[1],
            authors[0] == authors[1],
        ])

In [7]:
# Tabular view of the loaded rows; the first field of each row becomes the index.
df_columns = ['index', 'fandom1', 'fandom2', 'author1', 'author2', 'label']
df = pd.DataFrame(data, columns=df_columns).set_index('index')

In [8]:
# Histogram of how many same-author records each author contributes.
same_author_rows = df.loc[df['label'] == True]
t = same_author_rows.groupby('author1')['author1'].count()
go.Figure().add_trace(go.Histogram(x=t))

In [9]:
# Histogram of how often each author appears, in either position, among the
# different-author records.
is_diff_author = df['label'] == False
t1 = df.loc[is_diff_author, 'author1'].values
t2 = df.loc[is_diff_author, 'author2'].values

authors_in_diff_pairs = np.concatenate([t1, t2])
different_author_authors, counts = np.unique(authors_in_diff_pairs, return_counts=True)
go.Figure().add_trace(go.Histogram(x=counts))

In [10]:
# Distinct authors that occur in at least one same-author record.
same_author_mask = df['label'] == True
same_author_authors = df.loc[same_author_mask, 'author1'].unique()

In [11]:
# Report the size of both author groups and how many authors belong to both.
shared_authors = np.intersect1d(same_author_authors, different_author_authors)
print('# Same-author authors: ', len(same_author_authors))
print('# diff-author authors:', len(different_author_authors))
print('Intersection:', len(shared_authors))

# Same-author authors:  41370
# diff-author authors: 251503
Intersection: 14704


In [12]:
# Inverted index: author id -> set of record ids in which that author appears
# (as either author1 or author2).
# Iterating over the index and the two columns directly avoids
# DataFrame.iterrows(), which builds a new Series for every row and is very
# slow on a large frame; the resulting mapping is the same.
author_to_id = defaultdict(set)
for record_id, author1, author2 in zip(df.index, df['author1'], df['author2']):
    author_to_id[author1].add(record_id)
    author_to_id[author2].add(record_id)
    


In [13]:
def get_associated_authors_and_probs(author, author_to_id, id_to_authors):
    """Collect the authors and record ids linked to ``author``.

    Creates two empty sets and lets ``_get_associated_authors_and_probs`` fill
    them, starting from ``author``.

    Args:
        author: author id to start from.
        author_to_id: mapping from author id to the record ids of that author.
        id_to_authors: mapping from record id to the author ids of that record.

    Returns:
        A ``(authors, record_ids)`` tuple of lists. Both lists are built from
        sets, so their element order is unspecified.
    """
    linked_authors, linked_ids = set(), set()
    _get_associated_authors_and_probs(
        linked_authors, linked_ids, author, author_to_id, id_to_authors
    )
    return list(linked_authors), list(linked_ids)
    

def _get_associated_authors_and_probs(ret_authors, ret_idxs, author, author_to_id, id_to_authors):
    if author in ret_authors:
        return
    ret_authors.add(author)
    for idx in author_to_id[author]:
        ret_idxs.add(idx)
        for a in id_to_authors[idx]:
            if a not in ret_authors:
                _get_associated_authors_and_probs(ret_authors, ret_idxs, a, author_to_id, id_to_authors)
                ret_authors.add(a)

In [14]:
# Split the records into train / test so that no author, and nobody linked to
# that author through shared records, ends up on both sides.
SPLIT_SEED = 42        # fixed seed so that the partition is reproducible
TRAIN_FRACTION = 0.70  # probability that an unassigned author group goes to train
split_rng = np.random.RandomState(SPLIT_SEED)

train_authors = set()
test_authors = set()
train_ids = set()
test_ids = set()

for a in np.concatenate([same_author_authors, different_author_authors]):
    # An author already pulled in with an earlier author's group is settled.
    # Expanding the group again would only repeat work, so skip it before
    # drawing a random number.
    if a in train_authors or a in test_authors:
        continue
    # `authors` contains `a` itself plus everyone linked to it.
    authors, probs = get_associated_authors_and_probs(a, author_to_id, id_to_authors)
    if split_rng.rand() < TRAIN_FRACTION:
        train_authors.update(authors)
        train_ids.update(probs)
    else:
        test_authors.update(authors)
        test_ids.update(probs)

# Sanity check: recompute the author sets from the selected records themselves
# and verify that the two sides share no author.
train_rows = df.loc[list(train_ids)]
test_rows = df.loc[list(test_ids)]
train_a = np.unique(np.concatenate([train_rows['author1'].values, train_rows['author2'].values]))
test_a = np.unique(np.concatenate([test_rows['author1'].values, test_rows['author2'].values]))
assert len(np.intersect1d(train_a, test_a)) == 0, 'Train and test authors are mixed!'

# NOTE: despite the label, the printed value is the share of record ids
# (not of authors) that were assigned to train.
print('Fraction of train authors:', len(train_ids)/(len(train_ids) + len(test_ids)))

Fraction of train authors: 0.7024966785970975


In [15]:
# Select the rows of df that belong to each side of the partition.
train_df, test_df = (df.loc[list(ids)] for ids in (train_ids, test_ids))

In [16]:
# Mean of the boolean label, i.e. the share of same-author records, for (train, test).
tuple(part['label'].mean() for part in (train_df, test_df))

(0.5383319988635483, 0.5314101000243961)

In [17]:
# Number of rows of df that are in neither train_df nor test_df.
len(df) - len(train_df) - len(test_df)

0

In [21]:
# Persist the partition (record ids and author ids of both sides) as one pickled tuple.
partition = (train_ids, test_ids, train_authors, test_authors)
partition_path = TEMP_DATA_PATH + 'dataset_partition.p'
with open(partition_path, 'wb') as partition_file:
    pickle.dump(partition, partition_file)