In [266]:
import json
import random
import numpy as np
import copy
import pandas as pd
import os

# Seed Python's `random` module so the shuffles and random choices made in later cells
# are repeatable when the notebook is run top to bottom from a fresh kernel.
random.seed(8888)

In [267]:
# Load the gold and silver corpora. The context managers close each file handle as soon
# as it has been parsed instead of leaving it open until garbage collection.
with open('data/crossnews_gold.json', 'r', encoding='utf-8') as gold_file:
    gold = json.load(gold_file)
with open('data/crossnews_silver.json', 'r', encoding='utf-8') as silver_file:
    silver = json.load(silver_file)

In [268]:
# Group every document under its author: {author id -> list of that author's documents}.
gold_authors = {}
silver_authors = {}

for docs, authors in [
    (gold, gold_authors),
    (silver, silver_authors)
]:
    for doc in docs:
        author = doc['author']
        # Append in place: rebuilding the list with `+` for every document copies the
        # author's whole list each time, which is quadratic in documents per author.
        authors.setdefault(author, []).append(doc)

In [269]:
# Sanity check: number of documents and number of distinct authors in each corpus.
print(len(gold), len(gold_authors))
print(len(silver), len(silver_authors))

167404 337
1202716 5665


In [270]:
def doc_thresholds(authors, threshold, genre=None, filter=None):
    """Return the set of author ids that have at least `threshold` qualifying documents.

    Args:
        authors: dict mapping author id -> list of document dicts.
        threshold: minimum number of qualifying documents an author must have.
        genre: if given, only documents whose 'genre' equals this value qualify.
        filter: optional predicate taking a document; only documents for which it
            returns a truthy value qualify. Defaults to accepting every document.
    """
    keep = filter if filter is not None else (lambda _doc: True)
    qualifying_authors = set()
    for author_id, author_docs in authors.items():
        matching = 0
        for doc in author_docs:
            # The predicate is evaluated before the genre check, as a short-circuiting AND.
            if keep(doc) and (genre is None or doc['genre'] == genre):
                matching += 1
        if matching >= threshold:
            qualifying_authors.add(author_id)
    return qualifying_authors

In [271]:
def print_example(authors, author_id, genre=None):
    """Print the genre, character length and first 300 characters of one random document.

    The document is drawn with `random.choice` from the documents of `author_id`;
    when `genre` is given, only that author's documents of that genre are candidates.
    """
    candidates = []
    for candidate in authors[author_id]:
        if genre is None or candidate['genre'] == genre:
            candidates.append(candidate)
    chosen = random.choice(candidates)
    full_text = chosen['text']
    print('Document genre:', chosen['genre'])
    print('Document length:', len(full_text))
    print(full_text[:300])

In [272]:
def length_filter(doc, length=100):
    """Return True when the document's 'text' has at least `length` characters."""
    char_count = len(doc['text'])
    return length <= char_count

# Gold authors with at least 100 tweets of >= 100 characters, and silver authors with at
# least one article of >= 100 characters (100 is length_filter's default `length`).
gold_filtered = doc_thresholds(gold_authors, 100, genre='Tweet', filter=length_filter)
silver_filtered = doc_thresholds(silver_authors, 1, genre='Article', filter=length_filter)
print(len(gold_filtered), len(silver_filtered))

237 5614


In [273]:
def list_stats(values):
    """Format summary statistics of a sequence of numbers as a single line.

    The line contains the total (rounded to 0 decimals), the count, and the mean,
    25th/50th/75th percentiles and standard deviation (`np.std` with its default
    arguments), each rounded to 3 decimals.

    An empty sequence has no defined mean, quartiles or deviation; it is reported
    with count 0 and `nan` placeholders instead of failing inside numpy.
    """
    if len(values) == 0:
        return 'total: 0 count: 0 mean: nan quartiles: nan/nan/nan std: nan'
    # One percentile call for all three quartiles instead of three separate passes.
    q1, median, q3 = np.percentile(values, [25, 50, 75])
    return (
        f'total: {round(sum(values), 0)} count: {len(values)} '
        f'mean: {round(np.mean(values), 3)} '
        f'quartiles: {round(q1, 3)}/{round(median, 3)}/{round(q3, 3)} '
        f'std: {round(np.std(values), 3)}'
    )

In [274]:
def print_author_statistics(authors):
    """Print per-author document counts and mean document lengths for articles and tweets.

    `authors` maps an author id to that author's list of documents. An author with no
    documents of a genre is left out of that genre's statistics, so the `count`
    reported for a genre can be smaller than the number of authors.
    """
    print(f'Number of authors: {len(authors)}')
    # For each genre: one list of document character counts per contributing author.
    lengths_by_genre = {'Article': [], 'Tweet': []}
    for author_docs in authors.values():
        for genre_name, per_author_lengths in lengths_by_genre.items():
            char_counts = [len(doc['text']) for doc in author_docs if doc['genre'] == genre_name]
            if char_counts:
                per_author_lengths.append(char_counts)

    report_rows = (
        ('Articles per author: ', 'Chars per article per author: ', lengths_by_genre['Article']),
        ('Tweets per author: ', 'Chars per tweet per author: ', lengths_by_genre['Tweet']),
    )
    for count_label, chars_label, per_author_lengths in report_rows:
        print(count_label + list_stats([len(lengths) for lengths in per_author_lengths]))
        # Every stored list is non-empty, so the per-author mean is always defined.
        print(chars_label + list_stats([sum(lengths) / len(lengths) for lengths in per_author_lengths]))

In [275]:
# Per-author statistics for the gold corpus as loaded (no stacking applied).
print_author_statistics(gold_authors)

Number of authors: 337
Articles per author: total: 58563 count: 337 mean: 173.777 quartiles: 100.0/100.0/250.0 std: 160.576
Chars per article per author: total: 1149009.0 count: 337 mean: 3409.523 quartiles: 267.45/374.27/5753.864 std: 4751.575
Tweets per author: total: 108841 count: 337 mean: 322.97 quartiles: 100.0/100.0/600.0 std: 260.896
Chars per tweet per author: total: 50401.0 count: 337 mean: 149.558 quartiles: 94.492/141.16/211.67 std: 59.551


In [276]:
def stack_documents(authors, threshold, upper_char_limit=2500, method='random', is_test=False):
    """Merge each author's short documents into longer ones, separately per genre.

    For every author, articles and then tweets are processed independently. Documents
    are concatenated (joined with the literal '<new> ') until the running text reaches
    `threshold` characters. The stacked text is then cut into consecutive chunks of
    `upper_char_limit` characters and every chunk of at least `threshold` characters is
    emitted as its own document. Text beyond the last full chunk, and a trailing stack
    that never reaches `threshold`, are dropped.

    Args:
        authors: dict mapping author id -> list of document dicts ('genre', 'text').
            It is deep-copied first, so the caller's data is not modified.
        threshold: minimum character length of an emitted document.
        upper_char_limit: maximum character length of an emitted document.
        method: 'random' shuffles the documents before stacking; 'greedy' stacks the
            longest documents first; any other value keeps the existing order.
        is_test: when True only the first chunk of each stacked text is emitted.

    Returns:
        dict mapping every input author id -> list of stacked documents (possibly empty).
    """
    authors = copy.deepcopy(authors)
    stacked = {name: [] for name in authors}
    for name, original_docs in authors.items():
        genre_groups = [
            [d for d in original_docs if d['genre'] == 'Article'],
            [d for d in original_docs if d['genre'] == 'Tweet'],
        ]
        for group in genre_groups:
            if method == 'random':
                random.shuffle(group)
            elif method == 'greedy':
                group = sorted(group, key=lambda d: len(d['text']), reverse=True)

            pending = None  # document currently being grown
            for current in group:
                if pending is None:
                    pending = current
                else:
                    pending['text'] += f'<new> {current["text"]}'
                if len(pending['text']) < threshold:
                    continue  # keep stacking until the minimum length is reached

                full_text = pending['text']
                # only want to create multiple pairs from same datum in train setup
                chunk_count = 1 if is_test else max(len(full_text) // upper_char_limit, 1)
                for chunk_index in range(chunk_count):
                    chunk_start = upper_char_limit * chunk_index
                    pending['text'] = full_text[chunk_start:chunk_start + upper_char_limit]
                    if len(pending['text']) >= threshold:
                        stacked[name].append(copy.deepcopy(pending))
                pending = None
    return stacked

In [277]:
# Per-author statistics for the silver corpus as loaded (no stacking applied).
print_author_statistics(silver_authors)

Number of authors: 5665
Articles per author: total: 158984 count: 5614 mean: 28.319 quartiles: 2.0/4.0/17.0 std: 112.14
Chars per article per author: total: 42858989.0 count: 5614 mean: 7634.305 quartiles: 3158.0/5503.0/8810.329 std: 8183.977
Tweets per author: total: 1043732 count: 1925 mean: 542.198 quartiles: 500.0/600.0/600.0 std: 371.168
Chars per tweet per author: total: 236446.0 count: 1925 mean: 122.829 quartiles: 91.49/116.665/149.153 std: 42.284


In [278]:
# Silver statistics after stacking: documents are merged longest-first until each stack
# reaches 500 characters, then cut into chunks of at most 2500 characters (the default
# upper_char_limit).
print_author_statistics(stack_documents(silver_authors, 500, method='greedy'))

Number of authors: 5665
Articles per author: total: 355047 count: 5610 mean: 63.288 quartiles: 3.0/12.0/43.0 std: 216.016
Chars per article per author: total: 12344966.0 count: 5610 mean: 2200.529 quartiles: 2194.083/2481.302/2500.0 std: 530.454
Tweets per author: total: 237174 count: 1900 mean: 124.828 quartiles: 80.0/117.0/153.25 std: 101.02
Chars per tweet per author: total: 1078316.0 count: 1900 mean: 567.535 quartiles: 559.752/568.952/576.468 std: 13.842


In [279]:
# Unstacked gold statistics again (same call as earlier in the notebook) —
# presumably repeated for side-by-side comparison with the stacked gold statistics.
print_author_statistics(gold_authors)

Number of authors: 337
Articles per author: total: 58563 count: 337 mean: 173.777 quartiles: 100.0/100.0/250.0 std: 160.576
Chars per article per author: total: 1149009.0 count: 337 mean: 3409.523 quartiles: 267.45/374.27/5753.864 std: 4751.575
Tweets per author: total: 108841 count: 337 mean: 322.97 quartiles: 100.0/100.0/600.0 std: 260.896
Chars per tweet per author: total: 50401.0 count: 337 mean: 149.558 quartiles: 94.492/141.16/211.67 std: 59.551


In [280]:
# Gold statistics after stacking with is_test=True: same 500-character minimum, but only
# the first chunk (at most 2500 characters) of each stacked document is kept.
print_author_statistics(stack_documents(gold_authors, 500, method='greedy', is_test=True))

Number of authors: 337
Articles per author: total: 48219 count: 337 mean: 143.083 quartiles: 43.0/54.0/249.0 std: 175.025
Chars per article per author: total: 490332.0 count: 337 mean: 1454.992 quartiles: 625.6/690.919/2476.576 std: 893.271
Tweets per author: total: 25228 count: 337 mean: 74.861 quartiles: 34.0/41.0/115.0 std: 59.273
Chars per tweet per author: total: 194301.0 count: 337 mean: 576.561 quartiles: 564.259/575.409/588.7 std: 18.603


In [281]:
# Show one random tweet from a random author in gold_filtered and one random article from
# a random author in silver_filtered.
# NOTE: the filtered sets only select the *author*; print_example samples from all of that
# author's documents of the genre, so the printed document can be shorter than 100 characters.
print_example(gold_authors, random.choice(tuple(gold_filtered)), genre='Tweet')
print_example(silver_authors, random.choice(tuple(silver_filtered)), genre='Article')

Document genre: Tweet
Document length: 79
<PERSON> Thank you <PERSON>. Pretentious, yes. But also not a whole lot of fun.
Document genre: Article
Document length: 888
TREVOR PHILLIPS
Dark times call for plain speaking not platitudes
Ukrainians show us that sometimes offence is the best response when democracy is threatened
The Times
That thing about never meeting your heroes is mostly nonsense. I blame Proust’s À La Recherche du Temps Perdu (a book more quoted fr


In [282]:
# Stack both corpora with a 500-character minimum, longest documents first.
# Gold uses is_test=True (one chunk per stacked document); silver may yield several chunks.
stacked_golds = stack_documents(gold_authors, 500, method='greedy', is_test=True)
stacked_silvers = stack_documents(silver_authors, 500, method='greedy')

In [283]:
def create_verification_pairs(data, first_genre, second_genre):
    """Build alternating same-author / different-author text pairs.

    Documents are consumed as they are paired, so no document appears in more than one
    pair. Pairs alternate between same-author and different-author, starting with
    same-author. Generation stops when fewer than two authors remain on either side,
    or when a same-author pair is due but no author has documents left on both sides.

    Args:
        data: dict mapping author id -> list of document dicts ('genre', 'text').
            It is deep-copied first, so the caller's data is not modified.
        first_genre: genre of the first text of every pair.
        second_genre: genre of the second text of every pair; may equal `first_genre`,
            in which case both texts are drawn from one shared pool.

    Returns:
        List of 5-tuples (label, first_text, second_text, first_author, second_author),
        where label is 1 for a same-author pair and 0 for a different-author pair.
    """
    data = copy.deepcopy(data)
    same_genre = first_genre == second_genre
    first_docs = {
        author_name: [doc for doc in author_docs if doc['genre'] == first_genre]
            for author_name, author_docs in data.items()
    }
    if same_genre:
        # One shared dict: popping from either side removes the document for both.
        second_docs = first_docs
    else:
        second_docs = {
            author_name: [doc for doc in author_docs if doc['genre'] == second_genre]
                for author_name, author_docs in data.items()
        }

    # With a shared pool an author needs at least two documents to stay usable
    # (a same-author pair pops two of them); otherwise one document is enough.
    delete_threshold = 1 if same_genre else 0

    for author in data.keys():
        random.shuffle(first_docs[author])
        random.shuffle(second_docs[author])

        if len(first_docs[author]) <= delete_threshold:
            del first_docs[author]
        # When the genres match, second_docs IS first_docs, so the author may have just
        # been removed above; check membership first to avoid a KeyError.
        if author in second_docs and len(second_docs[author]) <= delete_threshold:
            del second_docs[author]

    # each pair is a 5-tuple of (label, first_text, second_text, first_author, second_author),
    # where label == 0 if different authors and 1 if same author
    pairs = []

    next_pick = 'same'
    # need at least two authors for each genre to pick pairs
    while len(first_docs) > 1 and len(second_docs) > 1:
        # pick authors for next pair
        first_author_pool = list(first_docs.keys())
        second_author_pool = list(second_docs.keys())
        if next_pick == 'diff':
            first_author = random.choice(first_author_pool)
            second_author = random.choice(second_author_pool)
            while first_author == second_author:
                second_author = random.choice(second_author_pool)
        elif next_pick == 'same':
            first_author = random.choice(first_author_pool)
            # try picking 10 random authors, if this doesn't work, then iterate through all authors
            # (membership is tested on the dict, which holds exactly the pool's authors, in O(1))
            if first_author not in second_docs:
                for _ in range(10):
                    first_author = random.choice(first_author_pool)
                    if first_author in second_docs:
                        break
            if first_author not in second_docs:
                random.shuffle(first_author_pool)
                found_same_author = False
                for author in first_author_pool:
                    if author in second_docs:
                        first_author = author
                        found_same_author = True
                        break
                if not found_same_author:
                    break # bailing, no same authors left
            second_author = first_author

        # Diagnostic only: exhausted authors are removed from the pools, so neither list
        # should be empty here; if one is, print the context before pop() fails.
        if len(first_docs[first_author]) == 0 or len(second_docs[second_author]) == 0:
            print(first_author, second_author, len(first_docs[first_author]), len(second_docs[second_author]))
        pairs.append((1 if next_pick == 'same' else 0, first_docs[first_author].pop()['text'], second_docs[second_author].pop()['text'], first_author, second_author))

        if len(first_docs[first_author]) <= delete_threshold:
            del first_docs[first_author]
        # Skip the second check when both sides refer to the same entry of the shared
        # pool: that entry was already handled (and possibly deleted) just above.
        if (not same_genre or first_author != second_author) and len(second_docs[second_author]) <= delete_threshold:
            del second_docs[second_author]

        # alternate pair type
        next_pick = 'diff' if next_pick == 'same' else 'same'

    return pairs

In [284]:
def print_pair_stats(pairs, save=None):
    """Print summary statistics for verification pairs and optionally save them as CSV.

    Args:
        pairs: sequence of (label, first_text, second_text, first_author, second_author)
            tuples. May be empty, in which case the averages are reported as nan.
        save: optional output path. When truthy, the label and the two texts of every
            pair are written to this path as CSV with columns label, text0, text1
            (author ids are not saved).
    """
    def _mean(numbers):
        # nan instead of a ZeroDivisionError when there are no pairs to average over.
        return sum(numbers) / len(numbers) if numbers else float('nan')

    print(f'Total pairs: {len(pairs)}; same-pair percent: {_mean([pair[0] for pair in pairs])}')

    authors = set()
    first_genre_lengths, second_genre_lengths = [], []
    for pair in pairs:
        first_length, second_length = len(pair[1]), len(pair[2])
        first_author, second_author = pair[3], pair[4]
        authors.add(first_author)
        authors.add(second_author)
        first_genre_lengths.append(first_length)
        second_genre_lengths.append(second_length)

    print(f'Num authors: {len(authors)}')
    print(f'Avg. chars per first genre: {_mean(first_genre_lengths)}')
    print(f'Avg. chars per second genre: {_mean(second_genre_lengths)}')

    if save:
        print(f'Saving to {save}.')
        columns = ['label', 'text0', 'text1']
        df = pd.DataFrame([(pair[0], pair[1], pair[2]) for pair in pairs], columns=columns)
        df.to_csv(save, index=False)
        

In [285]:
# Test pairs come from the stacked gold corpus in three genre settings
# (Article-Tweet, Article-Article, Tweet-Tweet); training pairs (Article-Tweet) come
# from the stacked silver corpus. Each call draws from the global `random` state, so
# the results depend on the order in which these calls are executed.
test_pairs_Article_X = create_verification_pairs(stacked_golds, 'Article', 'Tweet')
test_pairs_Article_Article = create_verification_pairs(stacked_golds, 'Article', 'Article')
test_pairs_X_X = create_verification_pairs(stacked_golds, 'Tweet', 'Tweet')
train_pairs = create_verification_pairs(stacked_silvers, 'Article', 'Tweet')

In [286]:
# Create the output directory if it does not exist yet, then report and save the
# Article-Tweet test pairs.
os.makedirs('pairs', exist_ok=True)
print_pair_stats(test_pairs_Article_X, save='pairs/test_Article_X.csv')

Total pairs: 25124; same-pair percent: 0.5
Num authors: 337
Avg. chars per first genre: 1843.932375417927
Avg. chars per second genre: 574.1184126731412
Saving to pairs/test_Article_X.csv.


In [287]:
# Report and save the Article-Article test pairs (requires the 'pairs' directory to exist).
print_pair_stats(test_pairs_Article_Article, save='pairs/test_Article_Article.csv')

Total pairs: 22952; same-pair percent: 0.5
Num authors: 337
Avg. chars per first genre: 2101.9971244336007
Avg. chars per second genre: 2100.6907023353083
Saving to pairs/test_Article_Article.csv.


In [288]:
# Report and save the Tweet-Tweet test pairs (requires the 'pairs' directory to exist).
print_pair_stats(test_pairs_X_X, save='pairs/test_X_X.csv')

Total pairs: 12436; same-pair percent: 0.5
Num authors: 337
Avg. chars per first genre: 574.2765358636217
Avg. chars per second genre: 573.853168221293
Saving to pairs/test_X_X.csv.


In [289]:
# Report and save the Article-Tweet training pairs (requires the 'pairs' directory to exist).
print_pair_stats(train_pairs, save='pairs/train_Article_X.csv')

Total pairs: 104482; same-pair percent: 0.5
Num authors: 5662
Avg. chars per first genre: 2313.8631056067074
Avg. chars per second genre: 569.3298175762333
Saving to pairs/train_Article_X.csv.
