In [2]:
import json
import random
import numpy as np
import copy
import pandas as pd
import os

random.seed(8888)

In [3]:
gold = json.load(open('data/crossnews_gold.json', 'r', encoding='utf-8'))
silver = json.load(open('data/crossnews_silver.json', 'r', encoding='utf-8'))

In [4]:
gold_authors = {}
silver_authors = {}

for docs, authors in [
    (gold, gold_authors),
    (silver, silver_authors)
]:
    for doc in docs:
        author = doc['author']
        authors[author] = authors.get(author, []) + [doc]

In [5]:
print(len(gold), len(gold_authors))
print(len(silver), len(silver_authors))

167404 337
1202716 5665


In [6]:
def doc_thresholds(authors, threshold, genre=None, filter=None):
    result = set()
    if filter is None:
        filter = lambda x: True
    for author_id, author_docs in authors.items():
        doc_num = sum([1 if filter(doc) and (genre is None or doc['genre'] == genre) else 0 for doc in author_docs])
        if doc_num >= threshold:
            result.add(author_id)
    return result

In [7]:
def print_example(authors, author_id, genre=None):
    author_docs = [doc for doc in authors[author_id] if genre is None or doc['genre'] == genre]
    doc = random.choice(author_docs)
    print('Document genre:', doc['genre'])
    print('Document length:', len(doc['text']))
    print(doc['text'][:300])

In [8]:
def length_filter(doc, length=100):
    return len(doc['text']) >= length

gold_filtered = doc_thresholds(gold_authors, 100, genre='Tweet', filter=length_filter)
silver_filtered = doc_thresholds(silver_authors, 1, genre='Article', filter=length_filter)
print(len(gold_filtered), len(silver_filtered))

237 5614


In [9]:
def list_stats(values):
    return f'total: {round(sum(values), 0)} count: {len(values)} mean: {round(np.mean(values), 3)} quartiles: {round(np.percentile(values, 25), 3)}/{round(np.percentile(values, 50), 3)}/{round(np.percentile(values, 75), 3)} std: {round(np.std(values), 3)}'

In [10]:
def print_author_statistics(authors):
    print(f'Number of authors: {len(authors)}')
    articles, tweets = [], []
    for author in authors.values():
        if len([len(doc['text']) for doc in author if doc['genre'] == 'Article']) > 0:
            articles.append([len(doc['text']) for doc in author if doc['genre'] == 'Article'])
        if len([len(doc['text']) for doc in author if doc['genre'] == 'Tweet']) > 0:
            tweets.append([len(doc['text']) for doc in author if doc['genre'] == 'Tweet'])
        
    print('Articles per author: ' + list_stats([len(x) for x in articles]))
    print('Chars per article per author: ' + list_stats([sum(x) / len(x) if len(x) > 0 else 0 for x in articles]))
    print('Tweets per author: ' + list_stats([len(x) for x in tweets]))
    print('Chars per tweet per author: ' + list_stats([sum(x) / len(x) if len(x) > 0 else 0 for x in tweets]))

In [11]:
print_author_statistics(gold_authors)

Number of authors: 337
Articles per author: total: 58563 count: 337 mean: 173.777 quartiles: 100.0/100.0/250.0 std: 160.576
Chars per article per author: total: 1149009.0 count: 337 mean: 3409.523 quartiles: 267.45/374.27/5753.864 std: 4751.575
Tweets per author: total: 108841 count: 337 mean: 322.97 quartiles: 100.0/100.0/600.0 std: 260.896
Chars per tweet per author: total: 50401.0 count: 337 mean: 149.558 quartiles: 94.492/141.16/211.67 std: 59.551


In [12]:
def stack_documents(authors, threshold, method='random'):
    authors = copy.deepcopy(authors)
    new_authors = {author: [] for author in authors.keys()}
    for author, old_docs in authors.items():
        articles = [doc for doc in old_docs if doc['genre'] == 'Article']
        tweets = [doc for doc in old_docs if doc['genre'] == 'Tweet']
        
        for docs in [articles, tweets]:
            if method == 'random':
                random.shuffle(docs)
            elif method == 'greedy':
                docs = sorted(docs, key=lambda x: len(x['text']), reverse=True)
            new_doc = None
            for doc in docs:
                if new_doc is None:
                    new_doc = doc
                else:
                    new_doc['text'] += f'<new> {doc["text"]}'
                if len(new_doc['text']) >= threshold:
                    new_authors[author].append(new_doc)
                    new_doc = None
    return new_authors

In [13]:
print_author_statistics(silver_authors)

Number of authors: 5665
Articles per author: total: 158984 count: 5614 mean: 28.319 quartiles: 2.0/4.0/17.0 std: 112.14
Chars per article per author: total: 42858989.0 count: 5614 mean: 7634.305 quartiles: 3158.0/5503.0/8810.329 std: 8183.977
Tweets per author: total: 1043732 count: 1925 mean: 542.198 quartiles: 500.0/600.0/600.0 std: 371.168
Chars per tweet per author: total: 236446.0 count: 1925 mean: 122.829 quartiles: 91.49/116.665/149.153 std: 42.284


In [14]:
print_author_statistics(stack_documents(silver_authors, 500, method='greedy'))

Number of authors: 5665
Articles per author: total: 158736 count: 5610 mean: 28.295 quartiles: 2.0/4.0/17.0 std: 111.949
Chars per article per author: total: 42895067.0 count: 5610 mean: 7646.179 quartiles: 3167.466/5512.055/8825.95 std: 8184.857
Tweets per author: total: 237174 count: 1900 mean: 124.828 quartiles: 80.0/117.0/153.25 std: 101.02
Chars per tweet per author: total: 1078316.0 count: 1900 mean: 567.535 quartiles: 559.752/568.952/576.468 std: 13.842


In [15]:
print_author_statistics(gold_authors)

Number of authors: 337
Articles per author: total: 58563 count: 337 mean: 173.777 quartiles: 100.0/100.0/250.0 std: 160.576
Chars per article per author: total: 1149009.0 count: 337 mean: 3409.523 quartiles: 267.45/374.27/5753.864 std: 4751.575
Tweets per author: total: 108841 count: 337 mean: 322.97 quartiles: 100.0/100.0/600.0 std: 260.896
Chars per tweet per author: total: 50401.0 count: 337 mean: 149.558 quartiles: 94.492/141.16/211.67 std: 59.551


In [16]:
print_author_statistics(stack_documents(gold_authors, 500, method='greedy'))

Number of authors: 337
Articles per author: total: 48219 count: 337 mean: 143.083 quartiles: 43.0/54.0/249.0 std: 175.025
Chars per article per author: total: 1221457.0 count: 337 mean: 3624.501 quartiles: 625.6/690.919/5880.651 std: 4646.772
Tweets per author: total: 25228 count: 337 mean: 74.861 quartiles: 34.0/41.0/115.0 std: 59.273
Chars per tweet per author: total: 194301.0 count: 337 mean: 576.561 quartiles: 564.259/575.409/588.7 std: 18.603


In [17]:
# print_example(gold_authors, random.choice(tuple(gold_filtered)), genre='Tweet')
print_example(silver_authors, random.choice(tuple(silver_filtered)), genre='Article')

Document genre: Article
Document length: 7362
The commander of the scandal-plagued California National Guard steps down
The head of the California National Guard, who has presided over a series of scandals during the last 3½ years, will retire at the end of the month, Gov. <PERSON>’s office confirmed Monday.
Maj. Gen. <PERSON>’s departure comes


In [18]:
stacked_golds = stack_documents(gold_authors, 500, method='greedy')
stacked_silvers = stack_documents(silver_authors, 500, method='greedy')

In [19]:
def create_verification_pairs(data, first_genre, second_genre, add_imbalanced=False):
    data = copy.deepcopy(data)
    first_docs = {
        author_name: [doc for doc in author_docs if doc['genre'] == first_genre]
            for author_name, author_docs in data.items()
    }
    if first_genre == second_genre:
        second_docs = first_docs
    else:
        second_docs = {
            author_name: [doc for doc in author_docs if doc['genre'] == second_genre]
                for author_name, author_docs in data.items()
        }
        
    overflow_first, overflow_second = [], []
    
    for author in data.keys():
        random.shuffle(first_docs[author])
        random.shuffle(second_docs[author])
        
        if len(first_docs[author]) == 0 or len(second_docs[author]) == 0:
            overflow_first.extend(first_docs[author])
            overflow_second.extend(second_docs[author])
            
            del first_docs[author]
            del second_docs[author]
    
    # each pair is a 3-tuple of (label, first_text, second_text), where label == 0 if different authors and 1 if same author
    pairs = []
   
    next_pick = 'same'
    while len(first_docs) > 1:
        # pick authors for next pair
        
        
        first_author = sorted(list(first_docs.keys()), key=lambda author: len(first_docs[author]), reverse=True)[0]
        second_author = first_author
        if next_pick == 'diff':
            # second_author = sorted(list(first_docs.keys()), key=lambda author: len(first_docs[author]), reverse=True)[1]
            while second_author == first_author:
                second_author = random.choice(list(first_docs.keys()))
        
        if len(first_docs[first_author]) == 0 or len(second_docs[second_author]) == 0:
            print(first_author, second_author, len(first_docs[first_author]), len(second_docs[second_author]))
        pairs.append((1 if next_pick == 'same' else 0, first_docs[first_author].pop()['text'], second_docs[second_author].pop()['text']))
        
        delete_threshold = 0 if first_genre != second_genre else 1
        
        # now, if either list is empty (or list has one element and first_genre == second_genre), delete from both docs dicts
        if len(first_docs[first_author]) <= delete_threshold:
            del first_docs[first_author]
            if first_genre != second_genre:
                overflow_second.extend(second_docs[first_author])
                del second_docs[first_author]
        if second_author in first_docs and len(first_docs[second_author]) <= delete_threshold:
            del second_docs[second_author]
            if first_genre != second_genre:
                overflow_first.extend(first_docs[second_author])
                del first_docs[second_author]
        if first_author in second_docs and len(second_docs[first_author]) <= delete_threshold:
            del first_docs[first_author]
            if first_genre != second_genre:
                overflow_second.extend(second_docs[first_author])
                del second_docs[first_author]
        if second_author in second_docs and len(second_docs[second_author]) <= delete_threshold:
            del second_docs[second_author]
            if first_genre != second_genre:
                overflow_first.extend(first_docs[second_author])
                del first_docs[second_author]
                
        # alternate pair type
        next_pick = 'diff' if next_pick == 'same' else 'same'
        
    if add_imbalanced and first_genre != second_genre:
        # guaranteed to be diff pairs b/c overflow is added when length of one genre == 0
        random.shuffle(overflow_first)
        random.shuffle(overflow_second)
        for i in range(min(len(overflow_first), len(overflow_second))):
            pairs.append((0, overflow_first[i]['text'], overflow_second[i]['text']))
    
    return pairs

In [20]:
def print_pair_stats(pairs, save=None):
    print(f'Total pairs: {len(pairs)}; same-pair percent: {sum([pair[0] for pair in pairs]) / len(pairs)}')
    if save:
        print(f'Saving to {save}.')
        columns = ['label', 'text0', 'text1']
        df = pd.DataFrame(pairs, columns=columns)
        df.to_csv(save, index=False)
        

In [21]:
os.makedirs('pairs')
pairs = create_verification_pairs(stacked_golds, 'Tweet', 'Tweet')
print_pair_stats(pairs, save='pairs/test_X_X.csv')

Total pairs: 12470; same-pair percent: 0.5
Saving to pairs/test_X_X.csv.


In [22]:
pairs = create_verification_pairs(stacked_golds, 'Article', 'Tweet')
print_pair_stats(pairs, save='pairs/test_Article_X.csv')

Total pairs: 24675; same-pair percent: 0.5000202634245188
Saving to pairs/test_Article_X.csv.


In [23]:
pairs = create_verification_pairs(stacked_golds, 'Article', 'Article')
print_pair_stats(pairs, save='pairs/test_Article_Article.csv')

Total pairs: 23968; same-pair percent: 0.5
Saving to pairs/test_Article_Article.csv.


In [24]:
pairs = create_verification_pairs(stacked_silvers, 'Article', 'Tweet')
print_pair_stats(pairs, save='pairs/train_Article_X.csv')

Total pairs: 51859; same-pair percent: 0.5000096415279893
Saving to pairs/train_Article_X.csv.


In [25]:
pairs = create_verification_pairs(stacked_silvers, 'Article', 'Tweet', add_imbalanced=True)
print_pair_stats(pairs, save='pairs/train_Article_X_imbalanced.csv')

Total pairs: 127973; same-pair percent: 0.20340228016847303
Saving to pairs/train_Article_X_imbalanced.csv.
