In [3]:
import json
import random
import numpy as np
import copy
import pandas as pd
import os
from tqdm import tqdm

# Seed Python's global `random` generator so that shuffles and random choices
# made with the `random` module are repeatable across runs.
random.seed(8888)

In [4]:
# Load the two corpora. Each file is opened in a `with` block so the handle is
# closed as soon as parsing finishes instead of being left to the garbage collector.
with open('data/crossnews_gold.json', 'r', encoding='utf-8') as gold_file:
    gold = json.load(gold_file)
with open('data/crossnews_silver.json', 'r', encoding='utf-8') as silver_file:
    silver = json.load(silver_file)

In [5]:
# Group each corpus by author: author -> list of that author's documents,
# in the order the documents appear in the corpus.
gold_authors = {}
silver_authors = {}

for docs, authors in [
    (gold, gold_authors),
    (silver, silver_authors)
]:
    for doc in docs:
        author = doc['author']
        # Append in place. Rebuilding the list with `+ [doc]` for every document
        # copies the author's whole list each time, which is quadratic per author.
        authors.setdefault(author, []).append(doc)

In [6]:
# For each corpus, print the number of documents followed by the number of distinct authors.
print(len(gold), len(gold_authors))
print(len(silver), len(silver_authors))

221420 500
1258077 2260


In [7]:
def list_stats(values):
    """Summarise a sequence of numbers as one human-readable line.

    The line reports the total (rounded to 0 decimals), the number of values,
    the mean, the 25th/50th/75th percentiles and the standard deviation as
    computed by ``np.std``; everything except the total is rounded to 3 decimals.

    Args:
        values: a sized sequence of numbers (it is passed to ``sum``, ``len``
            and several numpy reductions).

    Returns:
        The formatted summary string.
    """
    total = round(sum(values), 0)
    mean = round(np.mean(values), 3)
    q1, median, q3 = [round(np.percentile(values, q), 3) for q in (25, 50, 75)]
    spread = round(np.std(values), 3)
    return (
        f'total: {total} count: {len(values)} mean: {mean} '
        f'quartiles: {q1}/{median}/{q3} std: {spread}'
    )

In [8]:
def print_author_statistics(authors):
    """Print per-author document counts and mean text lengths for each genre.

    Args:
        authors: mapping of author -> list of document dicts. Each document is
            read through its 'genre' key (compared against 'Article' and
            'Tweet') and its 'text' key (measured with ``len``).

    An author only contributes to a genre's statistics when they have at least
    one document of that genre, so the "count" reported for a genre can be
    smaller than the number of authors.
    """
    print(f'Number of authors: {len(authors)}')

    # genre -> one list of text lengths per author that has documents of that genre
    lengths_by_genre = {'Article': [], 'Tweet': []}
    for author_docs in authors.values():
        for genre, per_author in lengths_by_genre.items():
            char_counts = [len(doc['text']) for doc in author_docs if doc['genre'] == genre]
            if char_counts:
                per_author.append(char_counts)

    articles = lengths_by_genre['Article']
    tweets = lengths_by_genre['Tweet']

    # Every stored list is non-empty, so the per-author mean never divides by zero.
    print('Articles per author: ' + list_stats([len(lengths) for lengths in articles]))
    print('Chars per article per author: ' + list_stats([sum(lengths) / len(lengths) for lengths in articles]))
    print('Tweets per author: ' + list_stats([len(lengths) for lengths in tweets]))
    print('Chars per tweet per author: ' + list_stats([sum(lengths) / len(lengths) for lengths in tweets]))

In [61]:
def stack_documents(authors, min_char_threshold, upper_char_limit=5000, method='random', is_train=False):
    """Concatenate each author's short documents into longer "stacked" documents.

    For every author, 'Article' and 'Tweet' documents are processed separately
    (documents of any other genre are ignored). Documents are joined with
    ' <new> ' until the running text reaches ``min_char_threshold`` characters;
    the stack is then emitted and a new one is started. A trailing stack that
    never reaches the threshold is dropped.

    Args:
        authors: mapping of author -> list of document dicts with at least the
            'genre' and 'text' keys. The input is not modified.
        min_char_threshold: minimum number of characters an emitted document must have.
        upper_char_limit: maximum number of characters of an emitted document.
        method: 'random' shuffles each genre's documents with the global
            ``random`` generator before stacking; 'greedy' stacks them from
            longest to shortest text; any other value keeps the input order.
        is_train: when False, each stack yields one document truncated to
            ``upper_char_limit`` characters. When True, a stack of length L is
            split into ``max(L // upper_char_limit, 1)`` consecutive chunks of
            ``upper_char_limit`` characters (a trailing partial chunk beyond the
            last full one is discarded).

    Returns:
        Mapping of author -> list of new document dicts (articles first, then
        tweets). Each emitted document is a deep copy of the first document of
        its stack with 'text' replaced. When a stack is split into several
        chunks, the first chunk keeps the original 'id' and chunk ``i >= 1``
        gets the id ``'<original id>#<i>'`` so that ids stay unique; previously
        all chunks shared one id, which made id-keyed lookups collapse them
        into a single document.
    """
    new_authors = {author: [] for author in authors}
    for author, old_docs in authors.items():
        articles = [doc for doc in old_docs if doc['genre'] == 'Article']
        tweets = [doc for doc in old_docs if doc['genre'] == 'Tweet']

        for docs in [articles, tweets]:
            # `docs` is a fresh list, so reordering it never touches the caller's lists.
            if method == 'random':
                random.shuffle(docs)
            elif method == 'greedy':
                docs = sorted(docs, key=lambda x: len(x['text']), reverse=True)

            # The stacked text is accumulated in a local string instead of being
            # written into the input document, so no up-front deep copy is needed.
            seed_doc, stacked_text = None, ''
            for doc in docs:
                if seed_doc is None:
                    seed_doc, stacked_text = doc, doc['text']
                else:
                    stacked_text += f' <new> {doc["text"]}'

                if len(stacked_text) < min_char_threshold:
                    continue

                # Training data may be split into several upper_char_limit-sized documents.
                n_chunks = max(len(stacked_text) // upper_char_limit, 1) if is_train else 1
                for i in range(n_chunks):
                    chunk = stacked_text[upper_char_limit * i:upper_char_limit * (i + 1)]
                    if len(chunk) >= min_char_threshold:
                        chunk_doc = copy.deepcopy(seed_doc)
                        chunk_doc['text'] = chunk
                        if i > 0 and 'id' in chunk_doc:
                            # '#' is used because pair ids are later split on '_'.
                            chunk_doc['id'] = f'{seed_doc["id"]}#{i}'
                        new_authors[author].append(chunk_doc)
                seed_doc = None
    return new_authors

In [10]:
# Statistics for the silver corpus as loaded (documents grouped by author, not stacked).
print_author_statistics(silver_authors)

Number of authors: 2260
Articles per author: total: 107184 count: 2260 mean: 47.427 quartiles: 2.0/6.0/27.0 std: 191.738
Chars per article per author: total: 15776807.0 count: 2260 mean: 6980.888 quartiles: 3698.479/5743.333/8416.25 std: 5959.486
Tweets per author: total: 1150893 count: 2260 mean: 509.245 quartiles: 500.0/599.0/600.0 std: 303.773
Chars per tweet per author: total: 303510.0 count: 2260 mean: 134.296 quartiles: 102.865/128.322/160.662 std: 42.124


In [11]:
# Statistics for the silver corpus after greedy stacking with a 500-character
# minimum and is_train=True.
print_author_statistics(stack_documents(silver_authors, 500, method='greedy', is_train=True))

Number of authors: 2260
Articles per author: total: 136061 count: 2259 mean: 60.231 quartiles: 2.0/8.0/36.0 std: 231.529
Chars per article per author: total: 8925082.0 count: 2259 mean: 3950.9 quartiles: 3304.036/4446.889/4938.791 std: 1198.798
Tweets per author: total: 279677 count: 2235 mean: 125.135 quartiles: 88.0/121.0/156.5 std: 84.547
Chars per tweet per author: total: 1276488.0 count: 2235 mean: 571.135 quartiles: 563.406/572.097/579.425 std: 13.599


In [12]:
# Statistics for the gold corpus as loaded (documents grouped by author, not stacked).
print_author_statistics(gold_authors)

Number of authors: 500
Articles per author: total: 96743 count: 500 mean: 193.486 quartiles: 100.0/245.0/250.0 std: 136.588
Chars per article per author: total: 1554626.0 count: 500 mean: 3109.252 quartiles: 298.642/2148.323/4298.092 std: 3960.874
Tweets per author: total: 124677 count: 500 mean: 249.354 quartiles: 100.0/100.0/599.0 std: 238.977
Chars per tweet per author: total: 79036.0 count: 500 mean: 158.072 quartiles: 110.446/148.33/206.921 std: 57.186


In [13]:
# Statistics for the gold corpus after greedy stacking with a 500-character
# minimum (is_train left at its default of False).
print_author_statistics(stack_documents(gold_authors, 500, method='greedy'))

Number of authors: 500
Articles per author: total: 86632 count: 500 mean: 173.264 quartiles: 46.75/242.0/249.0 std: 150.648
Chars per article per author: total: 1147242.0 count: 500 mean: 2294.484 quartiles: 651.54/2152.378/3549.734 std: 1503.377
Tweets per author: total: 31407 count: 500 mean: 62.814 quartiles: 24.0/37.5/96.25 std: 58.092
Chars per tweet per author: total: 288086.0 count: 500 mean: 576.173 quartiles: 566.254/575.862/586.281 std: 15.773


In [62]:
# Build the stacked corpora used for pair generation below: both use greedy
# stacking with a 500-character minimum; only the silver corpus is stacked
# with is_train=True.
stacked_golds = stack_documents(gold_authors, 500, method='greedy')
stacked_silvers = stack_documents(silver_authors, 500, method='greedy', is_train=True)

In [15]:
# Sanity check on the stacked silver corpus.
# Per author, collect the ids of the stacked articles; print the number of
# authors and the total number of articles.
ss = list(stacked_silvers.values())
articles = [[doc['id'] for doc in auth_docs if doc['genre'] == 'Article'] for auth_docs in ss]
print(len(articles))
print(sum([len(x) for x in articles]))

# Same for tweets.
tweets = [[doc['id'] for doc in auth_docs if doc['genre'] == 'Tweet'] for auth_docs in ss]
print(len(tweets))
print(sum([len(x) for x in tweets]))

# Per author: the smaller of the article count and the tweet count, capped at 100.
# Print the sum over authors, then the number of authors for whom it is at least 1.
a = [min(len(x), len(y), 100) for x, y in zip(articles, tweets)]
print(sum(a))
print(sum([1 if x >= 1 else 0 for x in a]))

2260
136061
2260
279677
51878
2234


In [75]:
def generate_pair_ids(first_list, second_list, max_docs_per_author=100):
    """Build same-author and different-author pair ids from two id collections.

    Args:
        first_list: mapping of author -> list of document ids (first element of
            every pair).
        second_list: mapping of author -> list of document ids (second element
            of every pair). It must contain every author of ``first_list``.
        max_docs_per_author: cap on the number of same-author pairs per author.

    Returns:
        A list of strings formatted as '{same}_{first_id}_{second_id}', where
        ``same`` is 1 for a same-author pair and 0 otherwise. Same-author pairs
        come first: for each author the i-th first-list id is paired with the
        i-th second-list id. Then every used first-list id is paired once with
        a randomly chosen, not yet reused, second-list id of a different author.

    If, for some first-list id, every second-list id still available belongs to
    that id's own author, no different-author pair can be formed; that id is
    skipped and a message with the number of skipped ids is printed. The
    previous rejection-sampling loop never terminated in this situation.
    """
    pairs = []
    used_articles = []
    used_tweets = []
    id_to_auth = {}

    def to_pair_id(s1, s2):
        same = 1 if id_to_auth[s1] == id_to_auth[s2] else 0
        return f'{same}_{s1}_{s2}'

    for auth in first_list.keys():
        articles = first_list[auth]
        tweets = second_list[auth]
        for i in range(min(len(articles), len(tweets), max_docs_per_author)):
            used_articles.append(articles[i])
            used_tweets.append(tweets[i])
            id_to_auth[articles[i]] = auth
            id_to_auth[tweets[i]] = auth
            pairs.append(to_pair_id(articles[i], tweets[i]))

    # Number of still-available second-list ids per author, used to detect in
    # O(1) whether a different-author candidate exists at all.
    remaining_per_author = {}
    for tweet in used_tweets:
        tweet_auth = id_to_auth[tweet]
        remaining_per_author[tweet_auth] = remaining_per_author.get(tweet_auth, 0) + 1

    skipped = 0
    for _ in tqdm(range(len(used_articles))):
        article = used_articles.pop()
        article_auth = id_to_auth[article]
        if len(used_tweets) == remaining_per_author.get(article_auth, 0):
            # Every remaining candidate has the same author: sampling could never succeed.
            skipped += 1
            continue
        # Rejection sampling; guaranteed to terminate because a candidate exists.
        tweet = random.choice(used_tweets)
        while id_to_auth[tweet] == article_auth:
            tweet = random.choice(used_tweets)
        used_tweets.remove(tweet)
        remaining_per_author[id_to_auth[tweet]] -= 1
        pairs.append(to_pair_id(article, tweet))

    if skipped:
        print(f'Skipped {skipped} different-author pair(s): no document by another author was left.')

    return pairs
     

In [76]:
def get_pair_entries(pair_ids, data):
    """Resolve pair ids into (label, text, text, author, author) tuples.

    Args:
        pair_ids: list of strings formatted as '{same}_{first_id}_{second_id}'.
            Only the first three '_'-separated fields are used, so document ids
            must not contain '_' themselves. The list is not modified; a
            shuffled copy is used (previously the caller's list was shuffled
            in place).
        data: mapping of author -> list of document dicts with 'id', 'text'
            and 'author' keys. Documents are looked up by ``str(doc['id'])``;
            if several documents share an id, the last one seen wins.

    Returns:
        A list with one tuple ``(same, first_text, second_text, first_author,
        second_author)`` per pair id, in an order shuffled with the global
        ``random`` generator; ``same`` is converted to int.

    Raises:
        KeyError: if a pair refers to an id that is not present in ``data``.
    """
    id_to_doc = {}
    for author_docs in data.values():
        id_to_doc.update({str(doc['id']): doc for doc in author_docs})

    # Shuffle a copy: same permutation as shuffling in place, without the side effect.
    shuffled_ids = list(pair_ids)
    random.shuffle(shuffled_ids)

    pairs = []
    for pair in shuffled_ids:
        same, first_id, second_id = pair.split('_')[:3]
        a = id_to_doc[first_id]
        b = id_to_doc[second_id]
        pairs.append((int(same), a['text'], b['text'], a['author'], b['author']))

    return pairs

In [77]:
def print_pair_stats(pairs, save=None):
    """Print summary statistics for a list of pairs and optionally save them as CSV.

    Args:
        pairs: non-empty list of tuples laid out as
            ``(label, first_text, second_text, first_author, second_author)``.
        save: optional output path. When truthy, the label and the two texts of
            every pair are written to that path as CSV with the columns
            'label', 'text0' and 'text1' (no index column).

    Prints the number of pairs, the mean label, the number of distinct authors
    over both pair positions and the mean character length of each position.
    """
    n_pairs = len(pairs)
    same_fraction = sum(pair[0] for pair in pairs) / n_pairs
    print(f'Total pairs: {n_pairs}; same-pair percent: {same_fraction}')

    distinct_authors = {author for pair in pairs for author in (pair[3], pair[4])}
    first_total = sum(len(pair[1]) for pair in pairs)
    second_total = sum(len(pair[2]) for pair in pairs)

    print(f'Num authors: {len(distinct_authors)}')
    print(f'Avg. chars per first genre: {first_total / n_pairs}')
    print(f'Avg. chars per second genre: {second_total / n_pairs}')

    if not save:
        return

    print(f'Saving to {save}.')
    rows = [(pair[0], pair[1], pair[2]) for pair in pairs]
    pd.DataFrame(rows, columns=['label', 'text0', 'text1']).to_csv(save, index=False)
        

In [90]:
def create_verification_pairs(data, first_genre, second_genre, seed, max_docs_per_author):
    """Create shuffled verification pairs for one genre combination.

    Args:
        data: mapping of author -> list of document dicts with 'id' and 'genre'
            keys (plus whatever ``get_pair_entries`` reads).
        first_genre, second_genre: genre names. When they differ, articles form
            the first side and tweets the second side. When they are equal, the
            ids of that single genre ('Article', otherwise tweets) are split
            into two disjoint halves per author; authors with fewer than two
            documents of the genre are left out.
        seed: value passed to ``random.seed`` before any shuffling; note that
            this reseeds the global ``random`` generator.
        max_docs_per_author: forwarded to ``generate_pair_ids``.

    Returns:
        The result of ``get_pair_entries`` for the generated pair ids.
    """
    def ids_by_author(genre):
        # author -> string ids of that author's documents of the given genre
        return {
            author: [str(doc['id']) for doc in docs if doc['genre'] == genre]
            for author, docs in data.items()
        }

    article_ids = ids_by_author('Article')
    tweet_ids = ids_by_author('Tweet')

    # All shuffling happens here, up front, rather than at sampling time.
    random.seed(seed)
    for author in article_ids:
        random.shuffle(article_ids[author])
        random.shuffle(tweet_ids[author])

    if first_genre == second_genre:
        # Article & Article or Tweet & Tweet: split one genre into two distinct dicts.
        pool = article_ids if first_genre == 'Article' else tweet_ids
        first_docs, second_docs = {}, {}
        for author, ids in pool.items():
            if len(ids) < 2:
                continue
            midpoint = len(ids) // 2
            first_docs[author] = ids[midpoint:]
            second_docs[author] = ids[:midpoint]
    else:
        # Article & Tweet
        first_docs, second_docs = article_ids, tweet_ids

    pair_ids = generate_pair_ids(first_docs, second_docs, max_docs_per_author=max_docs_per_author)
    return get_pair_entries(pair_ids, data)
        

In [108]:
# Build the six pair sets, one per (split, genre combination).
# The stacked gold corpus yields the test pairs (at most 5 same-author pairs per
# author); the stacked silver corpus yields the train pairs (at most 100).
# Each call passes its own seed, which create_verification_pairs feeds to random.seed.
test_pairs_Article_Tweet = create_verification_pairs(stacked_golds, 'Article', 'Tweet', 111, 5)
test_pairs_Article_Article = create_verification_pairs(stacked_golds, 'Article', 'Article', 222, 5)
test_pairs_Tweet_Tweet = create_verification_pairs(stacked_golds, 'Tweet', 'Tweet', 333, 5)
train_pairs_Article_Tweet = create_verification_pairs(stacked_silvers, 'Article', 'Tweet', 444, 100)
train_pairs_Article_Article = create_verification_pairs(stacked_silvers, 'Article', 'Article', 555, 100)
train_pairs_Tweet_Tweet = create_verification_pairs(stacked_silvers, 'Tweet', 'Tweet', 666, 100)

  0%|          | 0/2500 [00:00<?, ?it/s]

100%|██████████| 2500/2500 [00:00<00:00, 171249.20it/s]
100%|██████████| 2500/2500 [00:00<00:00, 202996.03it/s]
100%|██████████| 2500/2500 [00:00<00:00, 154003.05it/s]
100%|██████████| 51911/51911 [00:07<00:00, 7035.56it/s] 
100%|██████████| 38910/38910 [00:04<00:00, 9270.53it/s] 
100%|██████████| 131057/131057 [01:03<00:00, 2078.12it/s] 


In [109]:
# Make sure the output directory exists, then print the statistics of the
# Article/Tweet test pairs and write them to CSV.
os.makedirs('pairs', exist_ok=True)
print_pair_stats(test_pairs_Article_Tweet, save='pairs/test_Article_Tweet.csv')

Total pairs: 5000; same-pair percent: 0.5
Num authors: 500
Avg. chars per first genre: 2286.4376
Avg. chars per second genre: 575.0016
Saving to pairs/test_Article_Tweet.csv.


In [110]:
# Print the statistics of the Article/Article test pairs and write them to CSV.
print_pair_stats(test_pairs_Article_Article, save='pairs/test_Article_Article.csv')

Total pairs: 5000; same-pair percent: 0.5
Num authors: 500
Avg. chars per first genre: 2282.156
Avg. chars per second genre: 2277.48
Saving to pairs/test_Article_Article.csv.


In [111]:
# Print the statistics of the Tweet/Tweet test pairs and write them to CSV.
print_pair_stats(test_pairs_Tweet_Tweet, save='pairs/test_Tweet_Tweet.csv')

Total pairs: 5000; same-pair percent: 0.5
Num authors: 500
Avg. chars per first genre: 578.1928
Avg. chars per second genre: 576.2844
Saving to pairs/test_Tweet_Tweet.csv.


In [112]:
# Print the statistics of the Article/Tweet train pairs and write them to CSV.
print_pair_stats(train_pairs_Article_Tweet, save='pairs/train_Article_Tweet.csv')

Total pairs: 103822; same-pair percent: 0.5
Num authors: 2234
Avg. chars per first genre: 4019.8569667315214
Avg. chars per second genre: 571.5849627246633
Saving to pairs/train_Article_Tweet.csv.


In [113]:
# Print the statistics of the Article/Article train pairs and write them to CSV.
print_pair_stats(train_pairs_Article_Article, save='pairs/train_Article_Article.csv')

Total pairs: 77820; same-pair percent: 0.5
Num authors: 1844
Avg. chars per first genre: 4008.3671549730147
Avg. chars per second genre: 4005.877101002313
Saving to pairs/train_Article_Article.csv.


In [114]:
# Print the statistics of the Tweet/Tweet train pairs and write them to CSV.
print_pair_stats(train_pairs_Tweet_Tweet, save='pairs/train_Tweet_Tweet.csv')

Total pairs: 262114; same-pair percent: 0.5
Num authors: 2220
Avg. chars per first genre: 574.3434078301808
Avg. chars per second genre: 574.414834766552
Saving to pairs/train_Tweet_Tweet.csv.
