In [4]:
import os
import csv
import json
import pprint

In [5]:
# Labels that problem_sources() counts as "problem" labels when it classifies
# each source by its label list.
PROBLEM_LABELS = ['satire', 'political', 'reliable']
# Shared pretty-printer (4-space indent) used to display the count dictionaries.
PP = pprint.PrettyPrinter(indent=4)

In [6]:
def read_misinfo_sources(filepath):
    """Read a tab-separated source list into a ``{url: [labels]}`` mapping.

    The first row is treated as a header and skipped. In every other row,
    column 0 is the source URL/domain and the remaining columns are labels.
    Values are whitespace-stripped and blank label cells are dropped.

    Blank lines and rows whose first column is empty are skipped, and an
    empty file yields an empty dict instead of raising.

    Args:
        filepath: Path to the tab-separated file.

    Returns:
        dict mapping each stripped URL to its list of non-empty labels
        (original casing preserved). Later rows overwrite earlier rows
        that share the same URL.
    """
    sources = {}
    # newline='' lets the csv module do its own newline handling.
    with open(filepath, 'r', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        # Default of None keeps an empty file from raising StopIteration.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for a blank line; row[0] would fail.
            if not row:
                continue
            url = row[0].strip()
            if not url:
                continue
            sources[url] = [label.strip() for label in row[1:] if label.strip()]
    return sources

In [7]:
def load_news_sources(filepath):
    """Read the first column of a tab-separated file into a set of sources.

    The first row is treated as a header and skipped. Values are
    whitespace-stripped (matching read_misinfo_sources) so that lookups
    are not defeated by stray padding. Blank lines and empty first
    columns are skipped, and an empty file yields an empty set.

    Args:
        filepath: Path to the tab-separated file.

    Returns:
        set of stripped, non-empty values from column 0.
    """
    sources = set()
    # newline='' lets the csv module do its own newline handling.
    with open(filepath, 'r', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        # Default of None keeps an empty file from raising StopIteration.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for a blank line; row[0] would fail.
            if not row:
                continue
            url = row[0].strip()
            if url:
                sources.add(url)
    return sources

In [8]:
def problem_sources(sources, problem_labels=None):
    """Split sources by how many of their labels are "problem" labels.

    A source goes into ``problem`` when it has at least one label and every
    label is a problem label. It goes into ``maybe_problem`` when only some
    of its labels are. Sources with no problem label, including sources
    with no labels at all, appear in neither dict.

    Labels are compared case-insensitively, consistent with count(), which
    lowercases labels before tallying them.

    Args:
        sources: Mapping of url -> list of label strings.
        problem_labels: Optional iterable of labels to treat as problem
            labels. Defaults to the module-level PROBLEM_LABELS.

    Returns:
        Tuple ``(problem, maybe_problem)`` of dicts mapping url to its
        original label list.
    """
    if problem_labels is None:
        problem_labels = PROBLEM_LABELS
    flagged = {label.lower() for label in problem_labels}

    problem = {}
    maybe_problem = {}
    for url, labels in sources.items():
        num_problem = sum(1 for label in labels if label.lower() in flagged)
        # No problem label at all; this also covers an empty label list,
        # which must not be classified as "all labels are problem labels".
        if num_problem == 0:
            continue
        if num_problem == len(labels):
            problem[url] = labels
        else:
            maybe_problem[url] = labels
    return problem, maybe_problem

In [9]:
def load_shares(filepath):
    """Return the decoded contents of the JSON file at ``filepath``."""
    with open(filepath, 'r') as handle:
        return json.load(handle)

In [10]:
def count(tweets, misinfo, news):
    """Tally shared domains by category across all users' tweets.

    Every entry of a tweet's ``'domains'`` list counts as one share. A
    share is attributed to ``'news'`` if the domain is in ``news``;
    otherwise to each lowercased label of the domain in ``misinfo``;
    otherwise to ``'other'``. ``news`` takes precedence over ``misinfo``.

    Args:
        tweets: Mapping of user id -> list of tweet dicts, each with a
            ``'domains'`` iterable.
        misinfo: Mapping of domain -> list of label strings.
        news: Container of news domains (membership-tested).

    Returns:
        Tuple ``(num_domains, shares_count, ucounts, tweet_counts)``:
        the number of distinct domains, the total number of shares, a dict
        of label -> number of distinct users with a share under that label,
        and a dict of label -> number of shares under that label.
    """
    domains = set()
    shares_count = 0
    tweet_counts = {}
    users_by_label = {}

    def tally(label, uid):
        # Record one share under `label` made by user `uid`.
        tweet_counts[label] = tweet_counts.get(label, 0) + 1
        users_by_label.setdefault(label, set()).add(uid)

    for uid, utweets in tweets.items():
        for t in utweets:
            for d in t['domains']:
                domains.add(d)
                shares_count += 1
                if d in news:
                    tally('news', uid)
                elif d in misinfo:
                    for label_raw in misinfo[d]:
                        tally(label_raw.lower(), uid)
                else:
                    tally('other', uid)

    # Built once after the loop so that empty `tweets` still returns a
    # result and the per-label sizes are not recomputed for every user.
    ucounts = {label: len(uids) for label, uids in users_by_label.items()}
    return len(domains), shares_count, ucounts, tweet_counts

In [11]:
def filter_tweets(tweets, problem_domains):
    """Return a copy of ``tweets`` without tweets that share a problem domain.

    A tweet is dropped as soon as any entry of its ``'domains'`` list is
    found in ``problem_domains``. Users left with no tweets are omitted
    from the result. The kept tweet objects are the original ones, not
    copies.
    """
    kept_by_user = {}
    for uid, utweets in tweets.items():
        kept = [
            tweet for tweet in utweets
            if not any(domain in problem_domains for domain in tweet['domains'])
        ]
        if kept:
            kept_by_user[uid] = kept
    return kept_by_user

In [12]:
# Load the labelled source list from <D>/sources/pollution.tab.
# NOTE(review): os.getenv('D') returns None when the D environment variable is
# unset, which makes os.path.join raise TypeError — D must be set beforehand.
sources = read_misinfo_sources(os.path.join(os.getenv('D'), 'sources', 'pollution.tab'))

In [13]:
# problem_sources() returns two dicts (url -> labels): the sources it classifies
# as "problem" and the ones it classifies as "maybe problem".
problem, maybe_problem = problem_sources(sources)

In [14]:
# Report the size of the full source list and of each of the two buckets.
print('sources: {}, problem: {}, maybe problem: {}'.format(len(sources), len(problem), len(maybe_problem)))

sources: 824, problem: 138, maybe problem: 59


In [15]:
# Load the tweet data; the cells below use it as a mapping of
# user id -> list of tweet dicts.
# NOTE(review): requires the D environment variable (os.getenv returns None if unset).
tweets = load_shares(os.path.join(os.getenv('D'), 'stripped-dataset-no-bots.json'))

In [16]:
# One top-level key per user, so the mapping's length is the user count.
print('Num users: {}'.format(len(tweets)))

Num users: 15057


In [17]:
# Total tweet count: add up the length of every per-user tweet list.
num_tweets = sum(len(user_tweets) for user_tweets in tweets.values())
print('Num tweets: {}'.format(num_tweets))

Num tweets: 1413426


In [18]:
# Load the set of news domains from <D>/sources/news.tab.
# NOTE(review): requires the D environment variable (os.getenv returns None if unset).
news_sources = load_news_sources(os.path.join(os.getenv('D'), 'sources', 'news.tab'))

In [19]:
# Tally the unfiltered data: distinct domains, total shares, per-label user
# counts and per-label share counts.
domains_count, shares_count, ucount, tcounts = count(tweets, sources, news_sources)

In [20]:
# Show the unfiltered totals, then the per-label share counts.
print('Domains: {}, Shares: {}'.format(domains_count, shares_count))
PP.pprint(tcounts)

Domains: 47442, Shares: 1473250
{   'bias': 55064,
    'clickbait': 11590,
    'conspiracy': 24964,
    'fake': 8027,
    'hate': 20101,
    'junksci': 427,
    'news': 469966,
    'other': 924744,
    'political': 15253,
    'reliable': 2202,
    'rumor': 3885,
    'satire': 904,
    'state': 1423,
    'unreliable': 12547}


In [21]:
# Drop every tweet that links to at least one domain in `problem`.
filtered_tweets = filter_tweets(tweets, problem.keys())

In [22]:
# filter_tweets() omits users left with no tweets, so this can be lower than
# the unfiltered user count.
print('Num users: {}'.format(len(filtered_tweets)))

Num users: 15057


In [23]:
# How many tweets the filter removed, in absolute terms and relative to the
# unfiltered total.
num_filtered_tweets = sum(len(vals) for vals in filtered_tweets.values())
num_removed_tweets = num_tweets - num_filtered_tweets
# Guard the ratio so an empty dataset does not raise ZeroDivisionError.
removed_ratio = num_removed_tweets / num_tweets if num_tweets else 0.0
# '{:.2%}' renders the ratio as an actual percentage (e.g. 1.05%); the
# previous output printed the raw fraction under the label "percentage".
print('Num filtered tweets: {}, difference: {}, difference percentage: {:.2%}'.format(num_filtered_tweets, num_removed_tweets, removed_ratio))

Num filtered tweets: 1398552, difference: 14874, difference percentage: 0.010523366628320124


In [24]:
# NOTE(review): `retweets` is not read by any other cell visible in this
# notebook — possibly a leftover; confirm before removing.
retweets = 0

In [25]:
# Same tallies as for the unfiltered data, recomputed on the filtered tweets.
filtered_domains_count, filtered_shares_count, filtered_ucount, filtered_tcounts = count(filtered_tweets, sources, news_sources)

In [26]:
# Show the filtered totals, then the per-label share counts after filtering.
print('Domains: {}, Shares: {}'.format(filtered_domains_count, filtered_shares_count))
PP.pprint(filtered_tcounts)

Domains: 47364, Shares: 1457663
{   'bias': 55064,
    'clickbait': 11589,
    'conspiracy': 24964,
    'fake': 8027,
    'hate': 20101,
    'junksci': 427,
    'news': 463409,
    'other': 924063,
    'political': 7769,
    'rumor': 3885,
    'satire': 240,
    'state': 1423,
    'unreliable': 12546}


In [27]:
# Per-label number of distinct users with at least one share under that label
# in the filtered data.
PP.pprint(filtered_ucount)

{   'bias': 9833,
    'clickbait': 4489,
    'conspiracy': 4801,
    'fake': 3133,
    'hate': 4127,
    'junksci': 237,
    'news': 15054,
    'other': 15014,
    'political': 3561,
    'rumor': 1364,
    'satire': 156,
    'state': 698,
    'unreliable': 5244}


In [129]:
# Persist the filtered tweets as JSON next to the input data set.
# The output directory comes from the D environment variable.
filtered_path = os.path.join(os.getenv('D'), 'stripped-tweets-no-bots-filtered.json')
with open(filtered_path, 'w') as out_file:
    json.dump(filtered_tweets, out_file)

In [134]:
# Total shares across the remaining misinformation labels: every label in
# filtered_tcounts except the 'news'/'other' buckets and the PROBLEM_LABELS.
# Computed from the data instead of hand-copying numbers from a printed
# output, so the figure stays correct when the notebook is re-run.
excluded_labels = set(PROBLEM_LABELS) | {'news', 'other'}
sum(n for label, n in filtered_tcounts.items() if label not in excluded_labels)


138026