In [1]:
# Load IPython's autoreload extension so the %autoreload magic is available.
%load_ext autoreload

In [2]:
# Mode 2: re-import modules before each cell execution, so edits to imported
# project modules take effect without restarting the kernel.
%autoreload 2

In [3]:
import os
import csv
import json
import logging
from common_utils import readcol, gentweets, write_csv, filter_dataset
from urls import domain

In [4]:
# Configure the root logger to emit records at DEBUG level and above, so the
# INFO "Skipping" messages logged by compute_pollution show up in cell output.
logging.basicConfig(level=logging.DEBUG)

In [5]:
def compute_pollution(data, misinfo_sites, min_num_tweets):
    """Compute, per user, the fraction of shared domains that are misinformation sites.

    Every entry of every tweet's ``'domains'`` list is counted once. Each entry
    is passed through ``domain()`` and, if the result is in ``misinfo_sites``,
    it is also counted as a misinformation hit. A user's score is
    ``misinfo_hits / total_domain_entries``.

    Note: despite its name, ``min_num_tweets`` is compared against the number
    of *domain entries* counted for the user, not the number of tweets. A tweet
    with two domains counts twice; a tweet with an empty ``'domains'`` list
    does not count at all.

    Args:
        data: mapping from user id to an iterable of tweets; each tweet is a
            mapping with a ``'domains'`` iterable.
        misinfo_sites: container of domains (in the form returned by
            ``domain()``) that are treated as misinformation sources.
        min_num_tweets: minimum number of counted domain entries a user needs
            in order to receive a score.

    Returns:
        dict mapping user id to a float score in [0, 1]. Users below the
        threshold, or with no domain entries at all, are logged at INFO level
        and omitted.
    """
    scores = {}
    for uid in data:
        total = 0
        misinfo = 0
        for tweet in data[uid]:
            for raw_d in tweet['domains']:
                total += 1
                if domain(raw_d) in misinfo_sites:
                    misinfo += 1

        if total < min_num_tweets:
            logging.info('User {} does not have the required minimum number of tweets ({}/{}). Skipping.'.format(
                uid, total, min_num_tweets
            ))
            continue

        if total == 0:
            # Only reachable when min_num_tweets <= 0. Without this guard the
            # division below would raise ZeroDivisionError for users that
            # shared no domains.
            logging.info('User {} has no domains to score. Skipping.'.format(uid))
            continue

        scores[uid] = misinfo / total
    return scores

In [7]:
# --- Parameters ---------------------------------------------------------
min_num_tweets = 8    # threshold handed to compute_pollution
keep_retweets = True  # also selects the output sub-folder below
subf = 'with-retweets' if keep_retweets else 'without-retweets'

# Resolve the data root once. os.environ[...] fails immediately with a
# KeyError naming the variable if D is unset, instead of the opaque TypeError
# that os.path.join(None, ...) would raise.
data_dir = os.environ['D']
dest = os.path.join(data_dir, 'measures', subf, 'pollution-filtered.tab')

# --- Load inputs --------------------------------------------------------
with open(os.path.join(data_dir, 'stripped-dataset-no-bots-filtered.json'), 'r') as f:
    js_data = json.load(f)
data = filter_dataset(js_data, keep_standalone=True, keep_retweet=keep_retweets, keep_quote=True)
# skip_rows=1 presumably skips a header row in pollution.tab -- TODO confirm against readcol.
misinfo_sites = frozenset(readcol(os.path.join(data_dir, 'sources', 'pollution.tab'), skip_rows=1))

# --- Compute and persist ------------------------------------------------
scores = compute_pollution(data, misinfo_sites, min_num_tweets)

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(os.path.dirname(dest), exist_ok=True)
write_csv(dest, list(scores.items()), ['Twitter ID', 'Pollution'])

Skipped counts: {'retweet': 0, 'standalone': 0, 'quote': 0}


INFO:root:User 628801184 does not have the required minimum number of tweets (6/8). Skipping.
INFO:root:User 34320674 does not have the required minimum number of tweets (4/8). Skipping.
INFO:root:User 18851248 does not have the required minimum number of tweets (1/8). Skipping.
