In [1]:
import os
import csv
import json
from operator import itemgetter
from IPython.display import display, Markdown, Latex

In [2]:
DATASET_FILE = os.path.join(os.getenv('D'), 'stripped-dataset-no-bots.json')
NEWS_SOURCES_FILE = os.path.join(os.getenv('D'), 'sources', 'news.tab')
POLLUTION_FILE = os.path.join(os.getenv('D'), 'sources', 'pollution.tab')

In [3]:
def load_dataset(filepath):
    with open(filepath, 'r') as f:
        return {int(uid): shares for uid, shares in json.loads(f.read()).items()}
    
def load_news_sources(filepath):
    sources = {}
    with open(filepath, 'r') as f:
        r = csv.reader(f, delimiter='\t')
        next(r)
        for row in r:
            assert row[0] not in sources
            sources[row[0]] = float(row[1])
    return sources

def load_pollution_sources(filepath):
    sources = {}
    with open(filepath, 'r') as f:
        r = csv.reader(f, delimiter='\t')
        next(r)
        for row in r:
            assert row[0] not in sources
            cats = [cat.lower() for cat in row[1:] if cat is not None and cat != '']
            if len(cats) == 0:
                print('Problem: {}'.format(row))
            sources[row[0]] = cats
    return sources

def compute_pollution_cat_counts(pollution, primary_only=False):
    counts = {}
    for url, all_cats in pollution.items():
        if primary_only:
            cats = [all_cats[0]]
        else:
            cats = all_cats
            
        for cat in cats:
            if cat not in counts:
                counts[cat] = 0
            counts[cat] += 1
    return counts

def compute_partisanship(data):
    for uid in data:
        pass

In [4]:
data = load_dataset(DATASET_FILE)
display(Markdown('## Number of users: {}'.format(len(data))))

domains = {}
for uid, tweets in data.items():
    for tweet in tweets:
        for d in tweet['domains']:
            if d not in domains:
                domains[d] = 0
            domains[d] += 1
display(Markdown('## Number of unique domains: {}'.format(len(domains))))
display(Markdown('## Number of total shares: {}'.format(sum(domains.values()))))

## Number of users: 15057

## Number of unique domains: 47442

## Number of total shares: 1473250

In [5]:
news_sources = load_news_sources(NEWS_SOURCES_FILE)
display(Markdown('## Number of news sources: {}'.format(len(news_sources))))

news_domains = {k: v for k, v in domains.items() if k in news_sources}
display(Markdown('## Number of news sources in data: {}'.format(len(news_domains))))
display(Markdown('## Number of news shares in data: {} ({:.2f} of all)'.format(
    sum(news_domains.values()), 
    sum(news_domains.values()) / sum(domains.values())
)))

## Number of news sources: 488

## Number of news sources in data: 418

## Number of news shares in data: 469966 (0.32 of all)

In [6]:
pollution_sources = load_pollution_sources(POLLUTION_FILE)
display(Markdown('## Number of pollution sources: {}'.format(len(pollution_sources))))

pollution_domains = {k: v for k, v in domains.items() if k in pollution_sources}
display(Markdown('## Number of pollution sources in data: {}'.format(len(pollution_domains))))
display(Markdown('## Number of pollution shares in data: {} ({:.2f} of all)'.format(
    sum(pollution_domains.values()),
    sum(pollution_domains.values()) / sum(domains.values())
)))


## Number of pollution sources: 824

## Number of pollution sources in data: 437

## Number of pollution shares in data: 252948 (0.17 of all)

In [7]:
all_sources = set()
all_sources.update(set(news_sources.keys()))
all_sources.update(pollution_sources.keys())
overlap = set(news_sources.keys()) & set(pollution_sources.keys())
display(Markdown('## Combined sources: {}, Only news: {}, only pollution: {}, overlap: {}'.format(
    len(all_sources),
    len(set(news_sources.keys()) - overlap),
    len(set(pollution_sources.keys()) - overlap),
    len(overlap)
)))

all_domains = {}
all_domains.update(news_domains)
all_domains.update(pollution_domains)
display(Markdown('## News and pollution domains in data: {}'.format((len(all_domains)))))

## Combined sources: 1202, Only news: 378, only pollution: 714, overlap: 110

## News and pollution domains in data: 757

In [8]:
pollution_cat_counts = compute_pollution_cat_counts(pollution_sources)
display(Markdown('<br />'.join([row[0] for row in sorted(pollution_cat_counts.items(), key=itemgetter(1), reverse=True)])))

fake<br />bias<br />conspiracy<br />satire<br />unreliable<br />clickbait<br />political<br />junksci<br />hate<br /> <br />rumor<br />reliable<br />fake news<br />state<br />2.62e-05<br />blog<br />satirical<br />8.55e-05<br />rumor <br /> unreliable<br />5.16e-05<br />3.64e-05<br />3.11e-05<br />unrealiable<br />fake 