In [1]:
# Load IPython's autoreload extension; its reload mode is selected by the
# %autoreload magic in the following cell.
%load_ext autoreload

In [2]:
# Mode 2: reload all modules (except those excluded with %aimport) before
# executing each cell.
%autoreload 2

In [3]:
import sys
import os
import csv
import json
import logging
from common_utils import readcol, write_csv, filter_dataset
from urls import domain

In [18]:
# Directory that holds the input tables and receives the generated measures file.
# expanduser('~') still resolves the home directory when $HOME is unset, whereas
# os.getenv('HOME') would return None and make os.path.join raise TypeError.
DATA_DIR = os.path.join(os.path.expanduser('~'), 'data', 'hks')
# Threshold passed as `min_num_tweets` to the scoring functions below.
MIN_NUM_TWEETS = 8

In [17]:
def read_polscores(filepath):
    """Read a tab-separated score table into a dict.

    The first line is treated as a header and skipped. For every other row,
    column 0 (stripped and lower-cased) becomes the key and column 1, parsed
    as a float, becomes the value. When a key occurs more than once the first
    value is kept and the repeat is reported at DEBUG level.

    Blank lines and rows with fewer than two columns are skipped instead of
    raising IndexError, and an empty file yields an empty dict.

    Args:
        filepath: path of the tab-separated file to read.

    Returns:
        dict mapping the normalized key to its float score.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a score column cannot be parsed as a float.
    """
    polscores = {}
    # newline='' lets the csv module do its own line-ending handling.
    with open(filepath, 'r', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip header; the default avoids StopIteration on an empty file

        for row in reader:
            if len(row) < 2:
                # csv.reader yields [] for blank lines; only report rows that had content.
                if row:
                    logging.debug('Skipping malformed row: {}'.format(row))
                continue

            key = row[0].strip().lower()
            if key in polscores:
                logging.debug('Polscore already read for {}'.format(row[0]))
            else:
                polscores[key] = float(row[1])
    return polscores

In [22]:
def compute_misinfo(data, misinfo_sites, min_num_tweets):
    """Compute, per user, the fraction of shared domains that are misinformation sites.

    Every entry of every tweet's 'domains' list counts once toward the user's
    total; it additionally counts as misinformation when ``domain(entry)`` is
    a member of ``misinfo_sites``.

    Args:
        data: mapping uid -> iterable of tweets, each a mapping with a
            'domains' list.
        misinfo_sites: container supporting ``in``, holding domains in the
            form returned by ``domain()``.
        min_num_tweets: minimum total a user needs in order to be scored.
            Note that the quantity compared against it is the number of
            shared domain entries, not the number of tweets.

    Returns:
        dict uid -> misinfo_count / total_count for every user that meets the
        threshold. Users below it are skipped with an INFO log. Users with no
        domain entries at all are always skipped, because the ratio is
        undefined for them.
    """
    scores = {}
    for uid in data:
        total = 0
        flagged = 0
        for tweet in data[uid]:
            for raw_d in tweet['domains']:
                total += 1
                if domain(raw_d) in misinfo_sites:
                    flagged += 1

        if total < min_num_tweets:
            logging.info('User {} does not have the required minimum number of tweets ({}/{}). Skipping.'.format(
                uid, total, min_num_tweets
            ))
        elif total == 0:
            # Only reachable when min_num_tweets <= 0: skip rather than
            # raise ZeroDivisionError on 0 / 0.
            logging.info('User {} has no shared domains. Skipping.'.format(uid))
        else:
            scores[uid] = flagged / total
    return scores

In [26]:
def compute_partisanship(data, polscores, min_num_tweets, take_abs=False):
    """Score each user by the political scores of the domains they shared.

    For every user, each entry in a tweet's 'domains' list counts toward the
    user's overall total; entries whose ``domain(...)`` is a key of
    ``polscores`` are additionally tallied per domain. The user's score is the
    sum, over the tallied domains, of ``(tally / overall total) * polscore``.
    The denominator is the count of all shared domain entries, not only the
    scored ones.

    Args:
        data: mapping uid -> iterable of tweets, each a mapping with a
            'domains' list.
        polscores: mapping domain -> numeric score.
        min_num_tweets: minimum number of scored domain entries a user needs;
            users below it are omitted and reported on stdout.
        take_abs: when true, use the absolute value of each polscore.

    Returns:
        dict uid -> score for every user that meets the threshold.
    """
    all_shares = {}
    scored_shares = {}
    for uid in data:
        assert uid not in all_shares
        assert uid not in scored_shares

        n_shared = 0
        per_domain = {}
        for tweet in data[uid]:
            for raw_domain in tweet['domains']:
                n_shared += 1
                d = domain(raw_domain)
                if d in polscores:
                    per_domain[d] = per_domain.get(d, 0) + 1

        all_shares[uid] = n_shared
        scored_shares[uid] = per_domain

    scores = {}
    for uid, per_domain in scored_shares.items():
        n_scored = sum(per_domain.values())
        if n_scored < min_num_tweets:
            print('User {} does not have the required minimum number of tweets ({}/{}). Skipping.'.format(
                uid, n_scored, min_num_tweets
            ))
            continue

        denom = all_shares[uid]
        # Accumulate term by term, starting from 0, so the floating-point
        # result does not depend on how sum() adds floats.
        acc = 0
        for site, visits in per_domain.items():
            weight = abs(polscores[site]) if take_abs else polscores[site]
            acc += (visits / denom) * weight
        scores[uid] = acc
    return scores

In [16]:
# Load the three inputs from DATA_DIR: the misinfo site column, the
# per-domain political scores, and the per-user share records.
misinfo_sites = frozenset(
    readcol(os.path.join(DATA_DIR, 'misinfo.tab'), skip_rows=1)
)
polscores = read_polscores(os.path.join(DATA_DIR, 'top500.tab'))
with open(os.path.join(DATA_DIR, 'anonymized-shares.json'), 'r') as f:
    shares = json.load(f)

In [25]:
# Per-user misinformation ratio; users below MIN_NUM_TWEETS are left out.
misinfo = compute_misinfo(
    data=shares,
    misinfo_sites=misinfo_sites,
    min_num_tweets=MIN_NUM_TWEETS,
)
print(misinfo)

{'96195': 0.24858757062146894, '37978': 0.5515151515151515, '14644': 0.009259259259259259, '60654': 0.2733812949640288, '13142': 0.03636363636363636, '12657': 0.024390243902439025, '113862': 0.1069182389937107, '4522': 0.2988505747126437, '82543': 0.22033898305084745, '123956': 0.37142857142857144, '106401': 0.06451612903225806, '73624': 0.08, '83317': 0.23255813953488372, '55031': 0.16, '23496': 0.15625, '8393': 0.08, '136807': 0.38461538461538464, '11459': 0.4421052631578947, '129347': 0.029850746268656716, '72809': 0.29508196721311475, '122110': 0.3333333333333333, '99759': 0.21621621621621623, '127248': 0.09803921568627451, '37007': 0.2857142857142857, '17547': 0.09615384615384616, '18104': 0.3103448275862069, '31096': 0.019230769230769232, '61491': 0.13513513513513514, '12211': 0.022222222222222223, '32139': 0.18181818181818182, '81697': 0.06976744186046512, '10613': 0.6875, '118840': 0.06944444444444445, '45025': 0.10309278350515463, '111870': 0.19886363636363635, '136002': 0.038

In [28]:
# Per-user signed partisanship score (take_abs=False keeps the sign of each polscore).
partisanship = compute_partisanship(
    data=shares,
    polscores=polscores,
    min_num_tweets=MIN_NUM_TWEETS,
    take_abs=False,
)
print(partisanship)

User 29938 does not have the required minimum number of tweets (5/8). Skipping.
{'96195': 0.18867344632768362, '37978': 0.35509939393939394, '14644': -0.0973046296296296, '60654': 0.22714604316546766, '13142': -0.15374, '12657': -0.18983170731707322, '113862': -0.11749308176100628, '4522': 0.19145632183908043, '82543': -0.3716830508474578, '123956': 0.32623142857142856, '106401': -0.13101290322580647, '73624': -0.092322, '83317': 0.31545348837209297, '55031': -0.243424, '23496': -0.2218375, '8393': 0.55244, '136807': -0.4908096153846154, '11459': -0.2783968421052632, '129347': -0.1001358208955224, '72809': 0.19442622950819674, '122110': 0.3494518518518518, '99759': -0.06335585585585585, '127248': -0.1664745098039216, '37007': 0.19370476190476188, '17547': -0.13740000000000002, '18104': 0.2545637931034483, '31096': -0.075425, '61491': -0.18474054054054057, '12211': -0.18059333333333333, '32139': 0.0705753246753247, '81697': -0.06757209302325581, '10613': -0.10229999999999999, '118840': 

In [31]:
# Users that received both a partisanship score and a misinformation score.
common = partisanship.keys() & misinfo.keys()
# One size per line: partisanship scores, misinfo scores, their intersection.
print(len(partisanship), len(misinfo), len(common), sep='\n')

15056
15057
15056


In [32]:
# Write one row per user present in both score dicts. `common` is a set of
# string ids, and set iteration order for strings can change between kernel
# runs (string hashing is randomized), so sort the ids to keep the row order
# of measures.tab reproducible.
write_csv(
    os.path.join(DATA_DIR, 'measures.tab'),
    [(uid, partisanship[uid], misinfo[uid]) for uid in sorted(common)],
    ['ID', 'Partisanship', 'Misinformation'],
)