# Daily Tweet Processing
Iterate tweets for a given day and perform some calculations

In [None]:
# required imports to access api_db, misc, misc.CONFIG, ...
import sys
sys.path = ['.', '..', '../..'] + sys.path
from collection import *

<hr>
<h1 align="center">driver code</h1>

1. Define tasks to execute daily
2. Check if these have been executed for each day of the past month
3. Execute the ones that have not yet been executed (this ensures that newly added tasks are applied retroactively to past days)

---
Define tasks below - these should be replicated in the API (or maybe make it generally able to access any `task_%s` collection from any endpoint, assuming injection is not a concern at the moment)

In [None]:
# task 1 - count tweets by type
from collections import defaultdict
def count_tweets_by_type(day, recalculate=False):
    """Count the tweets created on `day` per tweet type and store the counts.

    The stored document always has the keys "retweet", "quote", "reply",
    "original" and "total"; a type with no matching tweet is stored as 0.

    Args:
        day: day to process; passed to get_filter_by_day() to build the
            "created_at" filter.
        recalculate: when False and the task already has an entry for `day`,
            nothing is computed.
    """
    task = Task(api_db.db, "count by type")
    if task.exists_day(day) and not recalculate:
        return  # already processed

    # label -> extra $match criteria for that facet (None: count every tweet of the day)
    type_filters = {
        "retweet": {"retweeted_status": {"$exists": True}},
        "quote": {"quoted_status": {"$exists": True}},
        "reply": {"in_reply_to_status_id": {"$exists": True}},
        "original": {"original": True},
        "total": None,
    }

    facets = {}
    projection = {}
    for label, criteria in type_filters.items():
        stages = [] if criteria is None else [{"$match": criteria}]
        stages.append({"$count": label})
        facets[label] = stages
        # each facet yields [{label: n}] (or [] when nothing matched); unwrap the number
        projection[label] = {"$arrayElemAt": ["$%s.%s" % (label, label), 0]}

    # $facet emits exactly one result document, so next() has something to return
    counts = next(api_db.col_tweets.aggregate([
        {"$match": {"created_at": get_filter_by_day(day)}},
        {"$facet": facets},
        {"$project": projection},
    ]))

    # a facet without matches leaves its field out of the result -> default to 0
    metrics = {label: counts.get(label, 0) for label in type_filters}
    task.insert(day, metrics)

In [None]:
# task 2 - measure fake news
def measure_fakenews(day, recalculate=False):
    """Count links to known fake-news sites in the tweets of `day` and store the result.

    The stored metrics hold, per site listed in ../resources/fakenews.txt, the
    number of tweeted URLs pointing to it ("sites"), their sum ("total") and the
    accumulated "favorite_count" / "retweet_count" of the tweets carrying them.
    Counting is per matching URL: a tweet with two matching URLs contributes
    twice, including to the engagement sums.

    Args:
        day: day to process; passed to get_filter_by_day().
        recalculate: when False and the task already has an entry for `day`,
            nothing is computed.
    """
    task = Task(api_db.db, "measure fakenews")
    if task.exists_day(day) and not recalculate:
        return  # already processed

    from urllib.parse import urlparse

    def netloc(url):
        """Return the host part of `url` with "www." removed."""
        return urlparse(url.strip()).netloc.replace("www.", "")

    def normalize_name(name):
        """Turn a site name into a metrics key (dots replaced by dashes)."""
        # presumably because dotted keys are awkward in the stored document - confirm
        return name.replace(".", "-")

    # get fakenews sites; blank lines are dropped, otherwise the empty string
    # would be "contained" in every facebook URL below and be counted as a site
    with open(abs_path("../resources/fakenews.txt")) as inf:
        fakenews_sites = {line.strip() for line in inf}
    fakenews_sites.discard("")

    def matched_site(expanded_url):
        """Return the fake-news site `expanded_url` refers to, or None."""
        loc = netloc(expanded_url)
        if loc == "facebook.com":
            # facebook links are matched by substring: first listed site found in the URL
            return next((site for site in fakenews_sites if site in expanded_url), None)
        return loc if loc in fakenews_sites else None

    # search query
    tweets = api_db.col_tweets.find(
        {"urls": {"$exists": True}, "created_at": get_filter_by_day(day)},
        {"urls": True, "user": True, "favorite_count": True, "retweet_count": True},
    )

    # collect
    metrics = {"total": 0, "sites": {}, "favorite_count": 0, "retweet_count": 0}
    for site in fakenews_sites:
        metrics["sites"][normalize_name(site)] = 0
    for t in tweets:
        for url in t["urls"]:
            site = matched_site(url["expanded_url"])
            if site is None:
                continue
            metrics["sites"][normalize_name(site)] += 1
            # engagement fields may be absent from a tweet document
            metrics["favorite_count"] += t.get("favorite_count", 0)
            metrics["retweet_count"] += t.get("retweet_count", 0)
    metrics["total"] = sum(metrics["sites"].values())
    task.insert(day, metrics)

In [None]:
# task 3 - measure suspensions
def measure_suspensions(day, recalculate=False):
    """Store the users flagged as suspended whose "time_suspended" falls on `day`.

    The stored metrics are {"total": <number of users>, "users": [<user docs>]},
    each user document being limited to a handful of profile fields.

    Args:
        day: day to process; passed to get_filter_by_day().
        recalculate: accepted for signature parity with the other tasks but
            not consulted - this task is recomputed on every call.
    """
    task = Task(api_db.db, "measure suspensions")
    # deliberately no exists_day() short-circuit: recalculation is forced each day

    profile_fields = (
        "screen_name",
        "friends_count",
        "followers_count",
        "statuses_count",
        "description",
        "favourites_count",
        "created_at",
    )
    projection = {field: True for field in profile_fields}
    query = {"suspended": True, "time_suspended": get_filter_by_day(day)}

    suspended_users = list(api_db.col_users.find(query, projection))
    task.insert(day, {"total": len(suspended_users), "users": suspended_users})

In [None]:
# Tracked presidential candidates as (label, twitter user id) pairs.
# An id of 0 marks a candidate without a usable account id (Marcelo); the
# follower/reply/refresh tasks skip every entry whose id is <= 0.
candidates = [
    ("MarceloRebeloDeSousa", 0),
    ("AnaMartinsGomes", 771383605),
    ("AndreCVentura", 1097962618596327424),
    ("BrunoARFialho", 1221188948996739072),
    ("joao_ferreira33", 951055588330475520),
    ("mmatias_", 948552829),
    ("LiberalMayan", 1286335166881964032),
    ("_tinoderans_", 4644839074),
]

In [None]:
# Shared hate-speech term matcher, built once when this cell runs;
# measure_hatespeech calls hsterms.find_all_words() on each reply's text.
# NOTE(review): the CSV path is passed as a bare relative path, whereas
# measure_fakenews wraps its resource path in abs_path() - confirm this is intended.
hsterms = HatespeechTerms('../resources/hatespeech_terms.csv')

In [None]:
from collections import defaultdict
def measure_hatespeech(day, recalculate=False):
    """Collect hate-speech term hits found in replies to each candidate on `day`.

    For every candidate with a positive id the stored metrics contain, under
    "candidates" -> str(id), the number of replies ("total_replies") and the
    concatenated results of hsterms.find_all_words() over the reply texts
    ("hits"). Any further aggregation is left to the frontend.

    Args:
        day: day to process; used as both bounds of get_filter_between_days().
        recalculate: when False and the task already has an entry for `day`,
            nothing is computed.
    """
    task = Task(api_db.db, "measure hatespeech")
    if not recalculate and task.exists_day(day):
        return  # already processed

    per_candidate = {}
    for _, candidate_id in candidates:
        if candidate_id <= 0:
            continue  # ignore marcelo

        replies = api_db.col_tweets.find(
            {"in_reply_to_user_id": candidate_id, "created_at": get_filter_between_days(day, day)},
            {"full_text": True},
        )

        hits = []
        reply_count = 0
        for reply_count, reply in enumerate(replies, start=1):
            hits.extend(hsterms.find_all_words(reply["full_text"]))

        per_candidate[str(candidate_id)] = {"total_replies": reply_count, "hits": hits}

    task.insert(day, {"candidates": per_candidate})

In [None]:
def refresh_candidates_tweets():
    """Re-download and re-insert the timeline of every candidate with a positive id.

    Tweets are fetched through get_tweets() with api_db.api.GetUserTimeline,
    using the configured "oldest_tweet" value as the "since_id" argument, so
    that stored tweets pick up their current like/retweet numbers.
    """
    oldest_t = misc.CONFIG["collection"]["oldest_tweet"]
    # entries with id <= 0 (marcelo) have no timeline to refresh
    with_account = [(name, uid) for name, uid in candidates if uid > 0]
    for username, user_id in with_account:
        with DoneMessage("Refreshing %s" % username):
            # refresh all tweets since oldest_t to get updated values for likes, ...
            timeline = get_tweets(
                {"_id": user_id},
                api_db.api.GetUserTimeline,
                "since_id",
                oldest_t,
                {"trim_user": True},
            )
            insert_tweets(timeline)
            print("refreshed %d tweets" % len(timeline), end="")
# Executed as soon as this cell runs: refreshes the stored timelines of all tracked candidates.
refresh_candidates_tweets()

In [None]:
# task 4 - measure presidential candidates
import re
def measure_candidates(day, recalculate=False):
    """Store per-candidate account and tweet metrics for `day`.

    For every entry of `candidates` the stored document holds, under
    "candidates" -> str(id): the account's name, screen name, follower count
    and (when available) profile picture, the candidate's tweets of the day
    with their type and engagement, a "tweet_impact" sum (retweets + likes of
    the non-retweet tweets), and the number of tweets of the day that mention
    the candidate by name ("name_mentions") or by account ("mentions").

    When the day already has a stored entry, the previously stored
    "followers_count" is kept, because the value fetched now does not describe
    that past day.

    Args:
        day: day to process; passed to get_filter_by_day().
        recalculate: when False and the task already has an entry for `day`,
            nothing is computed.
    """
    task = Task(api_db.db, "measure candidates")
    if task.exists_day(day) and not recalculate:
        return  # already processed

    day_filter = get_filter_by_day(day)  # same filter for every query below
    metrics = {"candidates": {}}  # daily metrics

    def get_tweet_type(tweet):
        """Classify a tweet document as original / retweet / quote / reply."""
        if "original" in tweet: return "original"
        if "retweeted_status" in tweet: return "retweet"
        if "quoted_status" in tweet: return "quote"
        if "in_reply_to_user_id" in tweet or "in_reply_to_status_id" in tweet: return "reply"
        return "original"

    tweet_fields = {
        "retweet_count": True, "favorite_count": True, "retweeted_status": True,
        "quoted_status": True, "in_reply_to_status_id": True, "in_reply_to_user_id": True,
        "original": True, "full_text": True, "created_at": True,
    }

    for user, _id in candidates:
        _id_str = str(_id)
        # account metrics for this day: prefer live data, fall back to the stored user
        # NOTE(review): entries with id <= 0 are not skipped here, unlike in the other
        # tasks - this relies on one of the two lookups returning a document for them
        account = get_account_details(user_id=_id)
        if account:
            account = user_to_db_format(account)
            upsert_user(account)
        else:
            account = api_db.col_users.find_one({"_id": _id})
        candidate_metrics = {
            "name": account["name"],
            "screen_name": account["screen_name"],
            "followers_count": account["followers_count"],
            "tweets": [],
        }
        metrics["candidates"][_id_str] = candidate_metrics
        if "profile_image_url_https" in account:  # adds profile picture
            candidate_metrics["pic"] = account["profile_image_url_https"]

        # tweet metrics for this day
        tweets = api_db.col_tweets.find({"user": _id, "created_at": day_filter}, tweet_fields)
        for t in tweets:
            candidate_metrics["tweets"].append({
                "_id": t["_id"],
                "retweet_count": t.get("retweet_count", 0),
                "favorite_count": t.get("favorite_count", 0),
                "type": get_tweet_type(t),
                "created_at": t["created_at"],
                "full_text": t["full_text"],
            })
        # there was a bug with this metric so, since all the info is in the UI, it was fixed
        # there because of historical data; this query is now correct but will not be used
        # for the current deployment, TODO: fix for later ones
        candidate_metrics["tweet_impact"] = sum(
            t["retweet_count"] + t["favorite_count"]
            for t in candidate_metrics["tweets"]
            if t["type"] != "retweet"
        )

        # count name mentions: the display name without digits, matched case-insensitively,
        # with each space allowed to be any single character or missing. The name parts are
        # escaped so characters such as "." or "(" in a display name are matched literally.
        candidate_name = "".join(ch for ch in account["name"] if not ch.isdigit())
        name_pattern = ".{0,1}".join(re.escape(part) for part in candidate_name.split(" "))
        regex_query = re.compile(name_pattern, re.IGNORECASE)
        candidate_metrics["name_mentions"] = api_db.col_tweets.count_documents(
            {"created_at": day_filter, "full_text": regex_query})
        # count mentions
        candidate_metrics["mentions"] = api_db.col_tweets.count_documents(
            {"created_at": day_filter, "user_mentions": _id})

    # fixing fields that should not be updated historically
    current = task.find_day(day)
    if current:
        stored = current["data"]["candidates"]
        for user, _id in candidates:
            if _id <= 0: continue  # ignore marcelo
            _id_str = str(_id)
            # a candidate added after the day was first processed has no stored value
            if "followers_count" in stored.get(_id_str, {}):
                metrics["candidates"][_id_str]["followers_count"] = stored[_id_str]["followers_count"]
    task.insert(day, metrics)

In [None]:
# task 5 - log creation dates weeks (users)
from datetime import datetime
# from collections import defaultdict
def measure_creation_dates(day):
    """Store how many user accounts were created in each (year, month).

    Groups every document of the users collection by the year and month of its
    "created_at" field and stores a list of {"year", "month", "count"} entries.
    The task is created with only_one=True.

    Args:
        day: value handed to task.insert() alongside the metrics.
    """
    task = Task(api_db.db, "measure creation dates", only_one=True)

    group_by_month = {
        "$group": {
            "_id": {
                "year": {"$year": "$created_at"},
                "month": {"$month": "$created_at"},
            },
            "count": {"$sum": 1},
        }
    }
    metrics = [
        {
            "year": bucket["_id"]["year"],
            "month": bucket["_id"]["month"],
            "count": bucket["count"],
        }
        for bucket in api_db.col_users.aggregate([group_by_month])
    ]
    task.insert(day, metrics)
# Runs once when this cell executes, stamped with the current time; it is not
# part of the per-day tasks dispatched by main_caller.
measure_creation_dates(datetime.now())

In [None]:
def jaccard_index(set1, set2):
    """Return the Jaccard index |set1 & set2| / |set1 | set2| of two sets.

    Returns 0 when both sets are empty (the index is undefined there).
    """
    shared = len(set1 & set2)
    # |A | B| = |A| + |B| - |A & B|: avoids materialising the union (twice)
    # for what can be very large follower sets
    union_size = len(set1) + len(set2) - shared
    if union_size == 0:
        return 0
    return shared / union_size

In [None]:
def how_many_of_1_follow_2(set1, set2):
    """Return the fraction of set1's members that are also in set2.

    Returns 0 when set1 is empty.
    """
    if not set1:
        return 0
    in_both = set1 & set2
    return len(in_both) / len(set1)

In [None]:
# task 6 - measure followers polarization
from datetime import datetime
from collections import defaultdict
def measure_polarization(day):
    """Compare the candidates' follower sets pairwise and store the result.

    Stores {"candidates", "polarization", "ratios"} where both matrices are
    indexed [row][col] in the order of `candidates`:
      * polarization: Jaccard index of the two follower sets, filled only
        above the diagonal (col > row); every other cell is 0.
      * ratios: how_many_of_1_follow_2(row followers, col followers), with 0
        on the diagonal.
    Values are rounded to 5 decimals. Candidates with id <= 0 are not fetched
    and take part with an empty follower set. The task is created with
    only_one=True.

    Args:
        day: value handed to task.insert() alongside the metrics.
    """
    task = Task(api_db.db, "measure followers polarization", only_one=True)

    # get all followers
    followers = defaultdict(set)
    for _, _id in candidates:
        if _id <= 0: continue  # ignore marcelo
        for follower_ids in paged_followers(_id):
            followers[_id].update(follower_ids)
    # debug #followers
    for k, v in followers.items():
        print("%s has %s followers" % (k, len(v)))

    def rounded(value):
        """Round to 5 decimals, always yielding a float."""
        return float("%.5f" % value)

    def pairwise_matrix(measure, is_blank):
        """Build the candidates x candidates matrix of `measure`; blank cells are 0."""
        return [
            [
                0 if is_blank(i, j) else rounded(measure(followers[_id], followers[_id2]))
                for j, (_, _id2) in enumerate(candidates)  # col
            ]
            for i, (_, _id) in enumerate(candidates)  # row
        ]

    # the jaccard index is symmetric -> only the upper triangle is filled
    polarization = pairwise_matrix(jaccard_index, lambda i, j: j <= i)
    ratios = pairwise_matrix(how_many_of_1_follow_2, lambda i, j: j == i)

    # debug polarization
    print(" "*30, " ".join([c[0] for c in candidates]))
    for (cname, _), row in zip(candidates, polarization):
        print("%30s" % cname, ["%.4f" % x for x in row])

    task.insert(day, {"candidates": candidates, "polarization": polarization, "ratios": ratios})

# Runs once when this cell executes, stamped with the current time; it is not
# part of the per-day tasks dispatched by main_caller.
measure_polarization(datetime.now())

# Main function that calls declared tasks
Each task must be manually registered

In [None]:
from datetime import datetime, timedelta

In [None]:
def main_caller(day):
    """Run every registered daily task for `day`.

    Days that lie at most 10 days in the past are recalculated even if they
    were already processed; older days are handed to the tasks with
    recalculate=False.

    Args:
        day: datetime of the day to process; its tzinfo is used to obtain a
            comparable "now".
    """
    print("")
    age_in_days = (datetime.now(day.tzinfo) - day).days
    recalculate = age_in_days <= 10
    print("Recalculating: %s" % recalculate)

    # each task must be registered here manually: (progress label, task function)
    registered_tasks = (
        ("   count_tweets_by_type", count_tweets_by_type),
        ("   measure_fakenews", measure_fakenews),
        ("   measure_suspensions", measure_suspensions),
        ("   measure_candidates", measure_candidates),
        ("   measure_hatespeech", measure_hatespeech),
    )
    for label, run_task in registered_tasks:
        with DoneMessage(label):
            run_task(day, recalculate)

In [None]:
# Driver: walk day by day from the configured "oldest_tweet" datetime up to
# yesterday and run all registered tasks for each day.
# `day` is incremented BEFORE main_caller is invoked, so the first day handed
# to the tasks is oldest_tweet + 1 day and the last one is yesterday.
# NOTE(review): the oldest_tweet day itself is therefore never processed - confirm this is intended.
day = misc.CONFIG["collection"]["oldest_tweet"]
# process every day from start to prev-yesterday (today only when whole day has gone by)
# "now" is re-read on every iteration, in the same timezone as `day`
while day.date() + timedelta(days=1) < datetime.now(day.tzinfo).date():
    day+=timedelta(days=1)
    with DoneMessage("Processing day %s" % day):
        main_caller(day)

In [None]:
# Final cell: completion marker for the notebook run.
print("DONE")