# Daily Tweet Processing
Iterate over the tweets of a given day and compute some metrics from them.

In [1]:
# required imports to access api_db, misc, misc.CONFIG, ...
import sys
sys.path = ['.', '..', '../..'] + sys.path
from collection import *

using 105 seed accounts (20 from news sources) and 0 hashtags
Database stats: 
{'avgObjSize': 102.54450983689254,
 'collections': 3,
 'dataSize': 6098322.0,
 'db': 'electionswatch',
 'fsTotalSize': 510770802688.0,
 'fsUsedSize': 444997160960.0,
 'indexSize': 1077248.0,
 'indexes': 3,
 'numExtents': 0,
 'objects': 59470,
 'ok': 1.0,
 'scaleFactor': 1.0,
 'storageSize': 3387392.0,
 'views': 0}
DB size (B): 6098322.0
DB size (MB): 5.82
DB size (GB): 0.01
Using API keys for app albertina_01

Done initializing at 10:04PM on September 26, 2020.
----------------------------------------


<hr>
<h1 align="center">driver code</h1>

1. Define tasks to execute daily
2. Check if these have been executed for each day of the past month
3. Execute the ones that have not been executed yet (this ensures that newly added tasks are also applied retroactively to past days)
4. 

---
Define the tasks below. These should be replicated in the API (or the API could be made generally able to access any `task_%s` collection from any endpoint, assuming injection is not a concern at the moment).

In [2]:
# task 1 - count tweets by type
from collections import defaultdict
def count_tweets_by_type(day):
    """Count the tweets created on ``day`` per type and store the counts.

    The counts are computed with a single aggregation on
    ``api_db.col_tweets`` and inserted through the "count by type" task.
    Nothing is computed when ``task.exists_day(day)`` is true.

    The facets are evaluated independently of each other, so a tweet that
    satisfies several selectors is counted once in each of them.

    Args:
        day: the day to process; passed to ``get_filter_by_day`` to build
            the ``created_at`` filter and to ``task.insert`` as the key.
    """
    task = Task(api_db.db, "count by type")
    if task.exists_day(day):
        return  # already processed

    kinds = ["retweet", "quote", "reply", "original", "total"]
    # "total" has no selector: it counts every tweet matched for the day.
    selectors = {
        "retweet": {"retweeted_status": {"$exists": True}},
        "quote": {"quoted_status": {"$exists": True}},
        "reply": {"in_reply_to_status_id": {"$exists": True}},
        "original": {"original": True},
    }

    facets = {}
    projection = {}
    for kind in kinds:
        stages = []
        if kind in selectors:
            stages.append({"$match": selectors[kind]})
        stages.append({"$count": kind})
        facets[kind] = stages
        # each facet yields a list with at most one {kind: n} document;
        # unwrap it so the result is a flat {kind: n} mapping
        projection[kind] = {"$arrayElemAt": ["$%s.%s" % (kind, kind), 0]}

    pipeline = [
        {"$match": {"created_at": get_filter_by_day(day)}},
        {"$facet": facets},
        {"$project": projection},
    ]
    counts = defaultdict(int, next(api_db.col_tweets.aggregate(pipeline)))

    # kinds missing from the aggregation result are stored as explicit zeros
    for kind in kinds:
        counts.setdefault(kind, 0)
    task.insert(day, counts)

In [42]:
# task 2 - measure fake news
from collections import defaultdict, Counter
def measure_fakenews(day, sites_path="fakenews.txt"):
    """Count, per listed site, the URLs shared on ``day`` that point to it.

    The site list is read from ``sites_path`` (one entry per line; blank
    lines are ignored). Every URL of every tweet created on ``day`` is
    attributed to at most one site:

    * if the URL host (without a leading ``www.``) is ``facebook.com``, the
      URL counts for the longest listed entry that occurs anywhere in the
      URL, if any;
    * otherwise the URL counts for the entry equal to its host, if any.

    The result, ``{"total": <sum>, "sites": {<name>: <count>}}``, is
    inserted through the "measure fakenews" task. Nothing is computed when
    ``task.exists_day(day)`` is true.

    Args:
        day: the day to process; passed to ``get_filter_by_day`` to build
            the ``created_at`` filter and to ``task.insert`` as the key.
        sites_path: path of the site list; defaults to ``fakenews.txt`` in
            the current working directory.

    Raises:
        OSError: if ``sites_path`` cannot be opened.
    """
    from urllib.parse import urlparse

    task = Task(api_db.db, "measure fakenews")
    if task.exists_day(day):
        return  # already processed

    def netloc(url):
        # host of the url, dropping "www." only when it is a prefix
        # (a plain str.replace would also mangle hosts such as "awww.x.com")
        host = urlparse(url.strip()).netloc
        return host[len("www."):] if host.startswith("www.") else host

    def normalize_name(name):
        # presumably needed because the names end up as document keys and
        # dots are not usable there -- TODO confirm against Task.insert
        return name.replace(".", "-")

    # get fakenews sites; skip blank lines, otherwise "" would be a member and
    # would match every facebook.com url and every url without a host
    with open(sites_path) as inf:
        fakenews_sites = {line.strip() for line in inf if line.strip()}
    # fixed order for the facebook.com substring search: most specific
    # (longest) entry first, ties broken alphabetically
    by_specificity = sorted(fakenews_sites, key=lambda site: (-len(site), site))

    # search query
    tweets = api_db.col_tweets.find({
        "urls": {"$exists": True},
        "created_at": get_filter_by_day(day)
        }, {"urls": True, "user": True})

    # collect
    metrics = {
        "total": 0,
        "sites": {normalize_name(site): 0 for site in sorted(fakenews_sites)},
    }
    for t in tweets:
        # "$exists" also matches an explicit null, so tolerate an empty field
        for url in t["urls"] or ():
            expanded = url.get("expanded_url")
            if not expanded:
                continue  # nothing to attribute
            loc = netloc(expanded)
            if loc == "facebook.com":
                hit = next((site for site in by_specificity if site in expanded), None)
                if hit is not None:
                    metrics["sites"][normalize_name(hit)] += 1
            elif loc in fakenews_sites:
                metrics["sites"][normalize_name(loc)] += 1
    metrics["total"] = sum(metrics["sites"].values())
    task.insert(day, metrics)

# Main function that calls declared tasks
Each task must be manually registered by adding a call to it in `main_caller` below.

In [43]:
from datetime import datetime, timedelta

In [44]:
def main_caller(day):
    """Run every registered daily task for ``day``, in registration order.

    Each task runs inside its own ``DoneMessage`` context, labelled with the
    task's name. A new task has to be added to ``registered`` by hand.

    Args:
        day: the day to process; handed unchanged to every task.
    """
    registered = (
        ("   count_tweets_by_type", count_tweets_by_type),
        ("   measure_fakenews", measure_fakenews),
    )
    print("")
    for label, run_task in registered:
        with DoneMessage(label):
            run_task(day)

In [45]:
# Walk forward one day at a time from the configured oldest tweet and run
# all tasks for each day, up to and including yesterday.
# NOTE(review): `day` is advanced before the first call, so the day stored in
# CONFIG["collection"]["oldest_tweet"] itself is never processed -- confirm
# that this is intended.
day = misc.CONFIG["collection"]["oldest_tweet"]
while True:
    # stop once the next day would be today: today is only processed after
    # the whole day has gone by
    if day.date() + timedelta(days=1) >= datetime.now(day.tzinfo).date():
        break
    day = day + timedelta(days=1)
    with DoneMessage("Processing day %s" % day):
        main_caller(day)

Processing day 2020-09-02 00:00:00+00:00...
   count_tweets_by_type...Done in 0.003s.
   measure_fakenews...Done in 0.030s.
Done in 0.038s.
Processing day 2020-09-03 00:00:00+00:00...
   count_tweets_by_type...Done in 0.002s.
   measure_fakenews...Done in 0.018s.
Done in 0.024s.
Processing day 2020-09-04 00:00:00+00:00...
   count_tweets_by_type...Done in 0.002s.
   measure_fakenews...Done in 0.022s.
Done in 0.027s.
Processing day 2020-09-05 00:00:00+00:00...
   count_tweets_by_type...Done in 0.003s.
   measure_fakenews...Done in 0.017s.
Done in 0.023s.
Processing day 2020-09-06 00:00:00+00:00...
   count_tweets_by_type...Done in 0.002s.
   measure_fakenews...Done in 0.021s.
Done in 0.025s.
Processing day 2020-09-07 00:00:00+00:00...
   count_tweets_by_type...Done in 0.002s.
   measure_fakenews...Done in 0.015s.
Done in 0.021s.
Processing day 2020-09-08 00:00:00+00:00...
   count_tweets_by_type...Done in 0.002s.
   measure_fakenews...Done in 0.016s.
Done in 0.021s.
Processing day 2020-

In [None]:
# print(get_account_details(20509689))

In [None]:
# from datetime import timezone
# find_params = find_exclude_invalid({
#     "depth": 0
# })
# oldest_t = misc.CONFIG["collection"]["oldest_tweet"]
# # users = api_db.col_users.find(find_params, no_cursor_timeout=True)
# users = [{"_id": 8665852}]
# for u in users:
#     print("getting tweets for: %s..." % u["_id"], end="", flush=True)
#     tweets = get_tweets(u, api_db.api.GetUserTimeline, "since_id",  datetime(2020, 7, 18, tzinfo=timezone.utc), {"trim_user":True})
#     # insert_tweets(tweets)
#     # update_most_common_language(u, tweets) # removed because not useful here
#     print("got %d new tweets, done." % len(tweets))
#     if len(tweets) > 0: break

In [None]:
# # [print(str(t) + "\n") for t in tweets[-10:-1]]
# for t in tweets[0:10000]:
#     if "in_reply_to_user_id" in t:
#         print(t)

In [None]:
# misc.CONFIG["collection"]["oldest_tweet"]

In [None]:

# for tweet in api_db.col_tweets.find({"created_at": {"$gte": _from, "$lt": _to}}, no_cursor_timeout=True):
#     print(tweet)
#     break

In [None]:
# Emit a final marker line.
print("DONE")