# Daily Tweet Processing
Iterate over all unprocessed tweets (processed tweets are marked with `processed=True`) and, for every user involved in each tweet, increment that user's `appearance` count in the database, namely:
* all **mentioned** users
* all **authors** (can come from different sources so a good amount of users won't be of depth=0 or depth=1) 
* all **authors** of retweeted tweets
* all **authors** of quoted tweets (retweet + comment)
* all **authors** of replied-to tweets

In [None]:
# required imports to access api_db, misc, misc.CONFIG, ...
import sys
sys.path = ['.', '..', '../..'] + sys.path
from collection import *

### Conditional Execution
Each file needs to verify if it should be executed or not based on the configurations (for some files this is not optional but all should have this section, even if it is tautological). Example:
```python
if not misc.CONFIG["collection"]["execute_this_script"]: exit()
```

In [None]:
# Conditional execution
# Deliberate no-op: no configuration flag is checked here, so this notebook
# always runs; the cell exists only to keep the common notebook layout.
pass

<hr>
<h1 align="center">driver code</h1>

In [None]:
# test getting a tweet that is a retweet of a quote and format it
# _id= 1210429083273891841
# rt = api_db.api.GetStatus(_id)
# print(rt)
# tweet_to_db_format(rt)

In [None]:
# ensure that finding MongoDB tweets by id works (I had some problems with Robo 3T)
# next(api_db.col_tweets.find({"_id": 1245814925953339397}))

In [None]:
def task(skip, limit):
    """Count user appearances for one window of unprocessed tweets.

    Queries tweets whose ``processed`` flag is ``False``, paged with
    ``skip``/``limit``. For each tweet it gathers the user ids it references
    (author, mentions, retweeted/quoted authors, replied-to user), hands the
    de-duplicated ids to ``upsert_user_ids_appearances`` and then flags the
    tweet as processed through ``upsert_tweet_info``.

    Args:
        skip: Number of matching tweets to skip before this window starts.
        limit: Maximum number of tweets handled by this call.
    """
    processed_key = "processed"

    find_params = {processed_key: False}
    retrieve_params = {"user_mentions": True, "user": True, "retweeted_status.user" :True, "quoted_status.user": True, "in_reply_to_user_id": True}
    # NOTE(review): the filter matches on the same flag this loop flips, so
    # skip/limit windows may shift if batches start at different times --
    # confirm the scheduler accounts for that.
    # PyMongo only accepts integer skip/limit values, so coerce defensively in
    # case the scheduler derives them from a float batch size.
    tweets = api_db.col_tweets.find(find_params, retrieve_params, no_cursor_timeout=True).skip(int(skip)).limit(int(limit))

    print("Processing tweets (silent db-only)...", end="", flush=True)
    count = 0
    try:
        for t in tweets:
            # fill the user id set from the different pieces of tweet information;
            # a set guarantees that a user is not counted twice for one tweet
            user_ids = {t["user"]}
            user_ids.update(t.get("user_mentions") or ())
            for nested_key in ("retweeted_status", "quoted_status"):
                nested = t.get(nested_key)
                if nested:
                    user_ids.add(nested.get("user"))
            user_ids.add(t.get("in_reply_to_user_id"))
            # optional fields may be stored as null (e.g. a tweet that is not
            # a reply); never count a "None" user
            user_ids.discard(None)
            # update user counts, then mark the tweet as processed
            upsert_user_ids_appearances(list(user_ids))
            upsert_tweet_info({"_id": t["_id"], processed_key: True})
            count += 1
    finally:
        # a no_cursor_timeout cursor is never reaped by the server, so it has
        # to be closed explicitly, even when processing fails midway
        tweets.close()
    print("processed %d new tweets, done." % (count))

In [None]:
# Number of tweets still waiting to be processed.
# `Collection.count` is deprecated and was removed in PyMongo 4;
# `count_documents` is its replacement for filtered counts.
total = api_db.col_tweets.count_documents({"processed": False})
print("Total to process: %d" % total)

In [None]:
# Build the parallel runner for `task`; presumably it splits `total` into
# batches and calls `task(skip, limit)` for each -- confirm in DynamicParallelism.
dp = DynamicParallelism(
    total,
    task,
    "tweet_processing",
    batch_size=5e6,
    max_threads=misc.CONFIG["collection"]["max_threads"],
)

In [None]:
# Start the runner configured above (presumably blocks until every batch has
# been handled -- confirm in DynamicParallelism.run).
dp.run()

In [None]:
# Marker showing that the notebook ran through to its last cell.
print("DONE")