# Daily Tweet Collection
Iterate over all existing user accounts and collect all of their tweets since `since_id` (the last collected tweet, ideally from the day before) or `misc.CONFIG["collection"]["oldest_tweet"]`

Additionally:
* identify suspensions on a daily basis and mark the suspension day.

In [None]:
# required imports to access api_db, misc, misc.CONFIG, ...
import sys
sys.path = ['.', '..', '../..'] + sys.path
from collection import *

### Conditional Execution
Each file needs to verify if it should be executed or not based on the configurations (for some files this is not optional but all should have this section, even if it is tautological). Example:
```python
if not misc.CONFIG["collection"]["execute_this_script"]: exit()
```

In [None]:
# Conditional execution
# No configuration flag is checked in this cell: `pass` is a no-op placeholder,
# kept so the notebook has the section described in the text above it.
pass

<hr>
<h1 align="center">driver code</h1>

Tweets are collected for users that either don't have a `most_common_language` yet or whose `most_common_language` is in `config.collection.search_languages`; in both cases only users with `depth<=2` are considered.

In [None]:
def task(skip, limit):
    """Collect new tweets for one batch of eligible users.

    Eligible users either have no ``most_common_language`` yet or have one
    that is listed in ``misc.CONFIG["collection"]["search_languages"]``, and
    in both cases have ``depth <= 2``. For every user in the batch the
    timeline is fetched through ``get_tweets``, the tweets are stored with
    ``insert_tweets`` and the user's language statistics are updated.

    Args:
        skip: number of matching users to skip before this batch starts.
        limit: maximum number of users to process in this batch.
    """
    from collections import Counter

    oldest_t = misc.CONFIG["collection"]["oldest_tweet"]
    since_id_key = "since_id"
    lang_key = "tweeted_languages"
    print("Collection with oldest tweet at %s and key for since_id '%s'" % (oldest_t, since_id_key))

    min_tweets_lang = misc.CONFIG["collection"]["min_tweets_before_restricting_by_language"]

    def update_most_common_language(user, tweets):
        """Add the languages of ``tweets`` to the per-language counts of ``user``.

        Assumes all tweets belong to ``user``. The counts are kept under
        ``user["tweeted_languages"]`` as ``{lang: count}``; tweets without a
        ``lang`` field are counted as ``"und"``. The user is persisted with
        ``upsert_user`` unless ``tweets`` is empty.
        """
        if not tweets:
            return
        counts = dict_key_or_default(user, lang_key, {})
        user[lang_key] = counts
        new_langs = Counter(dict_key_or_default(t, "lang", "und") for t in tweets)
        for code, amount in new_langs.items():
            counts[code] = dict_key_or_default(counts, code, 0) + amount
        total_tweets = sum(counts.values())
        # only include most_common_language if user has passed the tweet threshold
        # defined by config.collection.min_tweets_before_restricting_by_language
        if total_tweets >= min_tweets_lang:
            user["most_common_language"] = dict_key_for_max_val(counts)
        upsert_user(user)

    search_languages = misc.CONFIG["collection"]["search_languages"]
    find_params = find_exclude_invalid({
        "$or": [
            {"most_common_language": {"$exists": 0}},
            {"most_common_language": {"$in": search_languages}},
        ],
        "depth": {"$lte": 2},
    })
    # The stored language counts must be fetched together with since_id:
    # without them every run would restart the counter from an empty dict and
    # the threshold above would only ever see the tweets of the current run.
    # NOTE(review): assumes upsert_user writes back the fields it is given - confirm.
    projection = {since_id_key: True, lang_key: True}
    # NOTE(review): skip/limit without an explicit sort relies on the server
    # returning documents in the same order for every batch - confirm.
    users = api_db.col_users.find(find_params, projection, no_cursor_timeout=True).skip(skip).limit(limit)

    try:
        for u in users:
            print("getting tweets for: %s..." % u["_id"], end="", flush=True)
            tweets = get_tweets(u, api_db.api.GetUserTimeline, since_id_key, oldest_t, {"trim_user": True})
            insert_tweets(tweets)
            update_most_common_language(u, tweets)
            print("got %d new tweets, done." % len(tweets))
    finally:
        # A cursor opened with no_cursor_timeout=True is never reaped by the
        # server, so it has to be closed explicitly, also when the loop fails.
        users.close()

In [None]:
# Count the users eligible for collection. The filter has the same shape as
# the one built inside task(): no most_common_language yet, or one of the
# configured search languages, and depth <= 2 in both cases.
search_languages = misc.CONFIG["collection"]["search_languages"]
find_params = find_exclude_invalid(
    {
        "$or": [
            {"most_common_language": {"$exists": 0}},
            {"most_common_language": {"$in": search_languages}},
        ],
        "depth": {"$lte": 2},
    }
)
total = api_db.col_users.count(find_params)
print("Total to process: %d" % total)

In [None]:
# Set up the runner that will call task(skip, limit) for the `total` users.
# NOTE(review): batching/threading semantics live in DynamicParallelism,
# which is defined outside this notebook - confirm there.
dp = DynamicParallelism(
    total,
    task,
    "tweet_collection",
    batch_size=50_000,
    max_threads=misc.CONFIG["collection"]["max_threads"],
)

In [None]:
# Start the collection configured in the previous cell.
# NOTE(review): presumably blocks until every batch has been processed - confirm.
dp.run()

In [None]:
# Marker printed once the cells above have finished executing.
print("DONE")