# Daily Tweet Processing
Iterate tweets for a given day and perform some calculations

In [2]:
# required imports to access api_db, misc, misc.CONFIG, ...
import sys
sys.path = ['.', '..', '../..'] + sys.path
from collection import *

using 125 seed accounts and 0 hashtags
Database stats: 
{'avgObjSize': 500.6761813384505,
 'collections': 4,
 'dataSize': 14611233.0,
 'db': 'electionswatch',
 'fsTotalSize': 510770802688.0,
 'fsUsedSize': 445155201024.0,
 'indexSize': 1961984.0,
 'indexes': 8,
 'numExtents': 0,
 'objects': 29183,
 'ok': 1.0,
 'scaleFactor': 1.0,
 'storageSize': 7831552.0,
 'views': 0}
DB size (B): 14611233.0
DB size (MB): 13.93
DB size (GB): 0.01
Using API keys for app twitta-mos

Done initializing at 06:46PM on September 24, 2020.
----------------------------------------


<hr>
<h1 align="center">driver code</h1>

1. Define tasks to execute daily
2. Check if these have been executed for each day of the past month
3. Execute the ones that have not yet been executed (this ensures that new ones will be retroactively updated)
4. 

In [3]:
from slugify import slugify
import uuid

In [4]:
from datetime import datetime, timezone, timedelta

In [93]:
def get_filter_by_day(day):
    """Build a MongoDB range filter that matches every instant of ``day``.

    Args:
        day: a ``datetime``; its time-of-day is ignored, its date (and
            tzinfo, if any) is kept.

    Returns:
        dict: ``{"$gte": <midnight of day>, "$lt": <midnight of the next day>}``.
    """
    day_start = day.replace(hour=0, minute=0, second=0, microsecond=0)
    # Half-open interval [start, start + 1 day): an upper bound of 23:59:59
    # combined with "$lt" would exclude everything in the day's final second.
    day_end = day_start + timedelta(days=1)
    return {"$gte": day_start, "$lt": day_end}


In [94]:
class Task:
    """Per-day result store backed by the MongoDB collection ``task_<slug>``.

    Each document has the shape
    ``{_id: uuid string, day: day of analysis, data: what to store for this day}``.
    """

    def __init__(self, name):
        """Bind the task to its collection.

        Args:
            name: human-readable task name; it is slugified with "_" as the
                separator and used as the collection-name suffix.
        """
        self.name = slugify(name, separator="_")
        self.collection = api_db.db["task_%s" % self.name]
        # self.collection.create_index("day") # create index on day

    def exists_day(self, day):
        """Return True when a document whose ``day`` matches the filter for ``day`` exists."""
        return self.collection.find_one({"day": get_filter_by_day(day)}) is not None

    def insert(self, day, data):
        """Store ``data`` for ``day`` and return the ``_id`` of the new document."""
        document = {"_id": str(uuid.uuid4()), "day": day, "data": data}
        # Collection.insert() is deprecated since PyMongo 3.0 and removed in 4.0;
        # insert_one(...).inserted_id is the same value insert() used to return.
        return self.collection.insert_one(document).inserted_id

    def get_last_n(self, n=30):
        """Return up to ``n`` documents as a list, sorted by ``day`` descending."""
        assert n>0, "n must be greater than 0"
        return list(self.collection.find({}).sort([("day",-1)]).limit(n))

    def unzip_last_n(self, n=30):
        """Return ``(days, datas)`` for the last ``n`` entries as two parallel tuples.

        Both tuples are empty when the collection holds no documents.
        """
        entries = self.get_last_n(n)
        if not entries:
            # zip(*[]) yields nothing, so indexing its result would raise IndexError.
            return (), ()
        days, datas = zip(*((entry["day"], entry["data"]) for entry in entries))
        return days, datas

    def get_last(self):
        """Return the document with the greatest ``day``, or None when there is none."""
        entries = self.get_last_n(1)
        return entries[0] if entries else None

In [95]:
# Scratch check: show the current local datetime next to the current local date.
# NOTE(review): `date` is not imported in any visible cell - presumably it is
# provided by `from collection import *`; confirm.
datetime.today(), date.today()

(datetime.datetime(2020, 9, 23, 22, 4, 34, 974730), datetime.date(2020, 9, 23))

In [97]:
# Smoke test: report whether the "teste" task already has an entry for today.
t = Task("teste")
# t.insert(datetime.now(), "lol")
print(t.exists_day(datetime.today()))

True


In [113]:
from collections import defaultdict

---
Define tasks below - these should be replicated in the API (or the API could be made generally able to access any `task_%s` collection from any endpoint, assuming injection is not a concern at the moment).

In [119]:
# task 1 - count tweets by type
def count_tweets_by_type(day):
    """Count the tweets created on ``day`` per type and store the result.

    Returns immediately when the "count by type" task already has an entry
    for ``day``. Otherwise one aggregation over ``api_db.col_tweets`` counts
    retweets, quotes, replies and the total; "original" is derived as the
    total minus the other three, and the metrics are inserted into the task.

    Args:
        day: datetime identifying the day to process.
    """
    task = Task("count by type")
    if task.exists_day(day):
        return  # already processed

    # Output key -> field whose presence marks a tweet as that type.
    marker_fields = {
        "retweets": "retweeted_status",
        "quotes": "quoted_status",
        "replies": "in_reply_to_status_id",
    }
    facets = {
        label: [{"$match": {field: {"$exists": True}}}, {"$count": label}]
        for label, field in marker_fields.items()
    }
    facets["total"] = [{"$count": "total"}]

    # Lift the first element of each facet's result list to a top-level key.
    projection = {
        label: {"$arrayElemAt": ["$%s.%s" % (label, label), 0]}
        for label in facets
    }

    pipeline = [
        {"$match": {"created_at": get_filter_by_day(day)}},
        {"$facet": facets},
        {"$project": projection},
    ]
    first_row = next(api_db.col_tweets.aggregate(pipeline))

    # defaultdict(int): any counter absent from the row reads as 0.
    metrics = defaultdict(int, first_row)
    # NOTE(review): a tweet carrying more than one marker field is subtracted
    # once per marker - confirm such overlaps cannot occur in the stored data.
    metrics["original"] = metrics["total"] - sum(metrics[x] for x in ["retweets", "quotes", "replies"])
    task.insert(day, metrics)

# Main function that calls declared tasks
Each task must be manually registered

In [120]:
def main_caller(day):
    """Run every registered daily task for ``day`` inside a ``DoneMessage`` block."""
    banner = "Processing day %s" % day
    with DoneMessage(banner):
        # Tasks are registered manually: add each new task call here.
        count_tweets_by_type(day)

In [121]:
# Backfill window: 31 consecutive days starting on 2020-09-01.
first_day = datetime(2020, 9, 1)
# Midnight of the current day as a naive datetime, to compare with the naive
# window days. NOTE(review): UTC is assumed because stored `created_at`
# values look like naive UTC - confirm.
today_start = datetime.now(timezone.utc).replace(
    tzinfo=None, hour=0, minute=0, second=0, microsecond=0
)
for day in (first_day + timedelta(n) for n in range(31)):
    if day >= today_start:
        # A task skips any day it already has an entry for, so processing a
        # day that is not over yet would freeze partial (or zero) counts.
        break
    main_caller(day)

Processing day 2020-09-01 00:00:00...Done in 0.004s.
Processing day 2020-09-02 00:00:00...Done in 0.002s.
Processing day 2020-09-03 00:00:00...Done in 0.002s.
Processing day 2020-09-04 00:00:00...Done in 0.003s.
Processing day 2020-09-05 00:00:00...Done in 0.003s.
Processing day 2020-09-06 00:00:00...Done in 0.003s.
Processing day 2020-09-07 00:00:00...Done in 0.002s.
Processing day 2020-09-08 00:00:00...Done in 0.003s.
Processing day 2020-09-09 00:00:00...Done in 0.002s.
Processing day 2020-09-10 00:00:00...Done in 0.001s.
Processing day 2020-09-11 00:00:00...Done in 0.002s.
Processing day 2020-09-12 00:00:00...Done in 0.001s.
Processing day 2020-09-13 00:00:00...Done in 0.002s.
Processing day 2020-09-14 00:00:00...Done in 0.003s.
Processing day 2020-09-15 00:00:00...Done in 0.003s.
Processing day 2020-09-16 00:00:00...Done in 0.002s.
Processing day 2020-09-17 00:00:00...Done in 0.002s.
Processing day 2020-09-18 00:00:00...Done in 0.003s.
Processing day 2020-09-19 00:00:00...Done in 0

In [7]:
# Print the details returned for account id 20509689.
print(get_account_details(20509689))

{"created_at": "Tue Feb 10 12:25:47 +0000 2009", "description": "Twitter del principal partido pol\u00edtico de Espa\u00f1a. Ay\u00fadanos a construir un pa\u00eds mejor. Participa en nuestra apasionante tarea.", "favourites_count": 43699, "followers_count": 804968, "friends_count": 6936, "geo_enabled": true, "id": 20509689, "id_str": "20509689", "listed_count": 4767, "location": "Madrid, Espa\u00f1a", "name": "Partido Popular", "profile_background_color": "ECF5FC", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "profile_banner_url": "https://pbs.twimg.com/profile_banners/20509689/1589281691", "profile_image_url": "http://pbs.twimg.com/profile_images/1296325919313985536/pJ9JrJBg_normal.jpg", "profile_image_url_https": "https://pbs.twimg.com/profile_images/1296325919313985536/pJ9JrJBg_normal.jpg", "profile_link_color": "00579F", "profile_sidebar_border_color": 

In [15]:
# Manual timeline fetch: walk the user list and stop at the first user for
# which at least one tweet is returned.
find_params = find_exclude_invalid({
    "depth": 0
})
# NOTE(review): oldest_t is assigned but not used anywhere in this cell.
oldest_t = misc.CONFIG["collection"]["oldest_tweet"]
# users = api_db.col_users.find(find_params, no_cursor_timeout=True)
# Hard-coded single user in place of the database query above.
users = [{"_id": 2880932962}]
for u in users:
    print("getting tweets for: %s..." % u["_id"], end="", flush=True)
    # NOTE(review): a datetime is passed right after "since_id" - confirm this
    # is what get_tweets expects for that argument pair.
    tweets = get_tweets(u, api_db.api.GetUserTimeline, "since_id",  datetime(2020, 7, 18, tzinfo=timezone.utc), {"trim_user":True})
    # insert_tweets(tweets)
    # update_most_common_language(u, tweets) # removed because not useful here
    print("got %d new tweets, done." % len(tweets))
    if len(tweets) > 0: break

getting tweets for: 2880932962...Error for 2880932962 with error [{'message': 'Rate limit exceeded', 'code': 88}] with API Keys from 'feupadog3' (get_tweets:GetUserTimeline)
Using API keys for app feupg4
Error for 2880932962 with error [{'message': 'Rate limit exceeded', 'code': 88}] with API Keys from 'feupg4' (get_tweets:GetUserTimeline)
Using API keys for app feupg5
Error for 2880932962 with error [{'message': 'Rate limit exceeded', 'code': 88}] with API Keys from 'feupg5' (get_tweets:GetUserTimeline)
Using API keys for app feupg7
Error for 2880932962 with error [{'message': 'Rate limit exceeded', 'code': 88}] with API Keys from 'feupg7' (get_tweets:GetUserTimeline)
Using API keys for app feupadog4
Error for 2880932962 with error [{'message': 'Rate limit exceeded', 'code': 88}] with API Keys from 'feupadog4' (get_tweets:GetUserTimeline)
Using API keys for app feupadog5
Error for 2880932962 with error [{'message': 'Rate limit exceeded', 'code': 88}] with API Keys from 'feupadog5' (ge

In [24]:
# [print(str(t) + "\n") for t in tweets[-10:-1]]
# Print every tweet among the first 10000 in `tweets` that has an
# "in_reply_to_user_id" key.
# NOTE(review): the loop variable `t` rebinds the name that an earlier cell
# uses for a Task instance.
for t in tweets[0:10000]:
    if "in_reply_to_user_id" in t:
        print(t)

{'created_at': datetime.datetime(2020, 8, 26, 21, 55, 46, tzinfo=datetime.timezone.utc), 'favorite_count': 63, 'full_text': '@mariosabinof ao vivo agora no #PapoAntagonista com \n@FMouraBrasil &gt;  https://t.co/rHHGqo7XOt https://t.co/8jb6hz9gAT', 'hashtags': ['PapoAntagonista'], 'in_reply_to_user_id': 2758948984, 'lang': 'pt', 'retweet_count': 10, 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>', 'urls': [{'expanded_url': 'https://oantagonista.com/especial/ao-vivo-papo-antagonista-com-felipe-moura-brasil-estreia/', 'url': 'https://t.co/rHHGqo7XOt'}], 'user': 2880932962, 'user_mentions': [2758948984, 52849416], 'processed': False, '_id': 1298740951419101187, 'collected_at': datetime.datetime(2020, 9, 24, 17, 21, 6, 205217)}


### processing steps for each day
* 

In [5]:
# Resolve a fixed list of account ids and print each account's profile URL.
ids = [10228272,59591856,1097962618596327424,25073877,128372940,140034129,548469532,21390437,18839785,460343731,8802752,14594813,2573549978,550281859,9317502,939091,718009445863788544,1052575513007587328,70799317,420535612,18670794,3436726391,977668818045751297,759001618884939776,1053687552559980546,912230745057431552,523386042,1153514249681014784,1146329871418843136,1135281351677161472,3031071234,17535349,1210301128350584833,257075159,1297365654455156736,482866177,1706955559,939472695724568579,2790564174,1690412382,33520195,548741359,143538142,68740712,1041852964959383552,781067873737404416,14594698,45409867,40095953,7996082]
for _id in ids:
    # One account lookup per id; only "screen_name" is read from the result.
    uu = api_db.get_account_details(user_id=_id).AsDict()
    print("https://twitter.com/%s" % uu["screen_name"])

https://twitter.com/YouTube
https://twitter.com/Flamengo
https://twitter.com/AndreCVentura
https://twitter.com/realDonaldTrump
https://twitter.com/jairbolsonaro
https://twitter.com/IdiazAyuso
https://twitter.com/RicFazeres
https://twitter.com/SLBenfica
https://twitter.com/narendramodi
https://twitter.com/marcosbrazrio
https://twitter.com/g1
https://twitter.com/folha
https://twitter.com/kheayf
https://twitter.com/TiniStoessel
https://twitter.com/Estadao
https://twitter.com/JoeBiden
https://twitter.com/antoniocostapm
https://twitter.com/lmtf0981
https://twitter.com/UOL
https://twitter.com/FoxSportsBrasil
https://twitter.com/a_john
https://twitter.com/insoniascarvao
https://twitter.com/yourcenar1234
https://twitter.com/JanainaDoBrasil
https://twitter.com/Celinha741
https://twitter.com/Adriana_Az27
https://twitter.com/pablocasado_
https://twitter.com/missjulialee
https://twitter.com/vonderleyen
https://twitter.com/YesPISTLiberty
https://twitter.com/TeamYouTube
https://twitter.com/IainColle

In [12]:
# Naive datetime bounds used by the `created_at` range query in a later cell.
_from = datetime(2020, 7, 18)
_to = datetime(2020, 8, 25)
print(_from, _to)

2020-07-18 00:00:00 2020-08-25 00:00:00


In [13]:
# Inspect the configured "oldest_tweet" value of the collection settings.
misc.CONFIG["collection"]["oldest_tweet"]

datetime.datetime(2020, 9, 1, 0, 0, tzinfo=datetime.timezone.utc)

In [14]:

# Show one stored tweet whose created_at lies in [_from, _to), if there is any.
# find_one() is used instead of find(..., no_cursor_timeout=True) + break:
# breaking out of such a cursor leaves it open on the server, where it never
# times out.
tweet = api_db.col_tweets.find_one({"created_at": {"$gte": _from, "$lt": _to}})
if tweet is not None:
    print(tweet)

{'_id': 1296716396240543744, 'created_at': datetime.datetime(2020, 8, 21, 7, 50, 55), 'favorite_count': 22, 'full_text': 'Estatuto do Antigo Combatente já foi publicado. Entra em vigor a 1 de setembro. https://t.co/xjNtyPhLgk', 'lang': 'pt', 'quoted_status': {'created_at': 'Fri Aug 21 07:47:27 +0000 2020', 'favorite_count': 16, 'full_text': 'Cá está a confirmação de um momento histórico para os ex-Combatentes, e, sobretudo, para Portugal que assume a sua responsabilidade histórica.\nhttps://t.co/HDp8dpg3q9', 'lang': 'pt', 'retweet_count': 3, 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'urls': [{'expanded_url': 'http://pt.cision.com/files/press/2020/08/88171733.pdf', 'url': 'https://t.co/HDp8dpg3q9'}], 'user': 1690036087, '_id': 1296715526245421056, 'original': True}, 'retweet_count': 7, 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'urls': [{'expanded_url': 'https://twitter.com/JoaoCra

In [None]:
# End-of-notebook marker: printed once every cell above has run.
print("DONE")