In [7]:
"""
Check if created_at field in database matches what is returned from the Twitter API.

See descriptives/eda.ipynb (1.1) and descriptives/figures/july_23.png for more information
"""

import sys, os

sys.path.append(os.path.abspath(os.path.join("..", "src")))

%load_ext autoreload
%autoreload 2
import re
import time

import pandas as pd
import numpy as np
from tqdm import tqdm

tqdm.pandas()

from common.app import App
from common.database import Database
from common.api import Api
from common.helpers import Helpers

# Application object (debug disabled); it is handed to the Database below.
app_run = App(debug=False)
# Handle on "tweets.db"; later cells use it as a context manager (`with db:`).
db = Database("tweets.db", app=app_run)
# API client used further down to re-fetch tweets by id.
api = Api()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Read every stored tweet; database access happens inside the `with db:` block.
with db:
    tws = db.get_all_tweets()
print(f"{len(tws)=}")
# Convert what was read into the DataFrame the following cells work on.
df_all = Helpers.df_from_db(tws)

len(tws)=238523


In [32]:
# Build a datetime "date" column from the stored created_at values.
# Helpers.convert_date is applied per value and its result is parsed as
# day/month/year (presumably it strips the time part — TODO confirm).
df_all["date"] = pd.to_datetime(
    df_all["created_at"].apply(Helpers.convert_date),
    format="%d/%m/%Y",
)

# Start with the tweets dated July 23 only.
df = df_all.loc[df_all["date"].eq(pd.to_datetime("2020-07-23"))]
print(f"{len(df)=}")
print("Date of subset", df["date"].unique())

len(df)=1738
Date of subset ['2020-07-23T00:00:00.000000000']


In [17]:
# Tweet ids of the July 23 subset, as a plain Python list for the API call.
tws_idx = df["tweet_id"].tolist()
print(len(tws_idx))

1738


In [18]:
# Retrieve updated tweets
# NOTE: this rebinds `tws` (until now the result of db.get_all_tweets()) to
# the API result, which the following cells use as a DataFrame.
tws = api.get_tweets_by_ids(tws_idx)
print(f"{len(tws)=}")

Completing tweets..


  0%|          | 0/1738 [00:00<?, ?it/s]

Starting loop


1800it [00:24, 73.02it/s]

len(tws)=1738





In [25]:
# Show the fetched rows that came back without a created_at value.
tws.loc[tws["created_at"].isna()]
# 1293607391737663493 deleted tweet
# 1286210146167861249 deleted tweet

Unnamed: 0,tweet_id,covid_theme,created_at,handle,name,old_text,text,type,retweets,favorites,topic,subcat,position,frame,theme_hardcoded,url
717,1293607391737663493,,,,,,,,,,,,,,,
1513,1286210146167861249,,,,,,,,,,,,,,,


In [28]:
# Drop the deleted tweets, i.e. the rows the API returned without created_at.
# Filtering on the missing value instead of hard-coded index labels keeps this
# cell safe to re-run (dropping fixed labels twice raises KeyError) and
# independent of the order in which the API returned the tweets.
tws = tws.dropna(subset=["created_at"])

In [30]:
# Parse the re-fetched created_at values into a datetime "date" column
# (Helpers.convert_date per value, then day/month/year parsing).
tws["date"] = pd.to_datetime(
    tws["created_at"].apply(Helpers.convert_date),
    format="%d/%m/%Y",
)

In [34]:
# We no longer have only July 23: the created_at values returned by the API
# also contain other dates.
tws["created_at"].unique()

array(['23/07/2020 05:45:51', '23/07/2020 07:00:00',
       '23/07/2020 07:37:59', ..., '10/12/2020 12:38:34',
       '16/03/2020 20:35:50', '02/06/2020 08:29:00'], dtype=object)

In [36]:
# Number of fetched tweets left after dropping the ones without created_at.
len(tws)

1736

In [38]:
# Peek at the first row as a plain list to see the positional column layout.
tws.head(1).values.tolist()[0]

[1286175675540557824,
 None,
 '23/07/2020 05:45:51',
 '@Mitte_Centre',
 'Die Mitte – Le Centre',
 'RT @paganini_nr: Für mich ist klar: Die #Kündigungsinitiative steht für einen Alleingang der Schweiz in Bildung, Wirtschaft &amp; Forschung. Du…',
 'Für mich ist klar: Die #Kündigungsinitiative steht für einen Alleingang der Schweiz in Bildung, Wirtschaft &amp; Forschung. Durch die Annahme der Initiative isolieren wir die Schweiz. Das müssen wir verhindern! @kuendigung_nein #abst20\nhttps://t.co/bm4gnLnRog',
 'Retweet',
 8,
 0,
 None,
 None,
 None,
 None,
 None,
 'https://twitter.com/Mitte_Centre/status/1286175675540557824',
 Timestamp('2020-07-23 00:00:00')]

In [46]:
# Update in db
# Build (created_at, tweet_id) pairs by column name rather than by position,
# so the pairs stay correct if the column order of `tws` ever changes.
# .tolist() yields plain Python values, as the positional version did.
to_update = list(zip(tws["created_at"].tolist(), tws["tweet_id"].tolist()))
print(f"{len(to_update)=}")

# Presumably sets created_at for the row matching each tweet_id and returns
# the number of rows changed — TODO confirm against Database.update_many.
with db:
    updated = db.update_many("created_at", "tweet_id", to_update)
print(f"{updated} tweets updated")

len(to_update)=1736
1736 tweets updated


In [47]:
# Before the update, 1738 tweets were dated July 23; reload and count again.
with db:
    tws = db.get_all_tweets()
print(f"{len(tws)=}")
df_all = Helpers.df_from_db(tws)

# Rebuild the datetime "date" column from the (now updated) created_at values.
df_all["date"] = pd.to_datetime(
    df_all["created_at"].apply(Helpers.convert_date),
    format="%d/%m/%Y",
)

# Same July 23 filter as before the update.
df = df_all.loc[df_all["date"].eq(pd.to_datetime("2020-07-23"))]
print(f"{len(df)=}")
# Only 434 rows are left, which is expected: the created_at values fetched
# from the API for the former July 23 rows include other dates, so those rows
# no longer match the filter.

len(tws)=238523
len(df)=434
