In [1]:
import hashlib
import os

import pandas as pd
from tqdm import tqdm

from common.app import App
from common.database import Database
from common.helpers import Helpers

# Application context used by the rest of the notebook (provides root_dir below).
# NOTE(review): the exact effect of debug=True is defined in common.app — confirm.
app_run = App(debug=True)
# Database handle for "tweets.db", constructed with the application context.
db = Database("tweets.db", app=app_run)

In [2]:
# Source workbook, resolved under <root_dir>/src/resources/data.
filename = "COVID_Full_1_2.xlsx"
file_path = os.path.join(app_run.root_dir, "src", "resources", "data", filename)

# Load the workbook into a DataFrame (read_excel reads the first sheet by default).
xls = pd.read_excel(file_path)

In [3]:
# Remove the empty columns, which show up with an "Unnamed:" header prefix.
# Take an explicit copy so the column assignments below write to an independent
# frame rather than to a slice of the loaded one.
xls = xls.loc[:, ~xls.columns.str.startswith("Unnamed:")].copy()

# Cast the count and coding columns to the nullable Int32 dtype so that missing
# values stay integers-with-<NA> instead of being coerced to floats.
int_cols = ["retweets", "favorites", "topic", "subcat", "position", "frame"]
xls[int_cols] = xls[int_cols].astype("Int32")

# Add theme_hardcoded if it does not exist
if "theme_hardcoded" not in xls.columns:
    xls["theme_hardcoded"] = None

# Flag every row of this workbook with covid_theme = 1.
xls["covid_theme"] = 1

# Extract the tweet id from each URL. Per the original note, a tweet_id of 0
# means the id is not available for that row.
xls["tweet_id"] = xls["URL"].apply(Helpers.extract_id)

# NOTE: a previous, currently disabled approach gave rows with tweet_id == 0 a
# surrogate id: the SHA-1 of created_at + oldText + text, converted to a decimal
# string and truncated to its first 10 characters.

# Put tweet_id and covid_theme first, keeping the other columns in their
# existing order.
cols = ["tweet_id", "covid_theme"] + [
    col for col in xls.columns if col not in ("tweet_id", "covid_theme")
]
xls = xls[cols]

# At first, only select unproblematic tweets (tweet_id defined); the rows with
# tweet_id == 0 are set aside in xls_prob.
xls_noprob = xls[xls["tweet_id"] != 0]
xls_prob = xls[xls["tweet_id"] == 0]

In [4]:
cols_update = ["tweet_id", "topic", "subcat", "position", "frame"]

# Same columns with tweet_id moved to the last position, so that each entry
# reads [topic, subcat, position, frame, tweet_id].
cols = [name for name in cols_update if name != "tweet_id"] + ["tweet_id"]
to_update = xls_noprob[cols_update][cols]
print(to_update)

# One plain Python list per row.
tweet_entries = list(map(list, to_update.to_numpy()))

       topic  subcat  position  frame             tweet_id
0        601   60103         0      6  1216694717288632320
1        601   60103         1      3  1220400750951653378
2        601   60103         0      1  1220431393139937280
3        601   60103         0      1  1220705634074624000
4        601   60105         1      1  1220999781717364736
...      ...     ...       ...    ...                  ...
69724    603    <NA>      <NA>   <NA>  1377163599794171905
69725    603    <NA>      <NA>   <NA>  1377294162106712072
69726    602    <NA>      <NA>   <NA>  1377321624786046977
69727    603    <NA>      <NA>   <NA>  1377333267800686594
69728    602    <NA>      <NA>   <NA>  1377338219398791168

[66153 rows x 5 columns]


In [5]:
# Replace pandas' missing-value marker (pd.NA, printed as "<NA>") with None.
# The check is an identity test against pd.NA rather than a comparison of
# str(item) with "<NA>": it does not stringify every value and cannot clobber a
# genuine "<NA>" string. The list is rebuilt instead of mutated, so re-running
# the cell yields the same result.
tweet_entries = [
    [None if item is pd.NA else item for item in tweet]
    for tweet in tqdm(tweet_entries)
]
tweet_entries[-1]

100%|██████████| 66153/66153 [00:00<00:00, 716392.02it/s]


[602, None, None, None, '1377338219398791168']

In [14]:
# Drop the key column from the list of columns to update, if it is still there.
if "tweet_id" in cols_update:
    cols_update.remove("tweet_id")
cols_update

['topic', 'subcat', 'position', 'frame']

In [16]:
# Write the values of the cols_update columns to the database via update_many,
# passing "tweet_id" as the second argument.
# NOTE(review): presumably each entry holds the cols_update values followed by
# the tweet_id used as the lookup key, and `with db` manages the connection or
# transaction — confirm against common.database.Database.
with db:
    db.update_many(cols_update, "tweet_id", tweet_entries)

In [6]:
# Now the problematic tweets: the rows whose tweet_id is 0, which were set
# aside when xls_noprob was selected.
xls_prob

Unnamed: 0,tweet_id,covid_theme,created_at,handle,name,old_text,text,URL,type,retweets,favorites,topic,subcat,position,frame,theme_hardcoded
36,0,1,01/02/2020,@MinSoliSante,Ministère des Solidarités et de la Santé,[#CORONAVIRUSFRANCE] Que faire si vous revenez...,[#CORONAVIRUSFRANCE] Que faire si vous revenez...,,New,,,601,60103,0,3,
40,0,1,02/02/2020,@MinSoliSante,Ministère des Solidarités et de la Santé,[#CORONAVIRUSFRANCE] Vous vous rendez dans une...,[#CORONAVIRUSFRANCE] Vous vous rendez dans une...,,New,,,601,60103,0,1,
74,0,1,08/02/2020,@UN,United Nations,"The UN works for everyone, everywhere by 🇺🇳 pr...","The UN works for everyone, everywhere by 🇺🇳 pr...",,New,,,601,60109,0,6,
81,0,1,10/02/2020,@MinSoliSante,Ministère des Solidarités et de la Santé,Vous revenez d’une zone où circule le #coronav...,Vous revenez d’une zone où circule le #coronav...,,New,,,601,60103,0,3,
98,0,1,18/02/2020,@MinSoliSante,Ministère des Solidarités et de la Santé,[#CORONAVIRUS] Que faire si vous revenez d’une...,[#CORONAVIRUS] Que faire si vous revenez d’une...,,New,,,601,60103,0,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68376,0,1,09/04/2020,@WHO,World Health Organization (WHO),RT @WHOPhilippines: Ikaw at ang iyong mahal sa...,WHOPhilippines: Ikaw at ang iyong mahal sa buh...,0,Retweet,,,608,,,,
68391,0,1,07/06/2020,@WHO,World Health Organization (WHO),RT @WHOIndonesia: Selamat #HariPanganSedunia #...,WHOIndonesia: Selamat #HariPanganSedunia #Worl...,,Retweet,,,608,,,,
68392,0,1,08/06/2020,@WHO,World Health Organization (WHO),RT @WHO: विश्व स्वास्थय संगठन (WHO) के #COVID1...,WHO: विश्व स्वास्थय संगठन (WHO) के #COVID19 प्...,,Retweet,,,608,,,,
68393,0,1,08/06/2020,@WHO,World Health Organization (WHO),RT @WHO: 用你的语言观看世卫组织新闻发布会，了解全球应对COVID-19最新动态。 ...,WHO: 用你的语言观看世卫组织新闻发布会，了解全球应对COVID-19最新动态。 每周一、...,,Retweet,,,608,,,,


In [None]:
def preprocess(txt: str):
    """
    Sanitize a string for the specific needs of this insertor.

    The text is lowercased, newlines and spaces are removed, and the
    typographic apostrophe (’) is replaced by a plain one ('). ``None`` is
    returned unchanged.
    """
    if txt is None:
        return None

    # NOTE: earlier experiments are currently disabled: stripping a
    # "\xa0Mar 03, 2020\xa0"-style date stamp with a regex, NFKD unicode
    # normalization, HTML unescaping (e.g. "&amp;"), and truncating the result
    # to 100 or 140 characters.
    substitutions = (("\n", ""), (" ", ""), ("’", "'"))

    sanitized = txt.lower()
    for old, new in substitutions:
        sanitized = sanitized.replace(old, new)
    return sanitized

In [None]:
# Remember to classify the database if needed