In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import hashlib
import os
import unicodedata
import re
from html import unescape

import pandas as pd
from tqdm import tqdm
from thefuzz import fuzz, process

from common.app import App
from common.database import Database
from common.helpers import Helpers

app_run = App(debug=True)
db = Database("tweets.db", app=app_run)

In [3]:
filename = "COVID_Full_1_2.xlsx"
file_path = os.path.join(app_run.root_dir, "src", "resources", "data", filename)
pkl_xls = "COVID_Full_1_2.pkl"
pkl_path = os.path.join(app_run.root_dir, "src", "resources", "data", pkl_xls)


In [4]:
xls = pd.read_excel(file_path)
xls.to_pickle(pkl_path)

In [4]:
xls = pd.read_pickle(pkl_path)

In [8]:
# Remove empty columns
xls = xls.loc[:, ~xls.columns.str.startswith("Unnamed:")]

# Change type
int_cols = ["retweets", "favorites", "topic", "subcat", "position", "frame"]
xls[int_cols] = xls[int_cols].astype("Int32")

# Add theme_hardcoded if it does not exist
if "theme_hardcoded" not in xls.columns:
    xls["theme_hardcoded"] = None

# Add tweet_id and covid_theme columns
xls["covid_theme"] = 1
xls["tweet_id"] = None

# Extract ids
xls["tweet_id"] = xls["URL"].apply(Helpers.extract_id)

# If tweet_id==0, then it's na
# hash the tweet with the date, oldText and text
# and use it as id
# mask = xls["tweet_id"] == 0

# xls.loc[mask, ["tweet_id"]] = (
#     xls[mask]["created_at"].astype(str)
#     + xls[mask]["oldText"].astype(str)
#     + xls[mask]["text"].astype(str)
# )
# xls.loc[mask, ["tweet_id"]] = xls["tweet_id"].apply(
#     lambda x: str(int(hashlib.sha1(bytes(x, "utf-8")).hexdigest(), 16))[:10]
# )

# Reorder columns
cols = xls.columns.tolist()
cols.remove("covid_theme")
cols.remove("tweet_id")
cols.insert(0, "covid_theme")
cols.insert(0, "tweet_id")
xls = xls[cols]

# At first, only select unproblematic tweets (tweet_id defined)
xls_noprob = xls[xls["tweet_id"] != 0]
xls_prob = xls[xls["tweet_id"] == 0]

In [9]:
def na_to_none(entries):
    """
    Takes a list of entries, loop over all all item of all entries
    and convert <NA> and nan to None
    """
    
    for tweet in tqdm(entries):
        for i, item in enumerate(tweet):
            if str(item) in ["<NA>", "nan"]:
                tweet[i] = None

In [10]:
cols_update = ["tweet_id", "topic", "subcat", "position", "frame"]
to_update = xls_noprob[cols_update]

# Reorder cols
cols = to_update.columns.tolist()
cols.remove("tweet_id")
cols.insert(len(cols_update), "tweet_id")
to_update = to_update[cols]
print(to_update)

tweet_entries = [list(entry) for entry in to_update.to_numpy()]

       topic  subcat  position  frame             tweet_id
0        601   60103         0      6  1216694717288632320
1        601   60103         1      3  1220400750951653378
2        601   60103         0      1  1220431393139937280
3        601   60103         0      1  1220705634074624000
4        601   60105         1      1  1220999781717364736
...      ...     ...       ...    ...                  ...
69724    603    <NA>      <NA>   <NA>  1377163599794171905
69725    603    <NA>      <NA>   <NA>  1377294162106712072
69726    602    <NA>      <NA>   <NA>  1377321624786046977
69727    603    <NA>      <NA>   <NA>  1377333267800686594
69728    602    <NA>      <NA>   <NA>  1377338219398791168

[66153 rows x 5 columns]


In [11]:
na_to_none(tweet_entries)
tweet_entries[-5:]

100%|██████████| 66153/66153 [00:00<00:00, 374969.48it/s]


[[603, None, None, None, '1377163599794171905'],
 [603, None, None, None, '1377294162106712072'],
 [602, None, None, None, '1377321624786046977'],
 [603, None, None, None, '1377333267800686594'],
 [602, None, None, None, '1377338219398791168']]

In [12]:
cols_update.remove("tweet_id") if "tweet_id" in cols_update else cols_update
cols_update

['topic', 'subcat', 'position', 'frame']

In [13]:
# Insert tweets
# with db:
    # db.update_many(cols_update, "tweet_id", tweet_entries)

## Problematic tweets

In [14]:
with db:
    all_tws = db.get_all_tweets()
all_tws = Helpers.df_from_db(all_tws)

In [15]:
print(f"{len(xls_prob)=}")
print(f"{len(all_tws)=}")

len(xls_prob)=3576
len(all_tws)=240375


In [18]:
# Only keep problematic tweets from db
all_tws = all_tws[all_tws["handle"].isin(['@MinSoliSante', '@UN', '@DrTedros', '@WHO', '@enmarchefr', '@FCDOGovUK'])]
print(f"{len(all_tws)=}")

len(all_tws)=48910


In [44]:
xls_li = xls_prob.values.tolist()

In [20]:
all_tws = all_tws.values.tolist()

In [21]:
def preprocess(txt: str):
    """
    Sanitize a string for the specific needs of this insertor.
    """

    if txt is None or isinstance(txt, float):
        return None
    
    txt = txt[:140]
        
    # Replace and format values
    regex = r"(\xa0\w{3}\s\d{2}.\s\d{4}\xa0)"  #  "\xa0Mar 03, 2020\xa0"
    txt = re.sub(regex, "", txt, 1)
    txt = unicodedata.normalize("NFKD", txt)
    txt = unescape(txt)  # for & and >
    txt = txt.replace("\n", "").replace(" ", "").replace("’", "'")
    txt = txt.replace("&amp;", "&")

    # return txt[:100]
    # return txt[:140]
    return txt

In [22]:
for tw in all_tws:
    if tw[0] == "1297392008":
        print(tw)
        print(len(tw))
        print(tw[5])
        print(tw[6])

        print(preprocess(tw[5]) == preprocess(xls_li[0][6]))

['1297392008', 1, '01/02/2020', '@MinSoliSante', 'Ministère des Solidarités et de la Santé', '[#CORONAVIRUSFRANCE] Que faire si vous revenez d’une zone à risque ? Retrouvez l’ensemble des informations et recommandations sanitaires sur 👉\xa0https://t.co/SeQSsKcgnC\xa0Feb 01, 2020\xa0', None, None, 'New', nan, nan, 601, 60103.0, 0, 3, None]
16
[#CORONAVIRUSFRANCE] Que faire si vous revenez d’une zone à risque ? Retrouvez l’ensemble des informations et recommandations sanitaires sur 👉 https://t.co/SeQSsKcgnC Feb 01, 2020 
None
True


In [23]:
xls_li_prep = xls_li.copy()
for tw in tqdm(xls_li_prep):
    tw[5], tw[6] = preprocess(tw[5]), preprocess(tw[6])

100%|██████████| 3576/3576 [00:00<00:00, 50711.47it/s]


In [24]:
all_tws_prep = all_tws.copy()
for tw in tqdm(all_tws_prep):
    tw[5], tw[6] = preprocess(tw[5]), preprocess(tw[6])

100%|██████████| 48910/48910 [00:00<00:00, 67721.33it/s]


In [46]:
found_count = 0
for i, tw_db in tqdm(enumerate(all_tws_prep), total=len(all_tws_prep)):
    # For every tweet in the database
    
    db_old_text = tw_db[5]
    db_text = tw_db[6]
    # print(f"{db_old_text=}")
    # print(f"{db_text=}")

    for j, tw_prob in enumerate(xls_li_prep):
        # Look if there is a matching problematic tweet
        old_text = tw_prob[5]
        text = tw_prob[6] 

        # print(f"{db_old_text=}")
        # print(f"{db_text=}")
        # print(f"{old_text=}")
        # print(f"{text=}")
        # print(f"{new_topic=}")
        # print(f"{new_subcat=}")
        # print(f"{new_pos=}")
        # print(f"{new_frame=}")
        # break

        # if db_text in (old_text, text):
        if db_text == text:
            found_count += 1

            new_topic = tw_prob[11]
            new_subcat = tw_prob[12]
            new_pos = tw_prob[13]
            new_frame = tw_prob[14]

            all_tws[i][11] = new_topic
            all_tws[i][12] = new_subcat 
            all_tws[i][13] = new_pos
            all_tws[i][14] = new_frame 

            if xls_li_prep[j][-1] != "updated" and xls_li[j][-1] != "updated":
                xls_li_prep[j].append("updated")  # keep trace of which one was updated
                xls_li[j].append("updated") 
            break
    # if i == 200:
    #     break
print(found_count)
# print(all_tws[197])
# print(all_tws_prep[197])
        

100%|██████████| 48910/48910 [01:03<00:00, 773.77it/s]

2433





In [67]:
def updated_count(l: list, term="updated"):
    """
    Returns the number of term tweets from list.
    """
    
    return sum(tw[-1] == term for tw in l)

In [48]:
updated_count(xls_li_prep)
updated_count(xls_li)

0

In [49]:
df_all_tws = Helpers.df_from_db(all_tws)
df_all_tws.head(2)

Unnamed: 0,tweet_id,covid_theme,created_at,handle,name,old_text,text,url,type,retweets,favorites,topic,subcat,position,frame,theme_hardcoded
0,1237326281345204226,0,10/03/2020 10:35:48,@FCDOGovUK,"Foreign, Commonwealth & Development Office",RT@tradegovuk:AUK-UStradedealwillleveluptheUKb...,AUK-UStradedealwillleveluptheUKbyhelpingbusine...,https://twitter.com/FCDOGovUK/status/123732628...,Retweet,62.0,0.0,,,,,
1,1237329006699130881,0,10/03/2020 10:46:38,@FCDOGovUK,"Foreign, Commonwealth & Development Office",RT@morton_wendy:Pleasuretotakepartintoday's@Be...,Pleasuretotakepartintoday's@BerlinProcess#Thin...,https://twitter.com/FCDOGovUK/status/123732900...,Retweet,20.0,0.0,,,,,


In [50]:
cols_update = ["tweet_id", "topic", "subcat", "position", "frame"]
to_update = df_all_tws[cols_update]

# Reorder cols
cols = to_update.columns.tolist()
cols.remove("tweet_id")
cols.insert(len(cols_update), "tweet_id")
to_update = to_update[cols]
print(to_update)

tweet_entries = [list(entry) for entry in to_update.to_numpy()]

      topic subcat position frame             tweet_id
0      None    NaN     None  None  1237326281345204226
1      None    NaN     None  None  1237329006699130881
2      None    NaN     None  None  1237356860556091392
3      None    NaN     None  None  1237402500447207430
4      None    NaN     None  None  1237416449444110336
...     ...    ...      ...   ...                  ...
48905  None    NaN     None  None  1211948316570849281
48906  None    NaN     None  None  1211947093235044353
48907  None    NaN     None  None  1211946511627669505
48908  None    NaN     None  None  1211803903194480640
48909  None    NaN     None  None  1211803656628195328

[48910 rows x 5 columns]


In [51]:
na_to_none(tweet_entries)
tweet_entries[:5]

100%|██████████| 48910/48910 [00:00<00:00, 384540.31it/s]


[[None, None, None, None, '1237326281345204226'],
 [None, None, None, None, '1237329006699130881'],
 [None, None, None, None, '1237356860556091392'],
 [None, None, None, None, '1237402500447207430'],
 [None, None, None, None, '1237416449444110336']]

In [52]:
cols_update.remove("tweet_id") if "tweet_id" in cols_update else cols_update
cols_update

['topic', 'subcat', 'position', 'frame']

In [35]:
# Update tweets
with db:
    updated = db.update_many(cols_update, "tweet_id", tweet_entries)
print(updated)

48910


## Problematic of problematic

In [61]:
tws_prob_prep = [tw for tw in xls_li_prep if tw[-1] != "updated"]
tws_prob = [tw for tw in xls_li if tw[-1] != "updated"]

In [56]:
updated_count(tws_prob) == updated_count(tws_prob_prep)

True

In [75]:
found_count = 0
for i, tw_db in tqdm(enumerate(all_tws_prep), total=len(all_tws_prep)):
    # For every tweet in the database
    
    db_old_text = tw_db[5]
    db_text = tw_db[6]

    # print(f"{db_old_text=}")
    # print(f"{db_text=}")

    for j, tw_prob in enumerate(tws_prob_prep):
        # Look if there is a matching problematic tweet
        old_text = tw_prob[5]
        # old_text = old_text if old_text is not None else ""
        text = tw_prob[6] 
        # text = text if text is not None else ""

        # print(f"{db_old_text=}")
        # print(f"{db_text=}")
        # print(f"{old_text=}")
        # print(f"{text=}")
        # print(f"{new_topic=}")
        # print(f"{new_subcat=}")
        # print(f"{new_pos=}")
        # print(f"{new_frame=}")
        # break

        # if db_text in (old_text, text):
        if db_text == text or db_old_text == old_text:
            found_count += 1

            new_topic = tw_prob[11]
            new_subcat = tw_prob[12]
            new_pos = tw_prob[13]
            new_frame = tw_prob[14]
            # print(tw_db)
            # print(tw_prob) 
            # print(i)
            all_tws[i][11] = new_topic
            all_tws[i][12] = new_subcat 
            all_tws[i][13] = new_pos
            all_tws[i][14] = new_frame 

            if tws_prob_prep[j][-1] != "updated2":
                tws_prob_prep[j].append("updated2")  # keep trace of which one was updated
            break
print(found_count)
# print(all_tws[197])
# print(all_tws_prep[197])
        

100%|██████████| 48910/48910 [00:27<00:00, 1748.02it/s]

15050





In [77]:
# Update in DB

df_all_tws = Helpers.df_from_db(all_tws)
df_all_tws.head(2)

cols_update = ["tweet_id", "topic", "subcat", "position", "frame"]
to_update = df_all_tws[cols_update]

# Reorder cols
cols = to_update.columns.tolist()
cols.remove("tweet_id")
cols.insert(len(cols_update), "tweet_id")
to_update = to_update[cols]

tweet_entries = [list(entry) for entry in to_update.to_numpy()]
na_to_none(tweet_entries)

cols_update.remove("tweet_id") if "tweet_id" in cols_update else cols_update
tweet_entries[:5]

100%|██████████| 48910/48910 [00:00<00:00, 363435.61it/s]


[[None, None, None, None, '1237326281345204226'],
 [None, None, None, None, '1237329006699130881'],
 [602.0, None, None, None, '1237356860556091392'],
 [None, None, None, None, '1237402500447207430'],
 [None, None, None, None, '1237416449444110336']]

In [78]:
# Update tweets
with db:
    updated = db.update_many(cols_update, "tweet_id", tweet_entries)
print(updated)

48910


## Still a problem

In [80]:
# print(tws_prob_prep[:5])
print("len tws_probs", len(tws_prob_prep))
print("count updated", updated_count(tws_prob_prep, "updated2"))

len tws_probs 1392
count updated 1216


In [70]:
tws_still_prob = [tw for tw in tws_prob_prep if tw[-1] != "updated2"]
print(tws_still_prob[:2])
print(len(tws_still_prob))

[[0, 1, '21/02/2020', '@MinSoliSante', 'Ministère des Solidarités et de la Santé', "[#CORONAVIRUS]Quefairesivousrevenezd'unezoneàrisque?Retrouvezl'ensembledesinformationsetrecommandationssanitairessurle#c", "[#CORONAVIRUS]Quefairesivousrevenezd'unezoneàrisque?Retrouvezl'ensembledesinformationsetrecommandationssanitairessurle#c", nan, 'New', <NA>, <NA>, 601, 60103, 0, 3, None], [0, 1, '15/02/2020', '@MinSoliSante', 'Ministère des Solidarités et de la Santé', "[#CORONAVIRUSFRANCE]Évolutiondelasituation,réponsesàvosinterrogationsetconseils...Retrouvezl'ensembledel'informationsanitaires", "[#CORONAVIRUSFRANCE]Évolutiondelasituation,réponsesàvosinterrogationsetconseils...Retrouvezl'ensembledel'informationsanitaires", nan, 'New', <NA>, <NA>, 602, <NA>, <NA>, <NA>, None]]
176


In [91]:
for i, tw in enumerate(tws_still_prob, start=1):
    print(i, "/", len(tws_still_prob))
    print("created_at:", tw[2])
    print("handle:", tw[3])
    print("text:", tw[6])
    print(tw[11:])
    print("\n")

# Corrected by hand!

1 / 176
created_at: 21/02/2020
handle: @MinSoliSante
text: [#CORONAVIRUS]Quefairesivousrevenezd'unezoneàrisque?Retrouvezl'ensembledesinformationsetrecommandationssanitairessurle#c
[601, 60103, 0, 3, None]


2 / 176
created_at: 15/02/2020
handle: @MinSoliSante
text: [#CORONAVIRUSFRANCE]Évolutiondelasituation,réponsesàvosinterrogationsetconseils...Retrouvezl'ensembledel'informationsanitaires
[602, <NA>, <NA>, <NA>, None]


3 / 176
created_at: 22/02/2020
handle: @MinSoliSante
text: ☎️Unnumérogratuitd'informationsurle#Coronavirus#COVID19estdisponible7jourssur7de9h00à19h00.Appelezle0800130000(appel
[602, <NA>, <NA>, <NA>, None]


4 / 176
created_at: 23/02/2020
handle: @MinSoliSante
text: ☎️Unnumérogratuitd'informationsurle#Coronavirus#COVID19estdisponible7jourssur7de9h00à19h00.Appelezle0800130000(appel
[602, <NA>, <NA>, <NA>, None]


5 / 176
created_at: 28/02/2020
handle: @MinSoliSante
text: [#CORONAVIRUS]Évolutiondelasituation,réponsesàvosinterrogationsetconseils...Retrouvezl'en

In [None]:
# Remember to classify the database if needed