In [1]:
"""
Extract mentions from tweets and reshape into long format instead of wide
"""

import sys, os

sys.path.append(os.path.abspath(os.path.join("..", "src")))

%load_ext autoreload
%autoreload 2
import re
import time

import pandas as pd
import numpy as np
from tqdm import tqdm

tqdm.pandas()

from common.database import Database
from common.app import App
from common.helpers import Helpers

app_run = App(debug=True)
db = Database("tweets.db", app=app_run)

In [2]:
with db:
    tws = db.get_all_tweets()
print(len(tws))
df_all = Helpers.df_from_db(tws)

238523


In [4]:
# Convert date
df_all["date"] = df_all["created_at"].apply(Helpers.convert_date)
df_all["date"] = pd.to_datetime(df_all["date"], format="%d/%m/%Y")

# Sort df
df_all_sorted = Helpers.sort_timerange(df_all)
print(f"{len(df_all_sorted)=}") 

len(df_all_sorted)=185749


In [5]:
def extract_mentions(txt):
    """
    Extract the twitter mentions of a string

    Returns the number of unqiue mentions and the unique mentions
    """

    if not txt:
        return

    r1 = r"@[A-Za-z0-9_-]*"
    mentions = set(re.findall(r1, txt))
    
    return len(mentions), mentions

In [6]:
# Testing
txt = "@manue123 ahahh @olive test @opotrac shouldnot be @potrac counted @potrac @3t6___adsasd @wil-bail"
extract_mentions(txt)

(6,
 {'@3t6___adsasd', '@manue123', '@olive', '@opotrac', '@potrac', '@wil-bail'})

In [7]:
# Get maximum number of mentions to know how many columns needed
max_old = df_all_sorted["old_text"].str.count(r"@\w+").max()  # 22
max_txt = df_all_sorted["text"].str.count(r"@\w+").max()  # 26
max(max_old, max_txt)

26.0

In [8]:
# Create the new columns
df = df_all_sorted.assign(**{f"target_{i:02}": np.nan for i in range(1, 27)})

In [9]:
def insert_mentions(row):
    old = row["old_text"] if row["old_text"] is not None else " "
    cur = row["text"] if row["text"] is not None else " "
    handle = row["handle"]

    txt = old + " " + cur
    _, mentions = extract_mentions(txt)

    if handle in mentions:
        mentions.remove(handle)

    for i, m in enumerate(mentions, start=1):
        row.loc[f"target_{i:02}"] = m
    return row

In [10]:
df = df.progress_apply(insert_mentions, axis=1)

100%|████████████████████████████████████| 185749/185749 [00:53<00:00, 3487.71it/s]


In [11]:
df.sample(5).loc[:, "target_01":"target_27"]

Unnamed: 0,target_01,target_02,target_03,target_04,target_05,target_06,target_07,target_08,target_09,target_10,...,target_18,target_19,target_20,target_21,target_22,target_23,target_24,target_25,target_26,target_27
113068,@NAM_ChairAZ,@DrTedros,,,,,,,,,...,,,,,,,,,,
51765,@JosepBorrellF,@AymanHsafadi,@AranchaGlezLaya,@UfmSecretariat,,,,,,,...,,,,,,,,,,
33345,,,,,,,,,,,...,,,,,,,,,,
598,@PSSuisse,,,,,,,,,,...,,,,,,,,,,
16630,@EASO,@NinaGregoriEASO,@ICMPD,,,,,,,,...,,,,,,,,,,


In [12]:
df.loc[:, "target_01":"target_27"].count()

target_01    130809
target_02     49475
target_03     19598
target_04      8750
target_05      4559
target_06      2724
target_07      1683
target_08      1038
target_09       673
target_10       459
target_11       286
target_12       174
target_13        91
target_14        69
target_15        32
target_16        23
target_17        13
target_18        12
target_19        11
target_20         8
target_21         6
target_22         4
target_23         3
target_24         2
target_25         2
target_26         2
target_27         1
dtype: int64

In [14]:
# Intermediary step, export to pickle
df.to_pickle("interactive/data/pkl/df_mentions_extracted.pkl")

In [None]:
# If needed, load pickle
#df = pd.read_pickle("interactive/data/pkl/df_mentions_extracted.pkl")

In [15]:
long_df = pd.wide_to_long(df, "target", i="tweet_id", j="target_id", sep="_").dropna(subset=["target"])

In [17]:
long_df.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,favorites,old_text,position,retweets,handle,topic,date,type,covid_theme,frame,name,subcat,text,theme_hardcoded,url,created_at,target
tweet_id,target_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1255032256747446272,1,0.0,RT @migrpolcentre: 'The effects of #COVID19 on...,,33.0,@EUHomeAffairs,605.0,2020-04-28,Retweet,1,,EUHomeAffairs,,'The effects of #COVID19 on #Mobility and #Cit...,,https://twitter.com/EUHomeAffairs/status/12550...,28/04/2020 07:13:02,@migrpolcentre
1251500469312278535,3,,RT @DrTedros: Looking forward to the One World...,,,@WHO,605.0,2020-04-18,Retweet,1,,World Health Organization (WHO),,RT @DrTedros: Looking forward to the One World...,,https://twitter.com/WHO/status/125150046931227...,18/04/2020 13:18:58,@StephenAtHome
1214136809556455425,1,822.0,,,119.0,@Conservatives,,2020-01-06,New,0,,Conservatives,,January 6th: 🙋‍♀️🔵🇬🇧\n\nIntroducing @Miriam_Ca...,,https://twitter.com/Conservatives/status/12141...,06/01/2020 10:49:07,@Miriam_Cates
1314206955750137857,2,0.0,RT @BCWBrussels: Ranking 13th place in the #In...,,1.0,@GreensEFA,,2020-10-08,Retweet,0,,Greens/EFA in the EU Parliament 🌍,,Ranking 13th place in the #InfluenceIndex...\n...,,https://twitter.com/GreensEFA/status/131420695...,08/10/2020 14:12:09,@BCWBrussels
1272244325787852802,1,0.0,RT @sarahelhairy: « La #République n’effacera ...,,35.0,@MoDem,,2020-06-14,Retweet,0,,MoDem,,"« La #République n’effacera aucune trace, ni a...",0.0,https://twitter.com/MoDem/status/1272244325787...,14/06/2020 19:07:39,@sarahelhairy


In [45]:
# Export to pickle
long_df.to_pickle("interactive/data/pkl/long_df.pkl")

In [6]:
# Load from pickle if necessary
long_df = pd.read_pickle("interactive/data/pkl/long_df.pkl")

In [22]:
"""
Categorization of tweets
See common/helpers.py

tweets about covid: 
    (topic in topics_cov)  -> 1.1
    OR {(covid_theme == 1) 
        BUT NOT IF ((topic in topics_not_cov) OR (theme_hardcoded == 0))}  -> 1.2 
    OR ((topic is None) AND (covid_theme == 1) AND (theme_hardcoded is None))  -> 1.3

tweets not about covid:
    (topic == 608)  -> 2.1
    OR ((theme_hardcoded == 0) BUT NOT IF (topic in topics_cov)) -> 2.2
    OR ((covid_theme == 0) 
        BUT NOT IF (topic in topics_cov))  -> 2.3

Explanation
tweets about covid: 
1.1 Coded tweets (601 to 607)
1.2 Tweets automatically classified as being about covid (covid_theme=1). From those, do not consider the ones coded as 608 or manually excluded (theme_hardcoded=0)
1.3 Tweets about covid that are still not coded

tweets not about covid:
2.1 Tweets coded as 608
2.2 Tweets manually excluded. From those, do not consider the ones with topic different from 608
2.3 Tweets automatically classified as being not about covid (covid_theme=0). From those, do not consider tweets that have been coded
"""

'\nCategorization of tweets\n\ntweets about covid: \n    (topic in topics_cov)  -> 1.1\n    OR {(covid_theme == 1) \n        BUT NOT IF ((topic in topics_not_cov) OR (theme_hardcoded == 0))}  -> 1.2 \n    OR ((topic is None) AND (covid_theme == 1) AND (theme_hardcoded is None))  -> 1.3\n\ntweets not about covid:\n    (topic == 608)  -> 2.1\n    OR ((theme_hardcoded == 0) BUT NOT IF (topic in topics_cov)) -> 2.2\n    OR ((covid_theme == 0) \n        BUT NOT IF (topic in topics_cov))  -> 2.3\n\nExplanation.\n1.1 Simple coded tweets\n1.2 Tweets automatically classified as being about covid (covid_theme=1). From those, do not consider the ones coded as 608 or manually excluded (theme_hardcoded=0)\n1.3 Tweets about covid that are still not coded\n\n2.1 Tweets coded as 608\n2.2 Tweets manually excluded. From those, do not consider the ones with topic different from 608\n2.3 Tweets automatically classified as being not about covid (covid_theme=0). From those, do not consider tweets that have 

In [12]:
long_df_covid = Helpers.categorize_df_covid(long_df)
long_df_not_covid = Helpers.categorize_df_not_covid(long_df)

In [13]:
# Sanity check
print("Sanity check long format\n")
print("Tweets about covid:")
print(f"{len(long_df_covid)=}")
print(f"Topics: {long_df_covid['topic'].unique()} \n")

print("Tweets NOT about covid:")
print(f"{len(long_df_not_covid)=}")
print(f"Topics: {long_df_not_covid['topic'].unique()}")

print(f"{len(long_df)=}")
print(len(long_df) == len(long_df_not_covid) + len(long_df_covid))

Sanity check long format

Tweets about covid:
len(long_df_covid)=87136
Topics: [602. 601. 605. 604. 603. 606. 607.  nan] 

Tweets NOT about covid:
len(long_df_not_covid)=133371
Topics: [ nan 608.]
len(long_df)=220507
True


In [30]:
# Reorder columns
ordered_cols = [
    "tweet_id",
    "target_id",
    "covid_theme",
    "created_at",
    "name",
    "handle",
    "target",
    "old_text",
    "text",
    "url",
    "type",
    "retweets",
    "favorites",
    "topic",
    "subcat",
    "position",
    "frame",
    "theme_hardcoded",
    "date",
]
long_df_not_covid = long_df_not_covid[ordered_cols]
long_df_covid = long_df_covid[ordered_cols]

In [33]:
# Export to xlsx
long_df_not_covid.to_excel("src/resources/data/long_not_covid.xlsx")
long_df_covid.to_excel("src/resources/data/long_covid.xlsx")

In [34]:
# Some tweets seem to be in neither sets
# Investigation

# long_df.info()
# long_df_not_covid.info()
# long_df_covid.info()

concat_df = pd.concat([long_df_not_covid, long_df_covid])
concat_df_idx = set(concat_df["tweet_id"].values.tolist())

long_df_idx = set(long_df["tweet_id"].values.tolist())

rest_idx = long_df_idx - concat_df_idx

In [35]:
print(f"{len(concat_df_idx)=}")
print(f"{len(long_df_idx)=}")
print(f"{len(rest_idx)=}")

len(concat_df_idx)=130809
len(long_df_idx)=130809
len(rest_idx)=0


In [36]:
# Tweets that are in neither df
excl = long_df[long_df["tweet_id"].isin(rest_idx)]
excl["theme_hardcoded"].unique()

# Those are tweets that have 
#   topic is None
#   covid_theme == 1
#   theme_hardcoded is None
# -> simply uncoded tweets about covid
# should be included long_df_covid

array([], dtype=object)