In [1]:
"""
Extract mentions from tweets and reshape into long format
"""

import sys, os

sys.path.append(os.path.abspath(os.path.join("..", "src")))

%load_ext autoreload
%autoreload 2
import re
import time

import pandas as pd
import numpy as np
from tqdm import tqdm

tqdm.pandas()

from common.database import Database
from common.app import App
from common.helpers import Helpers

app_run = App(debug=True)
db = Database("tweets.db", app=app_run)

In [2]:
with db:
    tws = db.get_all_tweets()
print(len(tws))
df_all = Helpers.df_from_db(tws)

238523


In [3]:
# Convert date
df_all["date"] = df_all["created_at"].apply(Helpers.convert_date)
df_all["date"] = pd.to_datetime(df_all["date"], format="%d/%m/%Y")

In [4]:
start = "31/12/2019"
end = "01/04/2021"

df_all_sorted = df_all[
    (df_all["date"] > pd.to_datetime(start, format="%d/%m/%Y"))
    & (df_all["date"] < pd.to_datetime(end, format="%d/%m/%Y"))
]
print(f"{len(df_all_sorted)=}") 

len(df_all_sorted)=185749


In [26]:
def extract_mentions(txt):
    """
    Extract the twitter mentions of a string

    Returns the number of unqiue mentions and the unique mentions
    """

    if not txt:
        return

    r1 = r"@[A-Za-z0-9_-]*"
    mentions = set(re.findall(r1, txt))
    return len(mentions), mentions

In [29]:
# Testing
txt = "@manue123 ahahh @olive test @opotrac shouldnot be @potrac counted @potrac @3t6___adsasd @wil-bail"
extract_mentions(txt)

(6,
 {'@3t6___adsasd', '@manue123', '@olive', '@opotrac', '@potrac', '@wil-bail'})

In [6]:
# Get maximum number of mentions to know how many columns needed
max_old = df_all_sorted["old_text"].str.count(r"@\w+").max()  # 22
max_txt = df_all_sorted["text"].str.count(r"@\w+").max()  # 26
max(max_old, max_txt)

26.0

In [153]:
# Create the new columns
df = df_all_sorted.assign(**{f"m_{i:02}": np.nan for i in range(1, 27)})

In [154]:
def insert_mentions(row):
    old = row["old_text"] if row["old_text"] is not None else " "
    cur = row["text"] if row["text"] is not None else " "

    txt = old + " " + cur
    _, mentions = extract_mentions(txt)

    for i, m in enumerate(mentions, start=1):
        row.loc[f"m_{i:02}"] = m
    return row

In [155]:
df = df.progress_apply(insert_mentions, axis=1)

100%|██████████| 185749/185749 [00:54<00:00, 3405.35it/s]


In [161]:
df.sample(5).loc[:, "m_01":"m_27"]

Unnamed: 0,m_01,m_02,m_03,m_04,m_05,m_06,m_07,m_08,m_09,m_10,...,m_18,m_19,m_20,m_21,m_22,m_23,m_24,m_25,m_26,m_27
93140,@AnnaFotyga_PE,,,,,,,,,,...,,,,,,,,,,
82252,@DrKeithRowley,@DrTedros,@WHO,,,,,,,,...,,,,,,,,,,
51284,@RobertJenrick,,,,,,,,,,...,,,,,,,,,,
229407,@Facebook,@Google,@YouTube,@MunSecConf,@Pinterest,@tiktok_us,@Twitter,@TencentGlobal,@DrTedros,,...,,,,,,,,,,
46359,@WHO,,,,,,,,,,...,,,,,,,,,,


In [163]:
df.loc[:, "m_01":"m_27"].count()

m_01    142348
m_02     60701
m_03     23912
m_04     10780
m_05      5533
m_06      3251
m_07      1990
m_08      1240
m_09       837
m_10       552
m_11       349
m_12       206
m_13       124
m_14        81
m_15        45
m_16        28
m_17        18
m_18        13
m_19        11
m_20         8
m_21         7
m_22         5
m_23         4
m_24         2
m_25         2
m_26         2
m_27         2
dtype: int64