## Prepare dataset

Import the necessary modules and load the tweets from the database.

In [53]:
"""
EDA for database.
"""

import sys, os

# Make the sibling "src" directory (one level up from the notebook) importable;
# presumably this is where the `common` package imported below lives.
sys.path.append(os.path.abspath(os.path.join("..", "src")))

# Autoreload mode 2: re-import edited modules before each cell is executed.
%load_ext autoreload
%autoreload 2
import re
import time

import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import widgets

import matplotlib
import pandas as pd
import numpy as np
from tqdm import tqdm

tqdm.pandas()

from common.database import Database
from common.app import App
from common.helpers import Helpers

# Application object handed to the database wrapper; created with debug=True.
app_run = App(debug=True)
# Handle on "tweets.db"; later cells enter it as a context manager to query it.
db = Database("tweets.db", app=app_run)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
def convert_date(datestr):
    """Normalise a date/datetime string to a ``/``-separated, day-first date.

    Only the part before the first space is kept (dropping any time-of-day
    suffix) and ``-`` separators are replaced by ``/``. If the first
    component is four characters long it is taken to be the year and the
    components are reversed (``2020/04/01`` -> ``01/04/2020``); otherwise the
    date is returned in the order it came in.

    Args:
        datestr: Date or datetime string such as ``"2020-04-01 12:30:00"`` or
            ``"01/04/2020"``. Surrounding whitespace is ignored.

    Returns:
        The date part as a ``/``-separated string; an empty string when the
        input is empty or whitespace only.
    """
    # Strip first: with a leading space, split(" ")[0] would be an empty string.
    date_part = datestr.strip().split(" ")[0].replace("-", "/")

    # Year-first input (2020/04/01): reverse the components to day-first.
    components = date_part.split("/")
    if len(components[0]) == 4:
        return "/".join(components[::-1])
    return date_part

In [3]:
# Fetch every stored tweet while the database context is open.
with db:
    tws = db.get_all_tweets()
# Number of records returned by the query.
print(len(tws))
# Build the master DataFrame from the raw records using the project helper.
df_all = Helpers.df_from_db(tws)

240375


In [4]:
# Covid-related tweets: covid_theme equals 1 and theme_hardcoded is not the string "0".
is_covid_theme = df_all["covid_theme"] == 1
hardcoded_not_zero = df_all["theme_hardcoded"] != "0"
df_yes = df_all[is_covid_theme & hardcoded_not_zero].copy()

# Split the covid tweets by whether their topic belongs to one of the known topic lists.
has_cov_topic = df_yes["topic"].isin(Helpers.topics_cov)
has_not_cov_topic = df_yes["topic"].isin(Helpers.topics_not_cov)

# Uncoded: topic in neither list (27_998 rows).
df_uncoded = df_yes[~(has_cov_topic | has_not_cov_topic)].copy()

# Coded: topic is one of the covid topics (65_999 rows).
df_coded = df_yes[has_cov_topic].copy()
len(df_coded)

65999

In [5]:
# Add a datetime "date" column to every working frame.
# "created_at" is first normalised to a day-first DD/MM/YYYY string by
# convert_date and then parsed with that explicit format. The subset frames
# are independent copies of df_all, so each one needs its own column.
for frame in (df_uncoded, df_coded, df_all, df_yes):
    frame["date"] = pd.to_datetime(
        frame["created_at"].apply(convert_date), format="%d/%m/%Y"
    )

In [6]:
# Period of interest is 01/01/2020 - 31/03/2021. Both comparisons below are
# strict, so the bounds sit one day outside the period.
start = "31/12/2019"
end = "01/04/2021"

# Parse the bounds once instead of once per comparison.
period_start = pd.to_datetime(start, format="%d/%m/%Y")
period_end = pd.to_datetime(end, format="%d/%m/%Y")


def _within_period(frame):
    """Return the rows of ``frame`` whose "date" is strictly between the period bounds."""
    return frame[(frame["date"] > period_start) & (frame["date"] < period_end)]


# NOTE: despite the "_sorted" suffix, these frames are filtered by date, not sorted.
df_yes_sorted = _within_period(df_yes)  # 78_048 tweets (covid-related)
df_all_sorted = _within_period(df_all)  # 186_864 tweets (everything retrieved)
df_uncoded_sorted = _within_period(df_uncoded)  # 13_138 tweets
df_coded_sorted = _within_period(df_coded)  # 64_804 tweets

len(df_coded_sorted)

64804

## General EDA about whole database

General information about retrieved tweets

Period of interest: 01/01/2020 - 31/03/2021

Total number of retrieved tweets: 240'375
Total number of tweets in period of interest: 186'864
Total number of tweets in period of interest about COVID: 78'048
Total number of coded tweets: 64'804
Total number of remaining uncoded tweets: 13'138

In [56]:
# Tweets per day for each frame: group by date, then count non-null tweet ids.
counts_all = df_all.groupby(["date"])["tweet_id"].count()
counts_yes = df_yes.groupby(["date"])["tweet_id"].count()
counts_coded = df_coded.groupby(["date"])["tweet_id"].count()
counts_uncoded = df_uncoded.groupby(["date"])["tweet_id"].count()
counts_uncoded_sorted = df_uncoded_sorted.groupby(["date"])["tweet_id"].count()


In [32]:
# Daily volume of all retrieved tweets versus the covid-related subset.
fig = go.Figure()
# Each series is plotted against its OWN date index: Plotly pairs x and y by
# position, and the count series need not cover the same dates, so reusing
# counts_all.index for another series would shift its points to wrong dates.
fig.add_trace(go.Scatter(x=counts_all.index, y=counts_all, mode="lines", name="all retrieved tweets"))
fig.add_trace(go.Scatter(x=counts_yes.index, y=counts_yes, mode="lines", name="all tweets about covid"))

# Dashed red lines mark the period of interest (01/01/2020 - 31/03/2021).
fig.add_vline(x="2020-01-01", line_color="red", line_dash="dash")
fig.add_vline(x="2021-03-31", line_color="red", line_dash="dash")
fig.update_layout(title="Tweets counts", xaxis_title="date", yaxis_title="count")
fig.show()

In [33]:
# Daily volume of covid-related tweets, split into coded and uncoded.
fig = go.Figure()
# Each series is plotted against its OWN date index: Plotly pairs x and y by
# position, and the count series need not cover the same dates, so reusing
# counts_all.index here would shift the points to wrong dates.
fig.add_trace(go.Scatter(x=counts_yes.index, y=counts_yes, mode="lines", name="all tweets about covid"))
fig.add_trace(go.Scatter(x=counts_coded.index, y=counts_coded, mode="lines", name="coded tweets"))
fig.add_trace(go.Scatter(x=counts_uncoded.index, y=counts_uncoded, mode="lines", name="uncoded tweets"))

# Dashed red lines mark the period of interest (01/01/2020 - 31/03/2021).
fig.add_vline(x="2020-01-01", line_color="red", line_dash="dash")
fig.add_vline(x="2021-03-31", line_color="red", line_dash="dash")
fig.update_layout(title="Tweets counts", xaxis_title="date", yaxis_title="count")
fig.show()

We'll get back to that spike of July 23 later. The periodic pattern is weekly: actors tweet much less during weekends.

## EDA - Uncoded tweets

In [51]:
# Daily volume of uncoded covid-related tweets.
fig = go.Figure()

# Plot the series against its OWN date index: Plotly pairs x and y by
# position, so using counts_all.index here would shift the points to wrong dates.
fig.add_trace(go.Scatter(x=counts_uncoded.index, y=counts_uncoded, mode="lines", name="uncoded tweets"))

# Dashed red lines mark the period of interest (01/01/2020 - 31/03/2021).
fig.add_vline(x="2020-01-01", line_color="red", line_dash="dash")
fig.add_vline(x="2021-03-31", line_color="red", line_dash="dash")
fig.update_layout(title="Uncoded tweets count", xaxis_title="date", yaxis_title="count")
fig.show()

In [71]:
# Nine handles with the most uncoded tweets in the period of interest.
(
    df_uncoded_sorted.groupby(["handle"])["tweet_id"]
    .count()
    .sort_values(ascending=False)
    .head(9)
)

handle
@WHO               1790
@DHSCgovuk         1674
@BAG_OFSP_UFSP     1611
@gouvernementFR    1265
@BR_Sprecher        778
@EUHomeAffairs      592
@WHOSEARO           579
@EDI_DFI            574
@DrTedros           427
Name: tweet_id, dtype: int64

Eight accounts have more than 500 uncoded tweets during the period of interest. For some of them (such as @WHOSEARO), the origin of those tweets is well known and this behaviour was expected: @WHOSEARO was added to the list of actors to retrieve in an attempt to capture the tweets from @WHO and thus bypass the ~3000-tweet API limit. As a consequence, all the tweets from this actor for the whole period were retrieved once the API limitation no longer applied.

For @WHO, it is believed that the same ~3000-tweet API limit is the cause of the high proportion of uncoded tweets before April 2020.

The origin of uncoded tweets after April 2020 is the implementation of the new classifier (classifying tweets as being about covid or not) which was deployed in August 2021, *after* the tweets were submitted for manual coding.

Those reasons are, however, not sufficient to explain why some of the tweets are still uncoded. This needs to be investigated further.

In [73]:
# Handles ordered by their number of uncoded tweets in the period (most first);
# used as the dropdown options.
most_uncoded_by_handles = (
    df_uncoded_sorted.groupby(["handle"])["tweet_id"]
    .count()
    .sort_values(ascending=False)
    .index
)


def _daily_uncoded_counts(actor):
    """Return the per-date count of uncoded tweets in the period for one handle."""
    actor_rows = df_uncoded_sorted[df_uncoded_sorted["handle"] == actor]
    return actor_rows.groupby(["date"])["tweet_id"].count()


# Series shown initially; matches the dropdown's default value.
counts_uncoded_sorted_WHO = _daily_uncoded_counts("@WHO")

handle = widgets.Dropdown(
    description='Handle:   ',
    value='@WHO',
    options=most_uncoded_by_handles
)

# x must be the @WHO series' own dates: Plotly pairs x and y by position, so
# the index of the all-handles series would put the points on wrong dates.
trace1 = go.Scatter(
    x=counts_uncoded_sorted_WHO.index,
    y=counts_uncoded_sorted_WHO,
    mode="lines",
    name="uncoded tweets",
)

g = go.FigureWidget(data=[trace1],
                    layout=go.Layout(
                        title=dict(
                            text='Uncoded tweets count per actor'
                        ),
                        xaxis_title="date",
                        yaxis_title="count",
                        yaxis_range=(0, 70),
                        barmode='overlay'
                    ))


def validate():
    """Return True when the selected handle occurs in ``df_uncoded_sorted``."""
    return handle.value in df_uncoded_sorted['handle'].unique()


def response(change):
    """Redraw the trace for the handle currently selected in the dropdown.

    Args:
        change: Change notification passed by ``handle.observe``. It is not
            used; the current selection is read from ``handle.value``.
    """
    if validate():
        counts_actor = _daily_uncoded_counts(handle.value)
        # Update x and y together so the figure never shows mismatched data.
        with g.batch_update():
            g.data[0].y = counts_actor
            g.data[0].x = counts_actor.index


handle.observe(response, names="value")

container = widgets.HBox([handle])
widgets.VBox([container, g])


VBox(children=(HBox(children=(Dropdown(description='Handle:   ', options=('@WHO', '@DHSCgovuk', '@BAG_OFSP_UFS…