# Exploratory Data Analysis

### Prepare dataset

Import necessary modules, and download tweets from database.

In [1]:
"""
EDA for database.
"""

import sys, os

sys.path.append(os.path.abspath(os.path.join("..", "..", "src")))

%load_ext autoreload
%autoreload 2

# To be able to export with interactive plots
import plotly.io as pio
pio.renderers.default = 'notebook_connected'

import re
import time

import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import widgets

import matplotlib
import pandas as pd
import numpy as np
from tqdm import tqdm

tqdm.pandas()

from common.database import Database
from common.app import App
from common.helpers import Helpers

from IPython.display import display

app_run = App(debug=True)
db = Database("tweets.db", app=app_run)

In [2]:
with db:
    tws = db.get_all_tweets()
df_all = Helpers.df_from_db(tws)
print(f"{len(df_all)=}")


len(df_all)=238523


In [3]:
# Select tweets about covid
df_yes = Helpers.categorize_df_covid(df_all.copy())
print(f"{len(df_yes)=}")

# select those that are NOT coded
df_uncoded = df_yes[
    ~(df_yes["topic"].isin(Helpers.topics_cov))
    & ~(df_yes["topic"].isin(Helpers.topics_not_cov))
].copy()
print(f"{len(df_uncoded)=}")

# and those that are coded
df_coded = df_all[(df_all["topic"].isin(Helpers.topics_cov))].copy()
print(f"{len(df_coded)=}")


len(df_yes)=101640
len(df_uncoded)=16566
len(df_coded)=85074


In [4]:
# Convert date to be handled
df_uncoded["date"] = df_uncoded["created_at"].apply(Helpers.convert_date)
df_uncoded["date"] = pd.to_datetime(df_uncoded["date"], format="%d/%m/%Y")

df_coded["date"] = df_coded["created_at"].apply(Helpers.convert_date)
df_coded["date"] = pd.to_datetime(df_coded["date"], format="%d/%m/%Y")

df_all["date"] = df_all["created_at"].apply(Helpers.convert_date)
df_all["date"] = pd.to_datetime(df_all["date"], format="%d/%m/%Y")

df_yes["date"] = df_yes["created_at"].apply(Helpers.convert_date)
df_yes["date"] = pd.to_datetime(df_yes["date"], format="%d/%m/%Y")


In [5]:
# Only keep tweets in the time range
df_all_sorted = Helpers.sort_timerange(df_all.copy())
print(f"{len(df_all_sorted)=}")

df_yes_sorted = Helpers.sort_timerange(df_yes.copy())
print(f"{len(df_yes_sorted)=}")

df_uncoded_sorted = Helpers.sort_timerange(df_uncoded.copy())
print(f"{len(df_uncoded_sorted)=}")

df_coded_sorted = Helpers.sort_timerange(df_coded.copy())
print(f"{len(df_coded_sorted)=}")


len(df_all_sorted)=185749
len(df_yes_sorted)=84233
len(df_uncoded_sorted)=2145
len(df_coded_sorted)=82088


### 1. General EDA about whole dataset

Period of interest: **01/01/2020 - 31/03/2021**

In [6]:
counts_all = df_all.groupby(["date"]).count()["tweet_id"]
counts_yes = df_yes.groupby(["date"]).count()["tweet_id"]
counts_coded = df_coded.groupby(["date"]).count()["tweet_id"]
counts_uncoded = df_uncoded.groupby(["date"]).count()["tweet_id"]

counts_all_sorted = df_all_sorted.groupby(["date"]).count()["tweet_id"]
counts_yes_sorted = df_yes_sorted.groupby(["date"]).count()["tweet_id"]
counts_coded_sorted = df_coded_sorted.groupby(["date"]).count()["tweet_id"]
counts_uncoded_sorted = df_uncoded_sorted.groupby(["date"]).count()["tweet_id"]


In [7]:
print("All retrieved tweets")
print(f"Total number of retrieved tweets: {len(df_all)}")
print(f"Total number of tweets about covid: {len(df_yes)}")
print(f"Total number of coded tweets: {len(df_coded)}")
print(f"Total number of uncoded tweets: {len(df_uncoded)}")

print("\nOver time range of interest")
print(f"Number of retrieved tweets in timerange: {len(df_all_sorted)}")
print(f"Number of tweets about covid in timerange: {len(df_yes_sorted)}")
print(f"Number of coded tweets in timerange: {len(df_coded_sorted)}")
print(f"Number of uncoded tweets in timerange: {len(df_uncoded_sorted)}")


All retrieved tweets
Total number of retrieved tweets: 238523
Total number of tweets about covid: 101640
Total number of coded tweets: 85074
Total number of uncoded tweets: 16566

Over time range of interest
Number of retrieved tweets in timerange: 185749
Number of tweets about covid in timerange: 84233
Number of coded tweets in timerange: 82088
Number of uncoded tweets in timerange: 2145


Note that new tweets from the actors are retrieved daily but not automatically added to the database. The real number of retrieved tweets (25.11.2021) is of ~350k tweets.

In [8]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=counts_all.index, y=counts_all, mode="lines", name="all retrieved tweets"
    )
)
fig.add_trace(
    go.Scatter(
        x=counts_yes.index, y=counts_yes, mode="lines", name="all tweets about covid"
    )
)
fig.add_trace(
    go.Scatter(
        x=counts_all.index, y=counts_all - counts_yes, mode="lines", name="difference"
    )
)  # recheck index

fig.add_vline(x="2020-01-01", line_color="red", line_dash="dash")
fig.add_vline(x="2021-03-31", line_color="red", line_dash="dash")
fig.update_layout(title="Tweets counts", xaxis_title="date", yaxis_title="count")
fig.show()


The cycle comes from the weeks. Actors tweet much less during the weekends.

#### 1.1 Spike of July 23

_Problem now resolved:_  
_This section refers to a spike in number of tweets for July 23 2020. See `descriptives/figures/july_23.png` and the `/interactive/july_23.ipynb` notebook for more details._

**Summary:**
About 1.7k tweets were indicated as being created on July 23 2020. This was abnormal. This section describes the consequent investigation. It appears that the `created_at` field in the database was incorrect. Those were mainly tweets from @UN and @WHO. This was corrected by updating the `created_at` field in the database with the correct value (see `/interactive/july_23.ipynb` notebook). 

In [9]:
# July 23
all_23 = df_all_sorted.loc[df_all_sorted["date"] == pd.to_datetime("2020-07-23")]
yes_23 = df_yes_sorted.loc[df_yes_sorted["date"] == pd.to_datetime("2020-07-23")]
display(all_23["handle"].value_counts().head(4))
display(yes_23["handle"].value_counts().head(4))


@gouvernementFR    39
@WHO               31
@EUHomeAffairs     25
@DHSCgovuk         19
Name: handle, dtype: int64

@WHO               30
@DHSCgovuk         19
@UN                11
@gouvernementFR    10
Name: handle, dtype: int64

Out of the 1738 tweets that day, more than 1148 (66%) were tweeted by 3 accounts: @WHO, @UN and @DrTedros. We also see that almost all those tweets were about covid.

In [10]:
# Check type of tweets
yes_23.loc[yes_23["handle"] == "@WHO"]["type"].value_counts()


Reply      22
Retweet     5
New         3
Name: type, dtype: int64

In [11]:
# Tweets that are replies to WHO itself
ry_c = len(
    yes_23.loc[
        (yes_23["handle"] == "@WHO")
        & (yes_23["type"] == "Reply")
        & (yes_23["old_text"].str.startswith("RY @WHO"))
    ]
)

print(f"Out of 496 replies, {ry_c} are replies to @WHO itself.")


Out of 496 replies, 22 are replies to @WHO itself.


As written above, out of 496 replies, 495 are replies to @WHO itself. However, one thing is bizarre. The `created_at` field corresponding to when a tweet is posted does not match the value from the Twitter website. 

The tweet [1293498878617178112](https://twitter.com/WHO/status/1293498878617178112) was posted on August 12th 2020, also it is reported as July 23th in the database. It appears that the same thing happened with many tweets for this day. At this stage, it is unclear how and why this happened. As an example of a "normal" day, let us look at May 28th.

In [12]:
# May 28
all_28 = df_all_sorted.loc[df_all_sorted["date"] == pd.to_datetime("2020-05-28")]
yes_28 = df_yes_sorted.loc[df_yes_sorted["date"] == pd.to_datetime("2020-05-28")]

# Display counts
display(all_28["handle"].value_counts().head(4))
display(yes_28["handle"].value_counts().head(4))

# Sample a unique value and check date
print("URL:", all_28.sample(1)["url"].item())
print("Date:", all_28.sample(1)["date"].item())


@WHO_Europe    45
@DHSCgovuk     43
@EU_Health     39
@enmarchefr    32
Name: handle, dtype: int64

@DHSCgovuk         36
@WHO_Europe        32
@MinSoliSante      28
@gouvernementFR    25
Name: handle, dtype: int64

URL: https://twitter.com/francediplo/status/1266020445243019264
Date: 2020-05-28 00:00:00


As expected, other days seem to correctly behave. The `created_at` field should be rechecked for every tweet with a non-missing `tweet_id`.

#### 1.2 Uncoded tweets

In [14]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=counts_yes.index, y=counts_yes, mode="lines", name="all tweets about covid"
    )
)
fig.add_trace(
    go.Scatter(x=counts_coded.index, y=counts_coded, mode="lines", name="coded tweets")
)
fig.add_trace(
    go.Scatter(
        x=counts_uncoded.index, y=counts_uncoded, mode="lines", name="uncoded tweets"
    )
)

fig.add_vline(x="2020-01-01", line_color="red", line_dash="dash")
fig.add_vline(x="2021-03-31", line_color="red", line_dash="dash")
fig.update_layout(title="Tweets counts", xaxis_title="date", yaxis_title="count")
fig.show()


We clearly see here that in the period of interest, almost all tweets that were retrieved and classified about covid were coded. The lacking ones are due to the automatic covid-classifier being updated. Some tweets that were previously not considered as being about covid now are and would need to be coded.  

In [15]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=counts_uncoded.index, y=counts_uncoded, mode="lines", name="uncoded tweets"
    )
)

fig.add_vline(x="2020-01-01", line_color="red", line_dash="dash")
fig.add_vline(x="2021-03-01", line_color="red", line_dash="dash")
fig.update_layout(title="Uncoded tweets count", xaxis_title="date", yaxis_title="count")
fig.show()


We can also look at the accounts that have the most uncoded tweets:

In [16]:
df_uncoded_sorted.groupby(["handle"]).count()["tweet_id"].sort_values(
    ascending=False
).head(6)


handle
@BAG_OFSP_UFSP     494
@BR_Sprecher       177
@EDI_DFI           163
@DHSCgovuk         117
@gouvernementFR    110
@DrTedros           77
Name: tweet_id, dtype: int64

We clearly see that the accounts that have the most uncoded tweets are swiss. This is no surprise, as they also tweet in German and Italian, languages that were consequently (for the second wave) not considered. The rest are due to the classifier update.

### 2. Coded tweets

This section focuses only on coded tweets of all categories (601 to 607), but not on the subcategories of 601.

In [17]:
# df of interest
df = df_coded_sorted
print(len(df), "tweets")


82088 tweets


#### 2.1 Count per category

How many tweets per category?

In [19]:
# Get size per category and proportions relative to total
topics_size = pd.DataFrame(df.groupby("topic").size(), columns=["size"])
topics_size["proportion"] = (topics_size["size"] / df.groupby("topic").size().sum()) * 100

# Plot barchart for each category
fig = px.bar(topics_size, x=topics_size.index, y="size", text="proportion")
fig.update_traces(texttemplate="%{text:.2f}%")
fig.update_layout(
    title="Tweets counts per category", xaxis_title="topic", yaxis_title="count"
)
fig.show()


As we can see, out of 82k tweets, almost 40% are labeled as "Health" (602). About 20% were related to mobility and almost non were directly speaking about education (606) or environment (607).


#### 2.2 Most active actor per category

Which actors speak the most in each category?

In [20]:
# Sort the dataframe
actors_size_df = pd.DataFrame(df.groupby(["topic", "handle"]).size(), columns=["size"])
actors_size_df = actors_size_df.reset_index() \
            .sort_values(["topic", "size"], ascending=[True, False]) \
            .set_index(["topic", "handle"])
actors_size_df["idx"] = actors_size_df.groupby(["topic"]).cumcount() + 1
display(actors_size_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,idx
topic,handle,Unnamed: 2_level_1,Unnamed: 3_level_1
601.0,@DHSCgovuk,1797,1
601.0,@gouvernementFR,1293,2
601.0,@WHO,864,3
601.0,@10DowningStreet,847,4
601.0,@ukhomeoffice,807,5
...,...,...,...
607.0,@UDCch,1,45
607.0,@WHOSEARO,1,46
607.0,@WHO_Europe,1,47
607.0,@cabinetofficeuk,1,48


In [21]:
# For now, only keep the 5 most active actors per category
# Create a new "other" cat for the others
actors_size_df_5 = actors_size_df.copy()
actors_size_df_5 = actors_size_df_5.reset_index("handle")
actors_size_df_5.loc[actors_size_df_5["idx"] > 5, "handle"] = "Others"
actors_size_df_5 = actors_size_df_5.groupby(["topic", "handle"]).sum()
actors_size_df_5 = actors_size_df_5.drop("idx", axis=1)
actors_size_df_5["proportion %"] = np.round(actors_size_df_5["size"] / actors_size_df_5.groupby("topic").sum()["size"] * 100, 2)
# display(actors_size_df_5)

# Plot it
actors_size_df_5 = actors_size_df_5.reset_index()
fig = px.bar(actors_size_df_5, x="topic", y="size", color="handle", text="proportion %")
fig.update_traces(texttemplate="%{text:.2f}%")
fig.update_layout(
    title="5 most active actors per category", xaxis_title="topic", yaxis_title="count"
)
fig.show()

#### 2.3 Level of actor

What level (National/EU/International) are the actors in each category?

In [22]:
actors = pd.read_excel(f"{app_run.root_dir}/src/resources/Actors_20210129.xlsx")
# Ignore error
actors = actors[["Level", "Country", "Actor_type", "Actor_Category", "Actor_name", "Twitter account"]]


Cell J20 is marked as a date but the serial value 12870011 is outside the limits for dates. The cell will be treated as an error.


Cell J21 is marked as a date but the serial value 8306632 is outside the limits for dates. The cell will be treated as an error.


Cell J25 is marked as a date but the serial value 5794546 is outside the limits for dates. The cell will be treated as an error.


Cell J55 is marked as a date but the serial value 5804800 is outside the limits for dates. The cell will be treated as an error.


Cell J56 is marked as a date but the serial value 3007983 is outside the limits for dates. The cell will be treated as an error.



In [23]:
actors = actors.rename({"Twitter account": "handle"}, axis=1)
actors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65 entries, 0 to 64
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Level           65 non-null     object
 1   Country         65 non-null     object
 2   Actor_type      65 non-null     object
 3   Actor_Category  65 non-null     object
 4   Actor_name      65 non-null     object
 5   handle          65 non-null     object
dtypes: object(6)
memory usage: 3.2+ KB


We see that some actors are missing in that list, we will need to complete the info.

In [24]:
# Need to have "handle" as column to join
# actors_size_df = actors_size_df.reset_index("handle")

In [25]:
# To check what we have
actors_size_df

Unnamed: 0_level_0,Unnamed: 1_level_0,size,idx
topic,handle,Unnamed: 2_level_1,Unnamed: 3_level_1
601.0,@DHSCgovuk,1797,1
601.0,@gouvernementFR,1293,2
601.0,@WHO,864,3
601.0,@10DowningStreet,847,4
601.0,@ukhomeoffice,807,5
...,...,...,...
607.0,@UDCch,1,45
607.0,@WHOSEARO,1,46
607.0,@WHO_Europe,1,47
607.0,@cabinetofficeuk,1,48


In [26]:
# Join
merged_df = actors_size_df.merge(actors, on="handle")

In [27]:
# Actors that are in the database but not in the original actors' list
actors_size_df = actors_size_df.reset_index()
missing_actors = actors_size_df[~actors_size_df["handle"].isin(merged_df["handle"].unique())]["handle"].unique()

for mis in missing_actors:
    print(f"- {mis}")

- @Sante_Gouv
- @WHOSEARO
- @reformparty_uk
- @Left_EU
- @Mitte_Centre
- @SantePubliqueFr


Some actors' information is missing, but their tweets are in the database:
- @Sante_Gouv: reason unknown  
- @WHOSEARO: was added as a proxy of @WHO, but the tweets of the latter could eventually also be retrieved
- @reformparty_uk: new name of @brexitparty_uk
- @Left_EU: reason unknown
- @Mitte_Centre: new name of @CVP_PDC
- @SantePubliqueFr: reason unknown 

In [28]:
# Redo merge with outer to have all infos
out_df = actors_size_df.merge(actors, how="outer", on="handle")
out_df

Unnamed: 0,topic,handle,size,idx,Level,Country,Actor_type,Actor_Category,Actor_name
0,601.0,@DHSCgovuk,1797.0,1.0,National,UK,Ministry,Health,Department of Health and Social Care
1,602.0,@DHSCgovuk,3848.0,2.0,National,UK,Ministry,Health,Department of Health and Social Care
2,603.0,@DHSCgovuk,200.0,16.0,National,UK,Ministry,Health,Department of Health and Social Care
3,604.0,@DHSCgovuk,317.0,10.0,National,UK,Ministry,Health,Department of Health and Social Care
4,605.0,@DHSCgovuk,507.0,4.0,National,UK,Ministry,Health,Department of Health and Social Care
...,...,...,...,...,...,...,...,...,...
435,605.0,@vertliberaux,5.0,68.0,National,Switzerland,Parliamentary,Political Party,Vert Libéraux
436,607.0,@vertliberaux,5.0,26.0,National,Switzerland,Parliamentary,Political Party,Vert Libéraux
437,602.0,@SantePubliqueFr,1.0,67.0,,,,,
438,603.0,@SantePubliqueFr,1.0,66.0,,,,,


In [29]:
# Sort topics and actors by most tweets
out_df = out_df.reset_index() \
    .sort_values(["topic", "idx"]) \
    .set_index(["topic", "handle", "idx"])
out_df = out_df.drop("index", axis=1)


In [30]:
# Random sample for sanity check
out_df.sample(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,size,Level,Country,Actor_type,Actor_Category,Actor_name
topic,handle,idx,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
601.0,@Europol,64.0,13.0,EU,France,Top Agency in Leadership Role,Law enforcement Agency,EUROPOL
602.0,@s_sommaruga,65.0,2.0,National,Switzerland,Head of the Executive,President,Simonetta Sommaruga


In [31]:
# Reset index to plot
out_df = out_df.reset_index()

# Get number of tweets per topic for each level
lvl_size = pd.DataFrame(out_df.groupby(["topic", "Level"])["size"].sum(), columns=["size"]).reset_index("Level")
lvl_size["proportion %"] = np.round(lvl_size["size"] / lvl_size.groupby("topic").sum()["size"]* 100, 2)
lvl_size

Unnamed: 0_level_0,Level,size,proportion %
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
601.0,EU,2159.0,14.11
601.0,International,1612.0,10.54
601.0,National,11530.0,75.35
602.0,EU,3845.0,12.11
602.0,International,13528.0,42.6
602.0,National,14384.0,45.29
603.0,EU,1985.0,23.6
603.0,International,634.0,7.54
603.0,National,5791.0,68.86
604.0,EU,2694.0,23.03


In [32]:
# Plot barchart for each category
fig = px.bar(lvl_size, x=lvl_size.index, y="size", color="Level", text="proportion %")
fig.update_traces(texttemplate="%{text:.2f}%")
fig.update_layout(
    title="Tweets counts per category per level", xaxis_title="topic", yaxis_title="count"
)
fig.show()

In [33]:
# Check wrt to proportion by actor
g = pd.DataFrame(actors.groupby("Level").size(), columns=["size"])
g["proportion %"] = np.round(g["size"] / g["size"].sum() * 100, 2)
g

Unnamed: 0_level_0,size,proportion %
Level,Unnamed: 1_level_1,Unnamed: 2_level_1
EU,16,24.62
International,5,7.69
National,44,67.69


To correctly interpret the above plot, it is important to know the proportion of levels in the actors list. Naturally, we expect to see those same proportion per category.
For example, we see that for tweets about mobility (601), national actors tweeted relatively more than the others. For health related tweets (602), international actors spoke more about that. This was mainly driven by the @WHO and @DrTedros. On the other hand, we can observe that economy (603) was primarily a national subject instead of being an international or european one. 

### 3. Mobility tweets

In this section, we focus on mobility (601) tweets and its subsections (60100 - 60116, position and frame).

In [34]:
# df of interest
df = df_coded_sorted.loc[df_coded_sorted["topic"] == 601]
print(len(df), "mobility tweets")

15805 mobility tweets


In [35]:
# Check missing values
display(df.info())

# Some missing subcategories
display(df.loc[df["subcat"].isna()])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15805 entries, 1285 to 234043
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   tweet_id         15805 non-null  object        
 1   covid_theme      15805 non-null  int64         
 2   created_at       15805 non-null  object        
 3   handle           15805 non-null  object        
 4   name             15805 non-null  object        
 5   old_text         11040 non-null  object        
 6   text             15547 non-null  object        
 7   url              15731 non-null  object        
 8   type             15805 non-null  object        
 9   retweets         15351 non-null  float64       
 10  favorites        15351 non-null  float64       
 11  topic            15805 non-null  float64       
 12  subcat           15801 non-null  float64       
 13  position         15801 non-null  float64       
 14  frame            15801 non-null  f

None

Unnamed: 0,tweet_id,covid_theme,created_at,handle,name,old_text,text,url,type,retweets,favorites,topic,subcat,position,frame,theme_hardcoded,date
64430,1344992253211762692,1,01/01/2021 13:01:56,@Interieur_Gouv,Ministère de l'Intérieur,RY @Interieur_Gouv: 🔴🇫🇷#Covid19 | Vous devez s...,🔴🇫🇷#Covid19 | Vous devez sortir durant le #cou...,https://twitter.com/Interieur_Gouv/status/1344...,Reply,17.0,35.0,601.0,,,,,2021-01-01
64431,1344992257812930560,1,01/01/2021 13:01:57,@Interieur_Gouv,Ministère de l'Intérieur,RY @Interieur_Gouv: 🔴🇫🇷#Covid19 | Si vous deve...,🔴🇫🇷#Covid19 | Si vous devez effectuer un dépla...,https://twitter.com/Interieur_Gouv/status/1344...,Reply,34.0,56.0,601.0,,,,,2021-01-01
64434,1344995823915048960,1,01/01/2021 13:16:07,@gouvernementFR,Gouvernement,RT @Interieur_Gouv: 🔴🇫🇷#Covid19 | Si vous deve...,🔴🇫🇷#Covid19 | Si vous devez effectuer un dépla...,https://twitter.com/gouvernementFR/status/1344...,Retweet,34.0,0.0,601.0,,,,,2021-01-01
97212,1375357132204302337,1,26/03/2021 08:01:07,@francediplo,France Diplomatie🇫🇷,RT @franceinfo: 🗣️ Allemagne: Angela Merkel s'...,🗣️ Allemagne: Angela Merkel s'excuse et annule...,https://twitter.com/francediplo/status/1375357...,Retweet,50.0,0.0,601.0,,,,,2021-03-26


It seems that those 4 french tweets were not coded. Otherwise, everything seems good. They also are the only mobility tweets missing the `position` and `frame` coding.

#### 3.1 Subcategories

##### 3.1.1 Share of subcategories

Proportion of each subcategory.

In [41]:
size = pd.DataFrame(df.groupby(["subcat"]).size(), columns=["size"])
size["proportion %"] = size["size"] / df.groupby(["subcat"]).size().sum() * 100

# Print biggest subcats
display(size.sort_values("proportion %", ascending=False).head(7))

# Plot
str_index = [str(int(idx)) for idx in size.index]
fig = px.bar(size, x=str_index, y="size", text="proportion %")
fig.update_traces(texttemplate="%{text:.2f}%", textposition="outside")
fig.update_layout(
    title="Tweets counts per subcategory", xaxis_title="subcategory", yaxis_title="count"
)
fig.show()

Unnamed: 0_level_0,size,proportion %
subcat,Unnamed: 1_level_1,Unnamed: 2_level_1
60105.0,3434,21.732802
60113.0,2844,17.998861
60115.0,2733,17.296374
60103.0,1497,9.474084
60116.0,1493,9.448769
60114.0,1452,9.189292
60104.0,609,3.854186


We clearly see that most mobility tweets are about local travel (60105), lockdown (60113), social distance (60115), international travel (60103), commercial flux (60116) and quarantine (60114).

#### 3.2 Position

In [42]:
size = pd.DataFrame(df.groupby(["position"]).size(), columns=["size"])
size["proportion %"] = size["size"] / df.groupby(["position"]).size().sum() * 100

# Plot
str_index = [str(int(idx)) for idx in size.index]
fig = px.bar(size, x=str_index, y="size", text="proportion %")
fig.update_traces(texttemplate="%{text:.2f}%")
fig.update_layout(
    title="Tweets counts per position", xaxis_title="position", yaxis_title="count"
)
fig.show()

#### 3.3 Frame

In [43]:
size = pd.DataFrame(df.groupby(["frame"]).size(), columns=["size"])
size["proportion %"] = size["size"] / df.groupby(["frame"]).size().sum() * 100

# Plot
str_index = [str(int(idx)) for idx in size.index]
fig = px.bar(size, x=str_index, y="size", text="proportion %")
fig.update_traces(texttemplate="%{text:.2f}%")
fig.update_layout(
    title="Tweets counts per frame", xaxis_title="frame", yaxis_title="count"
)
fig.show()