### 0. Prepare

In [1]:
"""
Retrieve special sets of tweets given needs
"""

import sys, os

sys.path.append(os.path.abspath(os.path.join("..", "..", "src")))

%load_ext autoreload
%autoreload 2
import re
import time

import pandas as pd
import numpy as np
from tqdm import tqdm

tqdm.pandas()

from common.database import Database
from common.app import App
from common.helpers import Helpers

app_run = App(debug=True)
db = Database("tweets.db", app=app_run)

In [2]:
# Get all tweets from db
with db:
    tws = db.get_all_tweets()
print(len(tws))
df_all = Helpers.df_from_db(tws)

238523


In [3]:
# OLD WAY, SHOULD NOT BE USED
# get only tweets about covid
df_yes = df_all[(df_all["covid_theme"] == 1) & ~(df_all["theme_hardcoded"] == "0")].copy()
print(f"{len(df_yes)=}")

len(df_yes)=88651


In [4]:
# New way, should be used
df_yes = Helpers.categorize_df_covid(df_all.copy())
print(f"{len(df_yes)=}")

len(df_yes)=101640


In [5]:
# select those that are NOT coded
df_uncoded = df_yes[
    ~(df_yes["topic"].isin(Helpers.topics_cov))
    & ~(df_yes["topic"].isin(Helpers.topics_not_cov))
].copy()
print(f"{len(df_uncoded)=}")

# and those that are coded
df_coded = df_yes[(df_yes["topic"].isin(Helpers.topics_cov))].copy()
print(f"{len(df_coded)=}")

len(df_uncoded)=16566
len(df_coded)=85074


In [6]:
# Convert date to be handled
df_uncoded["date"] = df_uncoded["created_at"].apply(Helpers.convert_date)
df_uncoded["date"] = pd.to_datetime(df_uncoded["date"], format="%d/%m/%Y")

df_coded["date"] = df_coded["created_at"].apply(Helpers.convert_date)
df_coded["date"] = pd.to_datetime(df_coded["date"], format="%d/%m/%Y")

df_all["date"] = df_all["created_at"].apply(Helpers.convert_date)
df_all["date"] = pd.to_datetime(df_all["date"], format="%d/%m/%Y")

df_yes["date"] = df_yes["created_at"].apply(Helpers.convert_date)
df_yes["date"] = pd.to_datetime(df_yes["date"], format="%d/%m/%Y")

In [7]:
# Only keep tweets in the time range
df_all_sorted = Helpers.sort_timerange(df_all.copy())
print(f"{len(df_all_sorted)=}") 

df_yes_sorted = Helpers.sort_timerange(df_yes.copy())
print(f"{len(df_yes_sorted)=}") 

df_uncoded_sorted = Helpers.sort_timerange(df_uncoded.copy())
print(f"{len(df_uncoded_sorted)=}") 

df_coded_sorted = Helpers.sort_timerange(df_coded.copy())
print(f"{len(df_coded_sorted)=}")

len(df_all_sorted)=185749
len(df_yes_sorted)=84233
len(df_uncoded_sorted)=2145
len(df_coded_sorted)=82088


### 1. Total count
In the period of interest, for each actor,  
      get total number of tweets  
      get total number of tweets about covid  

In [8]:
# Number of tweets per handle
g_all = pd.DataFrame(df_all_sorted.groupby("handle").size())
g_all = g_all.rename({0: "count_all"}, axis=1)

g_yes = pd.DataFrame(df_yes_sorted.groupby("handle").size())
g_yes = g_yes.rename({0: "count_covid"}, axis=1)

# Merge dataframes
count = g_all.join(g_yes, on="handle").fillna(0)
count["count_covid"] = count["count_covid"].astype(int) 

# Add columns
count["count_not_covid"] = count["count_all"] - count["count_covid"]
count["proportion"] = count["count_covid"] / count["count_all"]

In [9]:
count.sort_values("proportion", ascending=False)

Unnamed: 0_level_0,count_all,count_covid,count_not_covid,proportion
handle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
@santeprevention,297,297,0,1.000000
@MinSoliSante,3060,3057,3,0.999020
@GUENGL,418,416,2,0.995215
@CVP_PDC,68,66,2,0.970588
@WHOSEARO,549,531,18,0.967213
...,...,...,...,...
@Left_EU,3532,336,3196,0.095130
@fedpolCH,341,27,314,0.079179
@Mitte_Centre,1134,80,1054,0.070547
@SantePubliqueFr,145,2,143,0.013793


In [10]:
# to excel 
count.to_excel("src/resources/data/v2_count_actors_20211124_OG.xlsx")

## 2. Weekly total count
In the period of interest, for each week, for each actor,  
      get total number of tweets  
      get total number of tweets about covid  

In [31]:
# Extract year and week
df_all_sorted["year"] = df_all_sorted["date"].dt.year
df_all_sorted["week"] = df_all_sorted["date"].dt.isocalendar().week
df_yes_sorted["year"] = df_yes_sorted["date"].dt.year
df_yes_sorted["week"] = df_yes_sorted["date"].dt.isocalendar().week

In [37]:
# Correction for year 2021, week 53 of 2020 is counted as 53 for 2021 also
# Convert week 53 of 2021 into week 1
df_all_sorted.loc[(df_all_sorted["year"] == 2021) & (df_all_sorted["week"] == 53), "week"] = 1
df_yes_sorted.loc[(df_yes_sorted["year"] == 2021) & (df_yes_sorted["week"] == 53), "week"] = 1

In [40]:
df_yes_sorted[(df_yes_sorted["year"] == 2021) & (df_yes_sorted["week"] == 1)]

Unnamed: 0,tweet_id,covid_theme,created_at,handle,name,old_text,text,url,type,retweets,favorites,topic,subcat,position,frame,theme_hardcoded,date,year,week
64353,1344798152763858946,0,01/01/2021 00:10:39,@douane_france,Direction générale des douanes et droits indir...,,"Le #Brexit, c'est parti ! La nouvelle frontièr...",https://twitter.com/douane_france/status/13447...,New,44.0,53.0,601.0,60103.0,1.0,3.0,,2021-01-01,2021,1
64356,1344827432424136706,0,01/01/2021 02:07:00,@UN,United Nations,,"2020 has been a year of trials, tragedies and ...",https://twitter.com/UN/status/1344827432424136706,New,174.0,625.0,602.0,,,,,2021-01-01,2021,1
64360,1344916269271695362,1,01/01/2021 08:00:00,@EU_Commission,European Commission 🇪🇺,,Who decides on the EU budget and how does it w...,https://twitter.com/EU_Commission/status/13449...,New,67.0,160.0,603.0,,,,,2021-01-01,2021,1
64363,1344917023231537153,0,01/01/2021 08:03:00,@UN,United Nations,,#HappyNewYear!\n\nWishing all our followers a ...,https://twitter.com/UN/status/1344917023231537153,New,209.0,894.0,602.0,,,,,2021-01-01,2021,1
64368,1344923817378910208,1,01/01/2021 08:30:00,@DHSCgovuk,Department of Health and Social Care,,The #COVID19 vaccine is a huge step forward in...,https://twitter.com/DHSCgovuk/status/134492381...,New,18.0,85.0,602.0,,,,,2021-01-01,2021,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233467,1361299723001360391,1,04/01/2021 14:03:45,@Conservatives,Conservatives,RT @UniofOxford: We welcome today's news of th...,We welcome today's news of the first #OxfordVa...,https://twitter.com/Conservatives/status/13460...,Retweet,462.0,0.0,602.0,,,,,2021-01-04,2021,1
233768,1347183139626643456,1,07/01/2021 14:07:44,@DHSCgovuk,Department of Health and Social Care,"RT @BLMK_CCGs: Yesterday, Paul Wheatley, 83, w...","Yesterday, Paul Wheatley, 83, was the first pe...",https://twitter.com/DHSCgovuk/status/134718313...,Retweet,9.0,0.0,602.0,,,,,2021-01-07,2021,1
233809,1346779976952139776,1,06/01/2021 11:25:43,@EU_Health,EU_HEALTH - #SafeVaccines,RT @DSMeu: 🆕The Belgian #coronalert app now wo...,🆕The Belgian #coronalert app now works across ...,https://twitter.com/EU_Health/status/134677997...,Retweet,63.0,0.0,604.0,,,,,2021-01-06,2021,1
233855,1346815525960232962,1,06/01/2021 13:46:58,@RenewEurope,Renew Europe,,💉 #CovidVaccine: Renew Europe calls on the @EM...,https://twitter.com/RenewEurope/status/1346815...,New,7.0,23.0,602.0,,,,,2021-01-06,2021,1


In [41]:
# Get tweets counts per handle per year per week
g_all = pd.DataFrame(df_all_sorted.groupby(["handle", "year", "week"]).size())
g_all = g_all.rename({0: "count_all"}, axis=1)

g_yes = pd.DataFrame(df_yes_sorted.groupby(["handle", "year", "week"]).size())
g_yes = g_yes.rename({0: "count_covid"}, axis=1)

# Merge dataframes
weekly_count = g_all.join(g_yes, on=["handle", "year", "week"]).fillna(0)

# Add columns
weekly_count["count_covid"] = weekly_count["count_covid"].astype(int)
weekly_count["count_not_covid"] = weekly_count["count_all"] - weekly_count["count_covid"]
weekly_count

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count_all,count_covid,count_not_covid
handle,year,week,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
@10DowningStreet,2020,1,1,0,1
@10DowningStreet,2020,2,9,0,9
@10DowningStreet,2020,3,17,0,17
@10DowningStreet,2020,4,24,0,24
@10DowningStreet,2020,5,28,0,28
...,...,...,...,...,...
@vonderleyen,2021,9,20,10,10
@vonderleyen,2021,10,16,6,10
@vonderleyen,2021,11,30,12,18
@vonderleyen,2021,12,35,17,18


In [42]:
# To excel
weekly_count.to_excel("src/resources/data/v2_weekly_count_actors_20211125_OG.xlsx")