In [68]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import datetime
from tqdm.notebook import tqdm
from IPython.core.debugger import set_trace
import itertools
from pathlib import Path  
import glob
import os

## We have a few options to reduce the amount of tweets received:
1. reduce number of dates used (as in use a smaller interval like first half year of corona)
2. reduce number of keywords used. Mainly the last part of the keywords take a lot of time
3. take only most popular tweets?

In [51]:
dates_list = ['15/02/2022','25/01/2022','14/01/2022','18/12/2021','14/12/2021','26/11/2021','12/11/2021','02/11/2021',
              '14/09/2021','13/08/2021','09/07/2021','18/06/2021','28/05/2021','11/05/2021','20/04/2021','13/04/2021',
              '23/03/2021','09/03/2021','23/02/2021','02/02/2021','20/01/2021','12/01/2021','14/12/2020','17/11/2020',
              '03/11/2020','13/10/2020','28/09/2020','25/09/2020','01/09/2020','18/08/2020','06/08/2020','24/06/2020',
              '19/05/2020','06/05/2020','21/04/2020','31/03/2020','23/03/2020','15/03/2020','12/03/2020']
dates_list.reverse()
extra_filts = ' since:{0} until:{1} lang:nl'
# key_words = list(map(lambda x: x.strip(), '(corona maatregel) OR (corona maatregelen) OR coronamaatregelen OR coronamaatregel OR coronaregel OR coronaregels OR coronamaatregels OR (corona maatregels)\
#     OR (covid maatregel) OR (covid maatregelen) OR covidmaatregelen OR covidmaatregel OR covidregel OR covidregels OR covidmaatregels OR (covid maatregels)\
#     OR (covid19 maatregel) OR (covid19 maatregelen) OR covid19maatregelen OR covid19maatregel OR covid19regel OR covid19regels OR covid19maatregels OR (covid19 maatregels)\
#     OR (covid-19 maatregel) OR (covid-19 maatregelen) OR covid-19maatregelen OR covid-19maatregel OR covid-19regel OR covid-19regels OR covid-19maatregels OR (covid-19 maatregels)\
#     OR (sars-cov-2 maatregel) OR (sars-cov-2 maatregelen) OR sars-cov-2maatregelen OR sars-cov-2maatregel OR sars-cov-2regel OR sars-cov-2regels OR sars-cov-2maatregels OR (sars-cov-2 maatregels)\
#     OR lockdown OR persconferentie OR (1,5 meter) OR anderhalvemeter OR mondkapje OR mondkap OR mondmasker OR masker OR blijfthuis OR thuiswerken OR quarantaine OR thuisquarantaine OR (thuis quarantaine) OR testenvoortoegang OR (testen voor toegang)'.split('OR')))
key_words = list(map(lambda x: x.strip(), '(corona maatregel) OR (corona maatregelen) OR coronamaatregelen OR coronamaatregel OR coronaregel OR coronaregels OR coronamaatregels OR (corona maatregels)\
    OR (covid maatregel) OR (covid maatregelen) OR covidmaatregelen OR covidmaatregel OR covidregel OR covidregels OR covidmaatregels OR (covid maatregels)\
    OR (covid19 maatregel) OR (covid19 maatregelen) OR covid19maatregelen OR covid19maatregel OR covid19regel OR covid19regels OR covid19maatregels OR (covid19 maatregels)\
    OR (covid-19 maatregel) OR (covid-19 maatregelen) OR covid-19maatregelen OR covid-19maatregel OR covid-19regel OR covid-19regels OR covid-19maatregels OR (covid-19 maatregels)\
    OR (sars-cov-2 maatregel) OR (sars-cov-2 maatregelen) OR sars-cov-2maatregelen OR sars-cov-2maatregel OR sars-cov-2regel OR sars-cov-2regels OR sars-cov-2maatregels OR (sars-cov-2 maatregels)'.split('OR')))



In [84]:
def append_to_query_str(q_str, k_word):
    return '%s OR %s'%(q_str, k_word) if len(q_str) != 0 else k_word

def constr_query_groups(key_words, extra_filts):
    '''Divides keywords over several queries so the max query length
    and OR count set by Twitter are never reached'''
    query_lim = 400
    or_lim = 49
    q_groups = []
    q_str = ''
    
    i=0
    for k_word in key_words:
                  
        if not ((len(append_to_query_str(q_str, k_word)) + len(extra_filts)) > query_lim) and not i-1 > 50:
            q_str = append_to_query_str(q_str, k_word)
        else:
            q_groups.append(q_str +extra_filts)
            q_str = k_word
            i=0
    q_groups.append(q_str+extra_filts)
    
    return q_groups

def get_all_filenames(path, wildcard):
    filepath = Path(path)  
    filepath.parent.mkdir(parents=True, exist_ok=True)  
    full_path = os.path.join(filepath, wildcard)
    return glob.glob(os.path.join(filepath, wildcard))

def concat_dfs_from_files(files):
    df_from_each_file = (pd.read_csv(f) for f in files)
    return pd.concat(df_from_each_file, ignore_index=True)

def save_df(df, path):
    filepath = Path(path)  
    filepath.parent.mkdir(parents=True, exist_ok=True)  
    df.to_csv(filepath)
    
    
    
        

In [85]:
# Creating list to append tweet data to
tweets_list = []
query_groups = constr_query_groups(key_words, extra_filts)
path_prefix = 'data/maatregelen_tweets_per_persco_7dagen_'
data_path = 'data'
file_path = 'maatregelen_tweets_per_persco_7dagen_*'


for date in tqdm(dates_list):
    
    date = datetime.datetime.strptime(date, '%d/%m/%Y')
    end_date = date + datetime.timedelta(days=7)
    end_date = datetime.datetime.strftime(end_date, "%Y-%m-%d")
    date = date.strftime('%Y-%m-%d')
    print(f'{date} - {end_date}')
    
    for q in tqdm(query_groups):
        full_q = q.format(date, end_date)
        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(full_q).get_items()):
            tweets_list.append(tweet)
    
    tweets_df = pd.DataFrame(tweets_list)
    # filepath = Path(f'{path_prefix}{date}_{end_date}.csv')  
    # filepath.parent.mkdir(parents=True, exist_ok=True)  
    # tweets_df.to_csv(filepath)
    save_df(tweets_df, f'{path_prefix}{date}_{end_date}.csv')
    tweets_list = []
    

  0%|          | 0/39 [00:00<?, ?it/s]

2020-03-12 - 2020-03-19


  0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [55]:
tweets_df.describe()

Unnamed: 0.1,Unnamed: 0,id,replyCount,retweetCount,likeCount,quoteCount,conversationId,retweetedTweet,inReplyToTweetId,cashtags
count,3345.0,3345.0,3345.0,3345.0,3345.0,3345.0,3345.0,0.0,537.0,0.0
mean,1672.0,1.371923e+18,0.874738,4.832586,11.188042,0.511809,1.371125e+18,,1.373904e+18,
std,965.762652,8.436612e+16,4.536222,26.189799,67.65632,3.049815,8.443525e+16,,8.835174e+16,
min,0.0,1.032262e+18,0.0,0.0,0.0,0.0,1.032262e+18,,1.107356e+18,
25%,836.0,1.312002e+18,0.0,0.0,0.0,0.0,1.311969e+18,,1.302894e+18,
50%,1672.0,1.35701e+18,0.0,0.0,0.0,0.0,1.356923e+18,,1.362001e+18,
75%,2508.0,1.461067e+18,1.0,1.0,2.0,0.0,1.459513e+18,,1.463236e+18,
max,3344.0,1.495844e+18,103.0,538.0,1561.0,64.0,1.495844e+18,,1.495364e+18,


In [3]:
tweets_df = pd.read_csv(filepath)

In [4]:
tweets_df

Unnamed: 0.1,Unnamed: 0,url,date,content,renderedContent,id,user,replyCount,retweetCount,likeCount,...,media,retweetedTweet,quotedTweet,inReplyToTweetId,inReplyToUser,mentionedUsers,coordinates,place,hashtags,cashtags
0,0,https://twitter.com/bcagathos/status/123943955...,2020-03-16 06:33:13+00:00,--- COVID-19: Update 15-03-2020. ALLE BASKETBA...,--- COVID-19: Update 15-03-2020. ALLE BASKETBA...,1239439559710117888,"{'username': 'bcagathos', 'id': 74278274, 'dis...",0,0,0,...,,,,,,,,,,
1,1,https://twitter.com/POL_NO_Fryslan/status/1239...,2020-03-15 19:06:38+00:00,#Nederland - \n\nAangifte doen tijdens corona ...,#Nederland - \n\nAangifte doen tijdens corona ...,1239266774366982162,"{'username': 'POL_NO_Fryslan', 'id': 304485947...",0,3,4,...,,,,,,,"{'longitude': 5.55, 'latitude': 52.31666667}","{'fullName': 'Zeewolde, Nederland', 'name': 'Z...",['Nederland'],
2,2,https://twitter.com/CrisisNed20/status/1239232...,2020-03-15 16:52:05+00:00,"Aanvullende maatregelen onderwijs, horeca, spo...","Aanvullende maatregelen onderwijs, horeca, spo...",1239232915642466307,"{'username': 'CrisisNed20', 'id': 93439912, 'd...",0,4,0,...,,,,,,,"{'longitude': 5.357081, 'latitude': 51.3999536}","{'fullName': 'Eindhoven, Nederland', 'name': '...","['coronavirus', 'COVID19NL', 'covid19']",
3,3,https://twitter.com/WilliamWilde2/status/12391...,2020-03-15 11:56:40+00:00,@_roedel 14/03/2020:\nHet blijkt dat met de nu...,@_roedel 14/03/2020:\nHet blijkt dat met de nu...,1239158569397862400,"{'username': 'WilliamWilde2', 'id': 1713079837...",1,1,1,...,[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,1.239153e+18,"{'username': '_roedel', 'id': 227067264, 'disp...","[{'username': '_roedel', 'id': 227067264, 'dis...",,,['7dag'],
4,4,https://twitter.com/haarlem/status/12387197713...,2020-03-14 06:53:02+00:00,Mooi Brood geeft collega-ondernemers korting i...,Mooi Brood geeft collega-ondernemers korting i...,1238719771350159365,"{'username': 'haarlem', 'id': 17230430, 'displ...",0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3340,3340,https://twitter.com/sphilips16/status/14890597...,2022-02-03 02:14:44+00:00,Wim Voermans - Als de spoedwet niet was verlen...,Wim Voermans - Als de spoedwet niet was verlen...,1489059748460875777,"{'username': 'sphilips16', 'id': 1421995456591...",0,0,0,...,,,,,,,,,,
3341,3341,https://twitter.com/HiTMaNK84/status/148893082...,2022-02-02 17:42:26+00:00,@AmstelCharlotte #HetIsKlaar\nStatus: 02-02-20...,@AmstelCharlotte #HetIsKlaar\nStatus: 02-02-20...,1488930824942723072,"{'username': 'HiTMaNK84', 'id': 97141650269502...",0,0,1,...,[{'previewUrl': 'https://pbs.twimg.com/media/F...,,,1.487899e+18,"{'username': 'AmstelCharlotte', 'id': 10588291...","[{'username': 'AmstelCharlotte', 'id': 1058829...",,,"['HetIsKlaar', 'corona', 'mondkapje', 'CTB', '...",
3342,3342,https://twitter.com/Jeroen_Cee/status/14886311...,2022-02-01 21:51:49+00:00,@hbvl Vandaag geleerd in @hbvl : 22/02/2022 is...,@hbvl Vandaag geleerd in @hbvl : 22/02/2022 is...,1488631198456893448,"{'username': 'Jeroen_Cee', 'id': 250107921, 'd...",0,0,0,...,,,,1.488497e+18,"{'username': 'hbvl', 'id': 17918249, 'displayn...","[{'username': 'hbvl', 'id': 17918249, 'display...",,,,
3343,3343,https://twitter.com/Hageneesje51/status/147897...,2022-01-06 06:20:44+00:00,@lientje1967 @telegraaf Ik ben 2x gevaccineerd...,@lientje1967 @telegraaf Ik ben 2x gevaccineerd...,1478974796964802560,"{'username': 'Hageneesje51', 'id': 2325242305,...",0,0,0,...,,,,1.478712e+18,"{'username': 'lientje1967', 'id': 118084079, '...","[{'username': 'lientje1967', 'id': 118084079, ...",,,,


In [81]:
concat_tweets_df = concat_dfs_from_files(get_all_filenames(data_path, file_path))

data
data/maatregelen_tweets_per_persco_7dagen_*


In [86]:
save_df(concat_tweets_df, 'data/concatenated_tweets.csv')

In [88]:
tweets = pd.read_csv('data/concatenated_tweets.csv')
tweets

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,url,date,content,renderedContent,id,user,replyCount,retweetCount,...,media,retweetedTweet,quotedTweet,inReplyToTweetId,inReplyToUser,mentionedUsers,coordinates,place,hashtags,cashtags
0,0,0,https://twitter.com/JeroenBley1985/status/1484...,2022-01-20 23:54:38+00:00,@steeph Wat nu als de lockdown/maatregelen ook...,@steeph Wat nu als de lockdown/maatregelen ook...,1.484313e+18,"{'username': 'JeroenBley1985', 'id': 960258626...",0.0,0,...,,,,1.484210e+18,"{'username': 'steeph', 'id': 7692762, 'display...","[{'username': 'steeph', 'id': 7692762, 'displa...",,,,
1,1,1,https://twitter.com/notjacksparrow1/status/148...,2022-01-20 23:43:04+00:00,❗️#SarsCov2 van de A-lijst\n❗️Schrappen Corona...,❗️#SarsCov2 van de A-lijst\n❗️Schrappen Corona...,1.484311e+18,"{'username': 'notjacksparrow1', 'id': 10853133...",0.0,0,...,,,,,,,,,"['SarsCov2', 'EindeLockdownNu', 'GeenG', 'Geen...",
2,2,2,https://twitter.com/Lightpoint_Mark/status/148...,2022-01-20 23:35:19+00:00,@stemvooralvvd @Bouli21 @paulstolk Natuurlijk ...,@stemvooralvvd @Bouli21 @paulstolk Natuurlijk ...,1.484309e+18,"{'username': 'Lightpoint_Mark', 'id': 24662182...",1.0,0,...,,,,1.484306e+18,"{'username': 'stemvooralvvd', 'id': 1436395634...","[{'username': 'stemvooralvvd', 'id': 143639563...",,,,
3,3,3,https://twitter.com/mkeulemans/status/14843082...,2022-01-20 23:33:47+00:00,Briljante grafiek van @steeph: ziekenhuislast ...,Briljante grafiek van @steeph: ziekenhuislast ...,1.484308e+18,"{'username': 'mkeulemans', 'id': 15200788, 'di...",29.0,36,...,,,{'url': 'https://twitter.com/steeph/status/148...,,,"[{'username': 'steeph', 'id': 7692762, 'displa...",,,,
4,4,4,https://twitter.com/59_infinity/status/1484306...,2022-01-20 23:26:38+00:00,@DrDavidNL @fmeeus1 Laten we hopen dat je geli...,@DrDavidNL @fmeeus1 Laten we hopen dat je geli...,1.484306e+18,"{'username': '59_infinity', 'id': 115414385740...",1.0,0,...,,,,1.484299e+18,"{'username': 'DrDavidNL', 'id': 14732481760229...","[{'username': 'DrDavidNL', 'id': 1473248176022...",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396035,396035,3318,https://twitter.com/Drklwrs/status/14260819051...,2021-08-13 07:23:16+00:00,@Krishnafred @IAsonips @gringo_mt COVID-19 maa...,@Krishnafred @IAsonips @gringo_mt COVID-19 maa...,1.426082e+18,"{'username': 'Drklwrs', 'id': 290172519, 'disp...",1.0,0,...,,,,1.426079e+18,"{'username': 'Krishnafred', 'id': 1413448424, ...","[{'username': 'Krishnafred', 'id': 1413448424,...",,,,
396036,396036,3319,https://twitter.com/BeMilInterests/status/1426...,2021-08-13 06:52:19+00:00,@FrankNolf @AllaryMario @dhert_k @BelgiumDefen...,@FrankNolf @AllaryMario @dhert_k @BelgiumDefen...,1.426074e+18,"{'username': 'BeMilInterests', 'id': 133949716...",1.0,1,...,,,,1.425902e+18,"{'username': 'FrankNolf', 'id': 2195760446, 'd...","[{'username': 'FrankNolf', 'id': 2195760446, '...",,,,
396037,396037,3320,https://twitter.com/EllenHnsn/status/142606161...,2021-08-13 06:02:38+00:00,@jellehiemstra7 De bokaal voor opruiing gaat n...,@jellehiemstra7 De bokaal voor opruiing gaat n...,1.426062e+18,"{'username': 'EllenHnsn', 'id': 1922718913, 'd...",0.0,1,...,,,,1.426004e+18,"{'username': 'jellehiemstra7', 'id': 313361435...","[{'username': 'jellehiemstra7', 'id': 31336143...",,,"['Ab', 'Tweedeling', 'Vaccinatie', 'maatregele...",
396038,396038,3321,https://twitter.com/Quesitum/status/1426010142...,2021-08-13 02:38:06+00:00,@lidwienj Nu begrijp je gelijk waarom zij zo b...,@lidwienj Nu begrijp je gelijk waarom zij zo b...,1.426010e+18,"{'username': 'Quesitum', 'id': 110863350213474...",0.0,0,...,[{'thumbnailUrl': 'https://pbs.twimg.com/tweet...,,,1.425810e+18,"{'username': 'lidwienj', 'id': 42186084, 'disp...","[{'username': 'lidwienj', 'id': 42186084, 'dis...",,,['Covid_19'],
