# NLP COVID Vax Data

In [1]:
import pandas as pd
import numpy as np

pd.set_option("max_colwidth", 0)
pd.set_option('max_columns', 35)
np.set_printoptions(suppress=True) # Suppress scientific notation where possible

## Import & combine tweetIDs
Tweet IDs come from this dataset: https://github.com/gmuric/avax-tweets-dataset; the authors obtained them by searching twitter for anti-COVID-19 vaccine-related hashtags between October 2020 and April 2021

In [2]:
def combine_tweetIDs(year_month, days_in_month):
    ''' Imports text files containing tweetIDs and combines a month-worth of 
    tweetIDs into a single df
    '''
    tweet_list = []
    for day in range(1, days_in_month + 1):
        if day < 10:
            day = '0' + str(day)
        else:
            day = str(day)
        for hour in range(0,24):
            if hour < 10:
                hour = '0' + str(hour)
            else:
                hour = str(hour)
            try:
                link = ('avax-tweets-dataset/streaming-tweetids/' 
                            + year_month + '/'
                           + year_month + '-'
                            + day + '-'
                            +hour +'.txt'
                               )
                new_tweets = pd.read_csv(link, header=None)
                tweet_list.append(new_tweets)  
            except FileNotFoundError:
                print('File not found')
    return pd.concat(tweet_list, ignore_index=True)

### Collecting tweetIDs 1 month at a time & exporting each to Hydrator app

In [3]:
# November 2020

tweetIDs_2020_11 = combine_tweetIDs(year_month='2020-11', days_in_month=30)
tweetIDs_2020_11.head(3)

Unnamed: 0,0
0,1322795488928673792
1,1322795535565156353
2,1322795640275968000


In [4]:
tweetIDs_2020_11.shape

(202897, 1)

In [5]:
# exporting November 2020 tweetIDs

tweetIDs_2020_11.to_csv('my_twitter_data/tweetIDs_2020_11.csv', header=False, index=False)

In [6]:
# December 2020

tweetIDs_2020_12 = combine_tweetIDs(year_month='2020-12', days_in_month=31)
tweetIDs_2020_12.shape

(269915, 1)

In [7]:
# two rows in Dec 2020 say 'tweetid' instead of an id number

tweetIDs_2020_12[tweetIDs_2020_12[0] == 'tweetid']

Unnamed: 0,0
209554,tweetid
236997,tweetid


In [8]:
# dropping the 'tweetid' rows

tweetIDs_2020_12 = tweetIDs_2020_12.drop([209554, 236997])

In [9]:
tweetIDs_2020_12.shape

(269913, 1)

In [10]:
# exporting December 2020 tweetIDs

tweetIDs_2020_12.to_csv('my_twitter_data/tweetIDs_2020_12.csv', header=False, index=False)

In [11]:
# January 2021
# one file (17th at 15:00) was missing - so used try/except to skip it

tweetIDs_2021_01 = combine_tweetIDs(year_month='2021-01', days_in_month=31)
tweetIDs_2021_01.shape

File not found


(179787, 1)

In [12]:
# exporting Jan 2021 tweetIDs

tweetIDs_2021_01.to_csv('my_twitter_data/tweetIDs_2021_01.csv', header=False, index=False)

In [13]:
# February 2021

tweetIDs_2021_02 = combine_tweetIDs(year_month='2021-02', days_in_month=28)
tweetIDs_2021_02.shape

(202916, 1)

In [14]:
# exporting Feb 2021 tweetIDs

tweetIDs_2021_02.to_csv('my_twitter_data/tweetIDs_2021_02.csv', header=False, index=False)

In [15]:
# March 2021
# one file not found

tweetIDs_2021_03 = combine_tweetIDs(year_month='2021-03', days_in_month=31)
tweetIDs_2021_03.shape

File not found


(419432, 1)

In [16]:
# exporting March 2021 tweetIDs

tweetIDs_2021_03.to_csv('my_twitter_data/tweetIDs_2021_03.csv', header=False, index=False)

In [17]:
# April 2021 - dataset ends after the 20th

tweetIDs_2021_04 = combine_tweetIDs(year_month='2021-04', days_in_month=20)
tweetIDs_2021_04.shape

(484372, 1)

In [18]:
# exporting April 2021 tweetIDs

tweetIDs_2021_04.to_csv('my_twitter_data/tweetIDs_2021_04.csv', header=False, index=False)

## Combining hydrated Tweets 
Tweets have now been 'hydrated' by running through the Hydrator app: https://github.com/DocNow/hydrator


### Importing hydrated Tweets 

Some definitions:  
>- Quote_id only surfaces when the Tweet is a quote Tweet. The field contains the Tweet ID of the quoted Tweet  
>- Re-tweet_count = number of times this Tweet has been retweeted

In [19]:
# Re-importing hydrated November 2020 data

hydrated_2020_11 = pd.read_csv('my_twitter_data/hydrated_2020_11.csv')
hydrated_2020_11.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,lang,place,possibly_sensitive,quote_id,retweet_count,retweet_id,retweet_screen_name,source,text,tweet_url,user_created_at,user_id,user_default_profile_image,user_description,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name,user_statuses_count,user_time_zone,user_urls,user_verified
0,,Sun Nov 01 07:04:54 +0000 2020,,,https://twitter.com/LotusOak2/status/1322642287449591814,0,1322796743298265088,,,,en,,False,1.322642e+18,0,,,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",Stay away from that vaccine. Population control. https://t.co/DTMFGsNRzX,https://twitter.com/BAChatwin/status/1322796743298265088,Mon Sep 19 17:13:57 +0000 2011,376304718,False,☀️🇨🇦🌍Truth☀️🇨🇦🌍 Freedom ☀️🇨🇦🌏Health ☀️🇨🇦🌍Eyes wide open\nAwake patriot and CF veteran (13+yrs),1643,130,658,0,"Niagara, Ontario",Barb Chatwin,BAChatwin,2659,,,False
1,,Sun Nov 01 07:10:02 +0000 2020,,,,0,1322798033596272640,,,,en,,,,489,1.322434e+18,sunnynwaobi1,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","Isn't this a systematic depopulation happing in Obigbo?, where young unarmed citizens are randomly picked, some tortured while some are killed. #Obigbomassacre @StateDept @AmnestyNigeria @ https://t.co/TsakU0l3Fm",https://twitter.com/jumoke_ladan/status/1322798033596272640,Wed Jul 01 19:30:15 +0000 2009,52824297,False,Am a Christian by God's grace. Called by Christ to serve and love Him as a Catholic. Perpetually loving Him.,7009,224,359,1,"Lagos, Nigeria",Olajumoke Ladan,jumoke_ladan,1812,,,False
2,,Sun Nov 01 07:13:25 +0000 2020,,,,0,1322798887694905346,,,,en,,,,122,1.322792e+18,conspiracyb0t,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",Bill Gates is one of the biggest proponents of mass depopulation.,https://twitter.com/wambo161/status/1322798887694905346,Fri Apr 08 16:05:41 +0000 2011,279106686,False,Follower of Jesus.,27413,205,711,3,la montaña de roble,James of Oak Mountain,wambo161,48229,,,False
3,,Sun Nov 01 07:31:24 +0000 2020,,,,0,1322803412300431361,,,,en,,,1.322434e+18,41,1.322774e+18,Onyinye51838197,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",The world should be on notice that Nelson nwike river state governor and the Nigeria army are committing genocide and pogrom in Biafra land https://t.co/fTC8gNNSsZ,https://twitter.com/EmmanuelNwanyim/status/1322803412300431361,Tue Apr 21 07:16:59 +0000 2020,1252496422865375238,False,i was born to shine take it or live it...blessed son by God in heaven..🙌🏼🙌🏼🙌🏼..peace 👍🏻 ...Biafran citizen Israel 🇮🇱 citizen,8947,121,307,1,Israel,Emmanuel nwanyim,EmmanuelNwanyim,16345,,,False
4,,Sun Nov 01 07:01:51 +0000 2020,,,,1,1322795973337403395,bongzmessi,1.32217e+18,217711495.0,en,,,,0,,,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",@bongzmessi To come to think of it perhaps this was done so they multiply and outnumber us but. To them the depopulation agenda is at top of their list. Phela my King bake baphahluke labantu telling things they were not supposed like the one on an interview that they injected AIDS to blacks,https://twitter.com/Boniswa63207956/status/1322795973337403395,Mon May 25 12:43:06 +0000 2020,1264899714950062080,False,Mayibuye iAfrika✊🏾. \n\nI don't call my people Kings and Queens to tickle my throat it's because we are Royalty 👸🏾🤴🏿.\nTribalism 🤮Feminism 🤮Xenophobia🤮,39392,3658,3680,2,,Boniswa,Boniswa63207956,40107,,,False


In [20]:
hydrated_2020_11.shape

(40764, 35)

In [21]:
# Re-importing hydrated December 2020 data

hydrated_2020_12 = pd.read_csv('my_twitter_data/hydrated_2020_12.csv')
hydrated_2020_12.shape

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(84058, 35)

In [22]:
# Re-importing hydrated January 2021 data

hydrated_2021_01 = pd.read_csv('my_twitter_data/hydrated_2021_01.csv')
hydrated_2021_01.shape

(122966, 35)

In [23]:
# Re-importing hydrated February 2021 data

hydrated_2021_02 = pd.read_csv('my_twitter_data/hydrated_2021_02.csv')
hydrated_2021_02.shape

(144145, 35)

In [24]:
# Re-importing hydrated March 2021 data

hydrated_2021_03 = pd.read_csv('my_twitter_data/hydrated_2021_03.csv')
hydrated_2021_03.shape

(321807, 35)

In [25]:
# Re-importing hydrated April 2021 data

hydrated_2021_04 = pd.read_csv('my_twitter_data/hydrated_2021_04.csv')
hydrated_2021_04.shape

(386169, 35)

### Combining hydrated Tweets 

In [26]:
# concatenate dfs

vax_full = pd.concat([hydrated_2020_11, hydrated_2020_12, hydrated_2021_01, 
           hydrated_2021_02, hydrated_2021_03, hydrated_2021_04],
         ignore_index=True)
vax_full.head()

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,lang,place,possibly_sensitive,quote_id,retweet_count,retweet_id,retweet_screen_name,source,text,tweet_url,user_created_at,user_id,user_default_profile_image,user_description,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name,user_statuses_count,user_time_zone,user_urls,user_verified
0,,Sun Nov 01 07:04:54 +0000 2020,,,https://twitter.com/LotusOak2/status/1322642287449591814,0,1322796743298265088,,,,en,,False,1.322642e+18,0,,,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",Stay away from that vaccine. Population control. https://t.co/DTMFGsNRzX,https://twitter.com/BAChatwin/status/1322796743298265088,Mon Sep 19 17:13:57 +0000 2011,376304718,False,☀️🇨🇦🌍Truth☀️🇨🇦🌍 Freedom ☀️🇨🇦🌏Health ☀️🇨🇦🌍Eyes wide open\nAwake patriot and CF veteran (13+yrs),1643,130,658,0,"Niagara, Ontario",Barb Chatwin,BAChatwin,2659,,,False
1,,Sun Nov 01 07:10:02 +0000 2020,,,,0,1322798033596272640,,,,en,,,,489,1.322434e+18,sunnynwaobi1,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","Isn't this a systematic depopulation happing in Obigbo?, where young unarmed citizens are randomly picked, some tortured while some are killed. #Obigbomassacre @StateDept @AmnestyNigeria @ https://t.co/TsakU0l3Fm",https://twitter.com/jumoke_ladan/status/1322798033596272640,Wed Jul 01 19:30:15 +0000 2009,52824297,False,Am a Christian by God's grace. Called by Christ to serve and love Him as a Catholic. Perpetually loving Him.,7009,224,359,1,"Lagos, Nigeria",Olajumoke Ladan,jumoke_ladan,1812,,,False
2,,Sun Nov 01 07:13:25 +0000 2020,,,,0,1322798887694905346,,,,en,,,,122,1.322792e+18,conspiracyb0t,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",Bill Gates is one of the biggest proponents of mass depopulation.,https://twitter.com/wambo161/status/1322798887694905346,Fri Apr 08 16:05:41 +0000 2011,279106686,False,Follower of Jesus.,27413,205,711,3,la montaña de roble,James of Oak Mountain,wambo161,48229,,,False
3,,Sun Nov 01 07:31:24 +0000 2020,,,,0,1322803412300431361,,,,en,,,1.322434e+18,41,1.322774e+18,Onyinye51838197,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",The world should be on notice that Nelson nwike river state governor and the Nigeria army are committing genocide and pogrom in Biafra land https://t.co/fTC8gNNSsZ,https://twitter.com/EmmanuelNwanyim/status/1322803412300431361,Tue Apr 21 07:16:59 +0000 2020,1252496422865375238,False,i was born to shine take it or live it...blessed son by God in heaven..🙌🏼🙌🏼🙌🏼..peace 👍🏻 ...Biafran citizen Israel 🇮🇱 citizen,8947,121,307,1,Israel,Emmanuel nwanyim,EmmanuelNwanyim,16345,,,False
4,,Sun Nov 01 07:01:51 +0000 2020,,,,1,1322795973337403395,bongzmessi,1.32217e+18,217711495.0,en,,,,0,,,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",@bongzmessi To come to think of it perhaps this was done so they multiply and outnumber us but. To them the depopulation agenda is at top of their list. Phela my King bake baphahluke labantu telling things they were not supposed like the one on an interview that they injected AIDS to blacks,https://twitter.com/Boniswa63207956/status/1322795973337403395,Mon May 25 12:43:06 +0000 2020,1264899714950062080,False,Mayibuye iAfrika✊🏾. \n\nI don't call my people Kings and Queens to tickle my throat it's because we are Royalty 👸🏾🤴🏿.\nTribalism 🤮Feminism 🤮Xenophobia🤮,39392,3658,3680,2,,Boniswa,Boniswa63207956,40107,,,False


In [27]:
# almost 1.1 million rows of tweets

vax_full.shape

(1099909, 35)

## Exploring the combined dataset 

In [28]:
vax_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1099909 entries, 0 to 1099908
Data columns (total 35 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   coordinates                 239 non-null      object 
 1   created_at                  1099909 non-null  object 
 2   hashtags                    163543 non-null   object 
 3   media                       85056 non-null    object 
 4   urls                        240801 non-null   object 
 5   favorite_count              1099909 non-null  int64  
 6   id                          1099909 non-null  int64  
 7   in_reply_to_screen_name     234118 non-null   object 
 8   in_reply_to_status_id       225351 non-null   float64
 9   in_reply_to_user_id         234118 non-null   float64
 10  lang                        1099909 non-null  object 
 11  place                       8677 non-null     object 
 12  possibly_sensitive          313108 non-null   object 
 1

In [29]:
# convert quote_id and retweet_id to int (in order to suppress scientific notation)

vax_full['quote_id'] = vax_full['quote_id'].astype('Int64')
vax_full['retweet_id'] = vax_full['retweet_id'].astype('Int64')

### Retweets 

In [30]:
# more than half the 1.1 million tweets are re-tweets

(~(vax_full['retweet_id'].isnull())).sum()

640621

In [31]:
vax_full['retweet_id'].nunique()

75980

In [32]:
# the most common retweets

vax_full.groupby('retweet_id')['retweet_id'].count(). \
sort_values(ascending=False).head(10)

retweet_id
1379184293138857984    27902
1346602188953559040    20326
1379832806902206464    14516
1366957856323469312    6836 
1379200046957215744    6743 
1370200113977700352    5376 
1342085913782923264    4683 
1375160759324352512    3972 
1349810559986839552    3962 
1380906061813661696    3506 
Name: retweet_id, dtype: int64

In [33]:
# looks like retweeet text is all identical - makes sense
# not that this tweet w/ 27K re-tweets is actually pro-vaccine
# keyword was probably "vaxxed"

vax_full[vax_full['retweet_id'] == 1379184293138857984]

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,lang,place,possibly_sensitive,quote_id,retweet_count,retweet_id,retweet_screen_name,source,text,tweet_url,user_created_at,user_id,user_default_profile_image,user_description,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name,user_statuses_count,user_time_zone,user_urls,user_verified
788525,,Mon Apr 05 21:30:18 +0000 2021,,,,0,1379184647670730753,,,,en,,,,36735,1379184293138857984,GrillmoreSlim,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This lady on the plane said ""vaxxed and waxed, baby, I'm ready for some action on this trip."" and that should be everybody's vibe this summer.",https://twitter.com/lilakeys21/status/1379184647670730753,Tue Jul 27 01:28:03 +0000 2010,171298581,False,,2293,1297,1104,3,Jersey,The Cunt Conductor,lilakeys21,99162,,,False
788526,,Mon Apr 05 21:33:34 +0000 2021,,,,0,1379185470442201097,,,,en,,,,36735,1379184293138857984,GrillmoreSlim,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This lady on the plane said ""vaxxed and waxed, baby, I'm ready for some action on this trip."" and that should be everybody's vibe this summer.",https://twitter.com/50sombrasdegris/status/1379185470442201097,Mon May 12 22:42:57 +0000 2014,2543923811,False,becoming my better self / 🇵🇷,13477,593,443,3,enrique 💍,grey 🌪,50sombrasdegris,20423,,,False
788539,,Mon Apr 05 21:31:23 +0000 2021,,,,0,1379184920153649153,,,,en,,,,36735,1379184293138857984,GrillmoreSlim,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This lady on the plane said ""vaxxed and waxed, baby, I'm ready for some action on this trip."" and that should be everybody's vibe this summer.",https://twitter.com/trusttheunseen/status/1379184920153649153,Tue Jul 31 02:40:25 +0000 2012,727391077,False,Simply blessed.,12505,922,865,0,,E,trusttheunseen,23382,,,False
788545,,Mon Apr 05 21:32:04 +0000 2021,,,,0,1379185090123694083,,,,en,,,,36735,1379184293138857984,GrillmoreSlim,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This lady on the plane said ""vaxxed and waxed, baby, I'm ready for some action on this trip."" and that should be everybody's vibe this summer.",https://twitter.com/_FefeXO/status/1379185090123694083,Thu Jul 02 02:40:10 +0000 2009,52942895,False,25| cerified lover girl. 🏳️‍🌈🇵🇷 | Cancer 🌞Leo 🌛Capricorn Asc,33196,941,507,10,"Texas, USA",CHAMPAGNE MAMÍ✨,_FefeXO,94389,,,False
788549,,Mon Apr 05 21:29:44 +0000 2021,,,,0,1379184504837902337,,,,en,,,,36735,1379184293138857984,GrillmoreSlim,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This lady on the plane said ""vaxxed and waxed, baby, I'm ready for some action on this trip."" and that should be everybody's vibe this summer.",https://twitter.com/BryanVillone/status/1379184504837902337,Thu Jan 29 17:21:23 +0000 2009,19721340,False,Stand Up Comedian. Cheapskate. IG: BryanVillone,191,399,200,9,NJ,Global megastar. Hero to many peasant.,BryanVillone,28458,,https://www.eventbrite.com/e/down-to-the-last-bit-tickets-167885274375?ref=eios,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1082848,,Tue Apr 20 00:47:58 +0000 2021,,,,0,1384307823417303041,,,,en,,,,36735,1379184293138857984,GrillmoreSlim,"<a href=""http://www.echofon.com/"" rel=""nofollow"">Echofon</a>","This lady on the plane said ""vaxxed and waxed, baby, I'm ready for some action on this trip."" and that should be everybody's vibe this summer.",https://twitter.com/drollpatrol/status/1384307823417303041,Thu Dec 17 16:26:27 +0000 2009,97475545,False,Black Lives Matter (https://t.co/IVcaM20Hrc)\n\nSocial Distancing Expert\n\nKing Lear Act 4 Scene 6 Line 60\n\nShe/her\n\nCashApp $DrollPatrol\n\n\n(Painting by Kyra Kendall),18183,410,1348,38,Inner World,Delagrammatikas,drollpatrol,13632,,,False
1083946,,Tue Apr 20 03:01:09 +0000 2021,,,,0,1384341339702235143,,,,en,,,,36735,1379184293138857984,GrillmoreSlim,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This lady on the plane said ""vaxxed and waxed, baby, I'm ready for some action on this trip."" and that should be everybody's vibe this summer.",https://twitter.com/kaushiiveeno/status/1384341339702235143,Fri Apr 01 20:23:42 +0000 2011,275697595,False,can neither confirm or deny that im a gemini,46157,279,193,2,tor-iunno,♔ kay ♔,kaushiiveeno,20926,,,False
1085584,,Tue Apr 20 07:08:29 +0000 2021,,,,0,1384403582158479362,,,,en,,,,36735,1379184293138857984,GrillmoreSlim,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","This lady on the plane said ""vaxxed and waxed, baby, I'm ready for some action on this trip."" and that should be everybody's vibe this summer.",https://twitter.com/whathefaz/status/1384403582158479362,Mon Aug 16 06:05:35 +0000 2010,178989564,False,Positive outcomes only. #CODEBLACKMY\n#PelajarBukanPengantin #SchoolNotSpouse #MigranJugaManusia #PrayforPalestine #PrayforMyanmar #HutanPergiMana,205767,553,94,3,,Faz,whathefaz,153716,,http://instagram.com/faz367_,False
1090387,,Tue Apr 20 16:26:06 +0000 2021,,,,0,1384543908655517698,,,,en,,,,36735,1379184293138857984,GrillmoreSlim,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This lady on the plane said ""vaxxed and waxed, baby, I'm ready for some action on this trip."" and that should be everybody's vibe this summer.",https://twitter.com/trcpicalove/status/1384543908655517698,Fri Sep 22 22:14:24 +0000 2017,911353010957246466,False,i solve my problems by blatantly ignoring them and going on the internet,3753,16,100,0,she/her | 23 | hungary,mónika 🥀,trcpicalove,8927,,,False


### Quote tweets 

In [34]:
# 292K out of 1.1 million tweets are quotes

(~(vax_full['quote_id'].isnull())).sum()

292572

In [35]:
# 156K are both quotes and retweets

((~(vax_full['quote_id'].isnull())) & \
(~(vax_full['retweet_id'].isnull()))).sum()

156634

In [36]:
# the most common quote tweets

vax_full.groupby('quote_id')['quote_id'].count(). \
sort_values(ascending=False).head(10)

quote_id
1346525695879614464    20486
1372504222248415232    3809 
1377647623339905024    2990 
1342085913782923264    2815 
1379184293138857984    2400 
1333129526826721280    2160 
1362783010597310464    2072 
1346602188953559040    1879 
1370735174137298944    1777 
1380606469004009472    1676 
Name: quote_id, dtype: int64

In [37]:
# looks like quote tweets are sometimes identical, but sometimes not

vax_full[vax_full['quote_id'] == 1346525695879614464].head(10)

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,lang,place,possibly_sensitive,quote_id,retweet_count,retweet_id,retweet_screen_name,source,text,tweet_url,user_created_at,user_id,user_default_profile_image,user_description,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name,user_statuses_count,user_time_zone,user_urls,user_verified
137165,,Tue Jan 05 20:17:37 +0000 2021,,,https://twitter.com/therecount/status/1346525695879614467,23,1346551449199775746,,,,en,,False,1346525695879614464,4,,,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>","Yeah, because what you *REALLY* want is unvaccinated people handling the food supply because they don't have citizenship paperwork. 🙄🙄🙄🙄🙄 https://t.co/4gfG58eEK1",https://twitter.com/RGibsongirl/status/1346551449199775746,Fri Nov 20 20:57:15 +0000 2009,91423884,False,Corset scholar; bioanth PhD; chaotic neutral--harlequin aspect; bibliophile; tattoo collector; advocate for the dead; robot sex analyst; bad example for hire.,117226,5616,6176,16,,Dr. Rebecca Gibson,RGibsongirl,42698,,http://amazon.com/author/rebeccagibsonbioanthro,False
137185,,Tue Jan 05 20:22:37 +0000 2021,,,,0,1346552706006040576,,,,en,,,1346525695879614464,4,1.3465514491997755e+18,RGibsongirl,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","Yeah, because what you *REALLY* want is unvaccinated people handling the food supply because they don't have citizenship paperwork. 🙄🙄🙄🙄🙄 https://t.co/4gfG58eEK1",https://twitter.com/Jerri_Lynn25/status/1346552706006040576,Sat Nov 09 03:16:22 +0000 2019,1193004107759529986,False,"“𝒯𝒽𝑒 𝓃𝑒𝓍𝓉 𝓉𝒾𝓂𝑒 𝓈𝑜𝓂𝑒𝑜𝓃𝑒‘𝓈 𝓉𝑒𝒶𝒸𝒽𝒾𝓃𝑔, 𝓌𝒽𝓎 𝒹𝑜𝓃‘𝓉 𝓎𝑜𝓊 𝑔𝑒𝓉 𝓉𝒶𝓊𝑔𝒽𝓉?“",109620,5219,2887,7,Here & There,JerriLynn,Jerri_Lynn25,72221,,,False
137199,,Tue Jan 05 20:17:56 +0000 2021,,,,0,1346551528849633280,,,,en,,,1346525695879614464,4,1.3465514491997755e+18,RGibsongirl,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>","Yeah, because what you *REALLY* want is unvaccinated people handling the food supply because they don't have citizenship paperwork. 🙄🙄🙄🙄🙄 https://t.co/4gfG58eEK1",https://twitter.com/SilviPastured/status/1346551528849633280,Sat Feb 22 17:32:01 +0000 2020,1231270345300160518,False,Settler focused on Earth Service via climate-resilient food security.\n\n#NoPrideInGenocide #LandBack\n\n#KeepItInTheGround #ClimateStrike #Divest #FoodNotLawns,192627,1335,2350,13,So-called Canada,Practice Courage,SilviPastured,103084,,,False
137201,,Tue Jan 05 20:18:33 +0000 2021,,,,0,1346551683598446598,,,,en,,,1346525695879614464,4,1.3465514491997755e+18,RGibsongirl,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","Yeah, because what you *REALLY* want is unvaccinated people handling the food supply because they don't have citizenship paperwork. 🙄🙄🙄🙄🙄 https://t.co/4gfG58eEK1",https://twitter.com/dalydes/status/1346551683598446598,Sun Apr 03 16:08:33 +0000 2011,276555350,False,Attempting to ally. #BLMTO #wetsuwetenstrong #MMIW #RightsnotRescue #Antifa Dish with One Spoon He/his,176444,892,4373,42,Toronto,mind that magnifies the smallest matter,dalydes,87929,,,False
137219,,Tue Jan 05 20:36:39 +0000 2021,,,,0,1346556235764535299,,,,en,,,1346525695879614464,43,1.346556164214055e+18,LouisatheLast,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",This is ridiculous. Do you want sick people handling your food? Do you think vaccines work so perfectly that unvaccinated workers won’t get vaccinated coworkers sick? What the fuck https://t.co/oBA9x2ji83,https://twitter.com/aboynamedart/status/1346556235764535299,Sun Nov 23 23:11:55 +0000 2008,17580293,False,"Team @erumors. He/His. @Racialicious Forever. Bylines: SyfyWire, Motherboard, Rolling Stone, Raw Story. Notorious anti-white racist.' -- Breitbart",44646,4103,4810,217,,Arturo R. Garcia,aboynamedart,153723,,http://racialicious.com,False
137225,,Tue Jan 05 20:37:37 +0000 2021,,,,0,1346556482683285504,,,,en,,,1346525695879614464,43,1.346556164214055e+18,LouisatheLast,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",This is ridiculous. Do you want sick people handling your food? Do you think vaccines work so perfectly that unvaccinated workers won’t get vaccinated coworkers sick? What the fuck https://t.co/oBA9x2ji83,https://twitter.com/SouthrnGothHick/status/1346556482683285504,Sun Jul 04 02:01:47 +0000 2010,162566080,False,"Paul\n\nbisexual with an aquarium. might be scotts irish?\n|\nstudied Latin & gender; now, studies biochem\n|\nlikes: peppers, snakes, & spiders\n|\nThey/Them/Y'all.",334002,1319,2679,2,Alabama,crass iron skillet,SouthrnGothHick,125234,,,False
137227,,Tue Jan 05 20:38:35 +0000 2021,,,,0,1346556723423801346,,,,en,,,1346525695879614464,43,1.346556164214055e+18,LouisatheLast,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is ridiculous. Do you want sick people handling your food? Do you think vaccines work so perfectly that unvaccinated workers won’t get vaccinated coworkers sick? What the fuck https://t.co/oBA9x2ji83,https://twitter.com/bunrxm/status/1346556723423801346,Tue Nov 24 00:36:59 +0000 2009,92157633,False,enby bun. they/them. 🐰🤓🤖,167918,1937,1945,9,¯\_(ツ)_/¯,rxbun,bunrxm,994275,,,False
137228,,Tue Jan 05 20:48:25 +0000 2021,,,,0,1346559198247727110,,,,en,,,1346525695879614464,43,1.346556164214055e+18,LouisatheLast,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",This is ridiculous. Do you want sick people handling your food? Do you think vaccines work so perfectly that unvaccinated workers won’t get vaccinated coworkers sick? What the fuck https://t.co/oBA9x2ji83,https://twitter.com/SamanthaEich/status/1346559198247727110,Sat Nov 05 05:26:19 +0000 2011,405338829,False,"librarian, couch potato, very enthusiastic talker | she/her",3924,131,251,1,a comfy chair,Samantha 🏳️‍🌈,SamanthaEich,5894,,,False
137234,,Tue Jan 05 20:36:22 +0000 2021,,,https://twitter.com/therecount/status/1346525695879614467,157,1346556164214054918,,,,en,,False,1346525695879614464,43,,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is ridiculous. Do you want sick people handling your food? Do you think vaccines work so perfectly that unvaccinated workers won’t get vaccinated coworkers sick? What the fuck https://t.co/oBA9x2ji83,https://twitter.com/LouisatheLast/status/1346556164214054918,Tue Dec 02 00:47:39 +0000 2008,17794636,False,"Big mean sweaty dyke just looking for trouble. Friend of Garak. Cis-ish, she/her, graphic designer, race traitor, obesity glorifier. Opinions all mine. 🌹🍞🌹",334406,33261,2127,236,,Louisa 🌈👭,LouisatheLast,193614,,,False
137235,,Tue Jan 05 20:37:34 +0000 2021,,,,0,1346556469362249728,,,,en,,,1346525695879614464,43,1.346556164214055e+18,LouisatheLast,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",This is ridiculous. Do you want sick people handling your food? Do you think vaccines work so perfectly that unvaccinated workers won’t get vaccinated coworkers sick? What the fuck https://t.co/oBA9x2ji83,https://twitter.com/ForthWyn/status/1346556469362249728,Sat May 30 16:24:25 +0000 2009,43551122,False,"Eve, 26, fantasy writer, web developer in training, Salford Uni graduate. She/her",41385,575,1958,35,England,📖Eve (she/her),ForthWyn,100540,,,False


### Language 

In [38]:
# most are in English, but not all

vax_full['lang'].value_counts()

en     991130
und    53757 
fr     12193 
es     7604  
ja     5112  
de     4842  
nl     4571  
tr     2266  
it     2249  
ar     1777  
pt     1572  
gu     1511  
in     1445  
pl     1412  
zh     882   
hi     778   
sv     709   
ca     536   
et     528   
el     522   
ko     423   
tl     415   
ht     383   
ru     359   
fi     345   
da     320   
cs     296   
ta     239   
sr     202   
ro     197   
iw     196   
sl     162   
no     136   
th     128   
cy     111   
te     105   
fa     89    
lt     88    
lv     64    
hu     61    
is     40    
eu     34    
ur     25    
bg     17    
uk     16    
ne     10    
mr     9     
vi     9     
kn     7     
am     6     
ml     6     
bn     5     
dv     4     
or     3     
pa     2     
ps     1     
Name: lang, dtype: int64

In [39]:
# 'und' means language was undetected
# seems like these tweets don't have much text

vax_full[vax_full['lang'] == 'und']

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,lang,place,possibly_sensitive,quote_id,retweet_count,retweet_id,retweet_screen_name,source,text,tweet_url,user_created_at,user_id,user_default_profile_image,user_description,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name,user_statuses_count,user_time_zone,user_urls,user_verified
17,,Sun Nov 01 07:09:57 +0000 2020,Nomask MaskOFF NoVaccine,https://twitter.com/Schille18336614/status/1322798014566703104/photo/1,,0,1322798014566703104,FOX29philly,1.322796e+18,1.478771e+07,und,,False,,0,,,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",@FOX29philly #Nomask #MaskOFF #NoVaccine https://t.co/55ZzqhJFo8,https://twitter.com/Schille18336614/status/1322798014566703104,Mon Sep 14 02:01:02 +0000 2020,1305325569815515136,False,„Sei ein Freund der Schwachen und liebe die Gerechtigkeit.“\n\n#Testboykott #NeinzumImpfzwang\n#RücktrittBundesregierung ⚔ #impfapartheid #Ungehorsam,7499,231,389,0,Linköping,Schiller ✝⚔⛪ 🇩🇪🇦🇹,Schille18336614,7577,,https://report24.news/,False
20,,Sun Nov 01 07:15:10 +0000 2020,,,https://twitter.com/sunnynwaobi1/status/1322434241829982208,0,1322799327044161537,ARISEtv,,1.087779e+09,und,,False,1322434241829982208,0,,,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",@ARISEtv https://t.co/n5y4J0oAlU,https://twitter.com/kenzykayoficial/status/1322799327044161537,Mon Jan 03 16:02:06 +0000 2011,233576830,False,SEO Specialist / music lover / Digital Marketer / Red Devil,124,339,898,3,Nigeria,KenzyKay ¦ 20-10-20💔,kenzykayoficial,5179,,,False
40,,Sun Nov 01 07:35:59 +0000 2020,,,https://twitter.com/conspiracyb0t/status/1322791741854937089,0,1322804564559499265,ashwani_mahajan,,2.156856e+08,und,,False,1322791741854937088,0,,,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",@ashwani_mahajan https://t.co/camEcwIpvo,https://twitter.com/devsr84/status/1322804564559499265,Fri Apr 23 07:18:45 +0000 2010,136185881,False,"Working on Ergonomics, IITian, Blogger, Tech Enthusiast, Wannabe Entrepreneur, Student of History & Geo-Politics ...",6391,704,4980,15,Mumbai,Devindra Singh,devsr84,55837,,https://www.twitter.com/devsr84,False
50,,Sun Nov 01 07:42:47 +0000 2020,EndSARS EndMilitaryBrutality EndPoliceBrutalityinNigera EndBadGoveranceInNigeria,,,0,1322806277760864256,,,,und,,,1322434241829982208,2,1322611480898973696,Bubblybee_Chi,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",#EndSARS\n#EndMilitaryBrutality \n#EndPoliceBrutalityinNigera \n#EndBadGoveranceInNigeria https://t.co/ykBOkVRRLc,https://twitter.com/IkeLObidike/status/1322806277760864256,Tue Feb 22 10:38:12 +0000 2011,255949363,False,"Chemical Engineer (PhD), Author (Shifting Sands), Pro-Life, Non-absent Dad, Football Analyst, Arsenal fan, Scrabbler, Pragmatists, Humanist, Christian.",8468,419,329,8,Johannesburg,Dr. Ike Obidike,IkeLObidike,12310,,,False
72,,Sun Nov 01 08:18:55 +0000 2020,,,https://twitter.com/ForcedAdoption1/status/1322810228988026880,0,1322815370332266499,hakki501,1.322810e+18,4.013243e+08,und,,False,1322810228988026880,0,,,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",https://t.co/H9H559xnhp,https://twitter.com/hakki501/status/1322815370332266499,Sun Oct 30 11:27:55 +0000 2011,401324299,False,"Nothing to boast of except the cross of Jesus Christ.\nSaved by His grace through faith. (Ephesians 2: 8,9)",26734,905,1377,50,Temporary residence: Earth. Pe,J.,hakki501,29961,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099727,,Wed Apr 21 06:27:00 +0000 2021,Mask Masks FaceMask FaceMasks FaceDiaper FaceDiapers MaskMandate MaskMandates MedicalFreedom,,https://www.instagram.com/p/CN6vxB0AjXd/?igshid=lqy30ohrdn9b,0,1384755531299045376,,,,und,,False,,0,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",#Mask #Masks #FaceMask #FaceMasks #FaceDiaper #FaceDiapers #MaskMandate #MaskMandates #MedicalFreedom https://t.co/naFkIu1W6F,https://twitter.com/JMichW87/status/1384755531299045376,Tue Jun 16 16:17:56 +0000 2020,1272926339696398336,True,,383,6,222,0,"Hollywood, FL",Jamie White,JMichW87,1639,,,False
1099790,,Wed Apr 21 06:39:45 +0000 2021,,,https://twitter.com/kurtwearshats/status/1384564021202882565,1,1384758740100861955,,,,und,"Irvine, CA",False,1384564021202882560,0,,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",😂😂😂😂😂 https://t.co/dlBkxWQyZ2,https://twitter.com/angel0bugatti/status/1384758740100861955,Sat May 12 19:28:45 +0000 2012,578330624,False,Middle-aged black man with sass. Big butt. Bigger heart. | Debater.,36628,268,930,4,"Irvine, CA",Angelo,angel0bugatti,46080,,https://vsco.co/angelo-bugatti,False
1099799,,Wed Apr 21 06:43:15 +0000 2021,,,https://montanadailygazette.com/2021/04/16/unvaccinated-women-report-miscarriages-after-interactions-with-vaccinated-people/,0,1384759617528340483,,,,und,,False,,0,,,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",https://t.co/rZmyXHcIZi,https://twitter.com/AlainBeydoun/status/1384759617528340483,Thu Mar 08 08:52:28 +0000 2012,518323576,False,,67,13,28,0,cairns,byblos cafe,AlainBeydoun,138,,http://bybloscafe.com.au,False
1099813,,Wed Apr 21 06:39:54 +0000 2021,,,,0,1384758774301224962,,,,und,,,1384564021202882560,49,1384738849180450816,KingJosiah54,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",😂😂😂😂😂😂 https://t.co/EuGgqxKscL,https://twitter.com/SpcMtn/status/1384758774301224962,Fri Apr 10 03:46:16 +0000 2009,30153135,False,"As long as I’m in Polo smiling, they think they got me But they would try to crack me if they ever see a black me.",34189,294,3370,8,somewhr in outerspace,Petty Mahomes,SpcMtn,35105,,,False


### Hashtags 

In [40]:
# most commonly used hashtags (although some of these may appear common because of retweets)
# some are clearly anti-vaccine, while others are neutral

vax_full['hashtags'].value_counts().head(30)

NoVaccineForMe                              4368
COVID19                                     4266
mybodymychoice                              3514
FireFauci WeWillNotComply                   3118
MyBodyMyChoice                              3018
InformedConsent                             2555
VaccineforSouthAfrica                       2197
BillGatesBioTerrorist                       2154
ArrestBillGates                             2027
NoVaccine                                   1893
IDoNotConsent                               1284
COVID                                       850 
depopulation                                837 
vaccine                                     700 
LongCovid                                   690 
BREAKING COVID19                            683 
GreatReset                                  675 
vaccines                                    637 
VAXXED                                      537 
arrestbillgates                             530 
ScottyDoesNothing   

## Clean dataset

In [41]:
# full dataset contains 45K unique tweets 

vax_full['text'].nunique()

457573

In [42]:
# dropping duplicate tweets

vax_tweets = vax_full.drop_duplicates(subset='text', keep='first')
vax_tweets.shape

(457573, 35)

In [43]:
# keeping only tweets that are in English

vax_tweets = vax_tweets[vax_tweets['lang'] == 'en']
vax_tweets.shape

(396605, 35)

In [44]:
vax_tweets.to_csv('vax_tweets.csv', index=False)