## 1. Import packages

In [1]:
#!pip install tweepy

In [1]:
import os
import tweepy as tw
import pandas as pd
from tqdm import tqdm, notebook

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

## 2. Twitter API authentication

In [None]:
consumer_api_key = os.environ["TWITTER_CONSUMER_API_KEY"]
consumer_api_secret = os.environ["TWITTER_CONSUMER_API_SECRET"]

In [3]:
auth = tw.OAuthHandler(consumer_api_key, consumer_api_secret)

In [4]:
api = tw.API(auth, wait_on_rate_limit=True)

## 3. Tweets query

### 3.1. Define the query

In [5]:
search_words = "#covid19 -filter:retweets"
date_since = "2020-03-01"
# Collect tweets
tweets = tw.Cursor(api.search,
              q=search_words,
              lang="en",
              since=date_since).items(12452)

### 3.2. Retreive the tweets

In [6]:
tweets_copy = []
for tweet in tqdm(tweets):
     tweets_copy.append(tweet)

12452it [16:07, 12.87it/s] 


In [7]:
print(f"new tweets retrieved: {len(tweets_copy)}")

new tweets retrieved: 12452


## 4. Populate the dataset

In [8]:
tweets_df = pd.DataFrame()
for tweet in tqdm(tweets_copy):
    hashtags = []
    try:
        for hashtag in tweet.entities["hashtags"]:
            hashtags.append(hashtag["text"])
        text = api.get_status(id=tweet.id, tweet_mode='extended').full_text
    except:
        pass
    tweets_df = tweets_df.append(pd.DataFrame({'user_name': tweet.user.name, 
                                               'user_location': tweet.user.location,\
                                               'user_description': tweet.user.description,
                                               'user_created': tweet.user.created_at,
                                               'user_followers': tweet.user.followers_count,
                                               'user_friends': tweet.user.friends_count,
                                               'user_favourites': tweet.user.favourites_count,
                                               'user_verified': tweet.user.verified,
                                               'date': tweet.created_at,
                                               'text': text, 
                                               'hashtags': [hashtags if hashtags else None],
                                               'source': tweet.source,
                                               'is_retweet': tweet.retweeted}, index=[0]))

100%|██████████| 12452/12452 [3:24:19<00:00,  1.02it/s]


In [10]:
tweets_df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,"Jacquelyn Jhingree, Ph.D.","Canada, UK, Guyana 🌎","Chemistry PhD - @OfficialUoM,\nScientist in #Biotech - Vaccines & Drugs \n#massspectrometry #STEM👩🏽‍🔬 \nSoccer, Travel, Culture, Social Issues.\nViews my own.",2013-06-24 01:57:20,723,1712,8551,False,2020-08-30 20:31:26,"Disappointing that FDA Head seems to be acceding to political intervention. \nSuper risky to approve a #vaccine that hasn’t undergone the full course of clinical trials, with scientific rigour, to assess safety and efficacy. \nVaccine dev. usually takes 5-10 yrs! \n#COVID19 https://t.co/vYMdUfe1c7",[vaccine],Twitter for iPhone,False
0,Liu Xiaoming,"49/51 Portland Place, London",Chinese Ambassador to the UK. \nEmbassy account: @ChineseEmbinUK,2019-10-09 14:18:32,82103,14,48,True,2020-08-30 20:31:20,China's economy is rebounding and heralds a promising future thanks to effective #COVID19 prevention efforts. The international community has confidence in China’s economic growth and long-term development. https://t.co/BZMVkHc7Ya,[COVID19],Twitter Web App,False
0,Blues News Media,,Blues News Media wants to save the world by calling out the global oppressors. Join and help share information!,2020-02-18 15:19:55,41,90,466,False,2020-08-30 20:31:18,As we fast approach the fall and winter seasons the fear surrounding the #coronavirus will be ramped up again. Familiarize yourselves with what exactly is to come in this 2010 Rockefeller paper commonly know as Lock Step.\n#COVID19\n\nhttps://t.co/xzhgFDWewp,[coronavirus],Twitter Web App,False
0,Amesh Adalja,"Pittsburgh, Baltimore, NYC","Infectious disease MD working on pandemic policy, emerging infections, preventing bioterror https://t.co/Xnr2JIetFE",2010-12-16 20:41:15,31716,2008,2108,True,2020-08-30 20:31:07,In this @MSNBC interview with @LindseyReiser I discuss various aspects of #COVID19 including vaccines and Europe's experience https://t.co/eAgceMGtdR https://t.co/gSnlY3LPfi,[COVID19],Twitter Web App,False
0,InHisGraceITrust,Central fLORiDa :),"#WhoSoEver ,YOU MUST Know #JesusIs ♫♪♥♫ #HopeLives #Meditate 📘#itsNOTaboutSIN 🕯️😇 #GRACE #TRUTH #LOVE Mysteries of HisWord ⚔️ARE YOURS! #Jn317 🙌🏽🤗🤟🏽🦋🕊",2009-06-18 20:03:30,12132,12196,21548,False,2020-08-30 20:31:06,@travis_view @J_wisecrack Yale’s Study tests different messages about vaccinating against COVID-19 once the vaccine becomes available:\n\nhttps://t.co/3s9tpugSys\n\n#FabricatedNumbers #CoVid19 #PeopleMatter #BusinessofMedicalField\n#BusinessOfJusticeSystem\n#JusticeForAmerica \n#AmericaTheBrave🩸💧🇺🇸💚⚖️🔥👑🔥 https://t.co/f6vwOnLaFe,,Twitter for iPhone,False


## 5. Save the data

### 5.1. Read past data

In [11]:
tweets_old_df = pd.read_csv("covid19_tweets.csv")
print(f"past tweets: {tweets_old_df.shape}")

past tweets: (168264, 13)


  interactivity=interactivity, compiler=compiler, result=result)


### 5.2. Merge past and present data

In [12]:
tweets_all_df = pd.concat([tweets_old_df, tweets_df], axis=0)
print(f"new tweets: {tweets_df.shape[0]} past tweets: {tweets_old_df.shape[0]} all tweets: {tweets_all_df.shape[0]}")

new tweets: 12452 past tweets: 168264 all tweets: 180716


### 5.3. Drop duplicates

In [13]:
tweets_all_df.drop_duplicates(subset = ["user_name", "date", "text"], inplace=True)
print(f"all tweets: {tweets_all_df.shape}")

all tweets: (180716, 13)


### 5.4. Export the updated data

In [14]:
tweets_all_df.to_csv("covid19_tweets.csv", index=False)