In [3]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [20]:
import yaml

# Load the notebook configuration (the "api_creds" section is read further down).
# Decode explicitly as UTF-8 instead of relying on the platform's default
# locale encoding, so the same config.yml parses identically on every machine.
with open("config.yml", "r", encoding="utf-8") as ymlfile:
    cfg = yaml.safe_load(ymlfile)


In [67]:
### SETUP ###

import datetime
import functools

import pandas as pd
import tweepy as tw

# Unpack the four Twitter credentials from the "api_creds" section of the
# loaded config into module-level constants.
api_creds = cfg['api_creds']
API_KEY, API_SECRET, ACCESS_TOKEN, ACCESS_SECRET = (
    api_creds[cred_name]
    for cred_name in ('API_KEY', 'API_SECRET', 'ACCESS_TOKEN', 'ACCESS_SECRET')
)

class TwitterApi:
    """Small wrapper around a tweepy client that gathers search results into a DataFrame.

    Constructing an instance authenticates immediately and then queries the
    rate-limit status, so the module-level credentials (API_KEY, API_SECRET,
    ACCESS_TOKEN, ACCESS_SECRET) must be defined before instantiation.
    """

    def __init__(self):
        # tweepy API client; set by authenticate_api().
        self.api = None
        # Raw tweet payloads (the `_json` dict of every status fetched so far).
        self.raw_tweets = []
        # Flattened view of raw_tweets; rebuilt by get_clean_tweets().
        self.clean_tweets_df = pd.DataFrame()

        self.authenticate_api()
        self.check_rate_limit()

    def exception_handler(func):
        """Decorator that reports a TypeError raised by ``func`` instead of propagating it.

        Used only as a decorator inside this class body (it is not meant to be
        called on an instance). The wrapped function's return value is passed
        through; when a TypeError is caught the wrapper prints a message and
        returns None. Any other exception type propagates unchanged.
        """
        @functools.wraps(func)
        def inner_function(*args, **kwargs):
            try:
                # Return the result so decorated methods keep their return value.
                return func(*args, **kwargs)
            except TypeError as exc:
                print(f"{func.__name__} error: {exc}")
                return None
        return inner_function

    @exception_handler
    def authenticate_api(self, api_key=None, api_secret=None,
                         access_token=None, access_secret=None):
        """Create the tweepy client and store it on ``self.api``.

        Args:
            api_key: Consumer key; defaults to the module-level API_KEY.
            api_secret: Consumer secret; defaults to the module-level API_SECRET.
            access_token: Access token; defaults to the module-level ACCESS_TOKEN.
            access_secret: Access token secret; defaults to the module-level
                ACCESS_SECRET.
        """
        # Resolve defaults at call time (not at class-definition time) so that
        # re-running the credentials cell is picked up without redefining the class.
        api_key = API_KEY if api_key is None else api_key
        api_secret = API_SECRET if api_secret is None else api_secret
        access_token = ACCESS_TOKEN if access_token is None else access_token
        access_secret = ACCESS_SECRET if access_secret is None else access_secret

        auth = tw.OAuthHandler(api_key, api_secret)
        auth.set_access_token(access_token, access_secret)
        self.api = tw.API(auth, wait_on_rate_limit=True)

    @exception_handler
    def check_rate_limit(self):
        """Return whatever ``self.api.rate_limit_status()`` reports.

        Returns None if the call raises a TypeError (handled by the decorator).
        """
        return self.api.rate_limit_status()

    def get_raw_tweets_from_api(self, q, date_range, lang, count):
        """Run one search per entry of ``date_range`` and store the raw payloads.

        Args:
            q: Search query string passed to the API.
            date_range: Iterable of date values; each one is passed as ``since``.
            lang: Language filter passed to the API.
            count: Requested number of results per search call.

        The ``_json`` payload of every returned status is appended to
        ``self.raw_tweets``. Nothing is appended if any search call raises,
        because the results are collected locally first.
        """
        # NOTE(review): each search is a single, unpaginated call; the sample run
        # in this notebook asked for count=500 and got 100 rows back. If more
        # results are needed, pagination (e.g. tweepy.Cursor) is probably required.
        fetched = [
            status._json
            for since_date in date_range
            for status in self.api.search(
                q=q,
                lang=lang,
                since=since_date,
                count=count,
            )
        ]
        self.raw_tweets.extend(fetched)

    def get_date_range(self, start, end):
        """Return the weekly dates between ``start`` and ``end`` as 'YYYY-MM-DD' strings.

        Args:
            start: Start of the range (anything ``pd.to_datetime`` accepts).
            end: End of the range; when falsy it defaults to ``start`` + 7 days.

        Returns:
            A pandas Index of formatted date strings.
        """
        if not end:
            end = pd.to_datetime(start) + pd.DateOffset(days=7)
        # NOTE(review): pandas' "W" alias is anchored to Sundays, so `start` itself
        # is only part of the result when it falls on a Sunday, and a range that
        # contains no Sunday yields no dates at all -- confirm this is intended.
        weekly_dates = pd.date_range(start=start, end=end, freq="W")
        return weekly_dates.strftime('%Y-%m-%d')

    def get_clean_tweets(self):
        """Flatten ``self.raw_tweets`` into ``self.clean_tweets_df`` via ``pd.json_normalize``."""
        self.clean_tweets_df = pd.json_normalize(self.raw_tweets)

    def get_twitter_data_as_dataframe(
        self,
        q,
        start,
        end=None,
        lang="en",
        count=500,
    ):
        """Fetch tweets for ``q`` and return them as a flattened DataFrame.

        Args:
            q: Search query string.
            start: Start date of the search range.
            end: End date of the search range; None means ``start`` + 7 days.
            lang: Language filter.
            count: Requested number of results per search call.

        Returns:
            ``self.clean_tweets_df``, rebuilt from every tweet in ``self.raw_tweets``.
            Because raw_tweets is only ever appended to, repeated calls on the
            same instance accumulate results.
        """
        date_range = self.get_date_range(start, end)
        self.get_raw_tweets_from_api(q, date_range, lang, count)
        self.get_clean_tweets()
        return self.clean_tweets_df
    

In [73]:
# Search parameters for this run.
SEARCH_QUERY = "#globalwarming -filter:retweets"
LANG = "en"
COUNT = 500
START = "2021-01-01"
# Leaving END as None limits the fetch to a single week of data.
END = None

api = TwitterApi()
tweet_df = api.get_twitter_data_as_dataframe(
    q=SEARCH_QUERY,
    start=START,
    end=END,
    lang=LANG,
    count=COUNT,
)

In [74]:
# Dimensions of the flattened tweet table as (rows, columns).
tweet_df.shape

(100, 158)

In [75]:
# Preview the first rows of the flattened tweet table.
tweet_df.head()

Unnamed: 0,created_at,id,id_str,text,truncated,source,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,quoted_status.is_quote_status,quoted_status.retweet_count,quoted_status.favorite_count,quoted_status.favorited,quoted_status.retweeted,quoted_status.possibly_sensitive,quoted_status.lang,quoted_status.user.entities.url.urls,entities.media,extended_entities.media
0,Sat Jun 05 19:41:02 +0000 2021,1401262810873700352,1401262810873700352,#BreakingBoundaries simple must watch. Time to...,False,"<a href=""http://twitter.com/download/iphone"" r...",,,,,...,,,,,,,,,,
1,Sat Jun 05 19:26:23 +0000 2021,1401259123463921668,1401259123463921668,Ahhhh.... just wait for this year's #wildfires...,True,"<a href=""http://twitter.com/download/android"" ...",,,,,...,,,,,,,,,,
2,Sat Jun 05 19:21:00 +0000 2021,1401257766233284610,1401257766233284610,Post 1103: 3°C average increase in temperature...,True,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,,...,,,,,,,,,,
3,Sat Jun 05 19:19:41 +0000 2021,1401257437223587844,1401257437223587844,Empowering local communities 2 manage coral re...,True,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,,...,,,,,,,,,,
4,Sat Jun 05 19:13:43 +0000 2021,1401255934568054794,1401255934568054794,".@MichaelEMann: ""This won't lead to another ic...",True,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,,...,,,,,,,,,,
