### Import packages & api setup

In [1]:
import datetime
import tweepy
from tweepy import OAuthHandler
import json
import pandas as pd
import csv
import os
import time

In [2]:
# Set connection to Tweepy with academic account. I've put my API keys in a .py file called API_keys.py
from my_api_keys import bearer_token
tweepyclient = tweepy.Client(bearer_token, wait_on_rate_limit=True)

### Set custom parameters

In [3]:
start_time = "2022-10-01T00:00:00Z"
end_time = "2022-11-09T00:00:00Z"
query = '#climate OR #climatechange OR #sustainable OR #sustainability OR #climateaction OR #environment -is:retweet lang:en'
response_perpg = 500
num_pgs = 20
audience_type = "climate"

### Functions

In [4]:
def pulltweets(query, start_time, end_time, response_perpg, num_pgs):
    '''Pulls tweets from Twitter based on a query, limited to a defined timeframe and number of tweets.
    Places tweets in a list for further processing'''
    
    tweet_list = []
    func_start = time.time()
    
    # response is a single "page" of tweets. Num of tweets per "page" is set by "max_results"
    for response in tweepy.Paginator(tweepyclient.search_all_tweets, 
                                     query = query,
                                     user_fields = ['username', 'public_metrics', 'description', 'location'],
                                     tweet_fields = ['created_at', 'geo', 'public_metrics', 'text'],
                                     expansions = 'author_id',
                                     start_time = start_time,
                                     end_time = end_time,
                                     max_results=response_perpg):  #pull "response_perpg" tweets per response
        
        # Flag to quit when we need to, the +1 starts the count at 1 rather than 0
        if len(tweet_list)+1 > num_pgs :         
            break
        else:
            time.sleep(1)  # only 1 request per second allowed
            tweet_list.append(response)  # each response is a "page" of response_perpg tweets

    func_end = time.time()
    print(f'Pulled {len(tweet_list)} pages and {len(tweet_list)*response_perpg} tweets')
    print('Pull time was {} minutes.'.format(round(func_end - func_start)/60, 2))    
    
    return(tweet_list)

In [5]:
def processtweets(tweet_list):
    '''Processes tweets into a pandas dataframe'''
    
    result = []
    user_dict = {}
    
    # Loop through each response object
    for response in tweet_list:
        # Take all of the users, and put them into a dictionary of dictionaries with the info we want to keep
        for user in response.includes['users']:
            user_dict[user.id] = {'username': user.username, 
                                  'followers': user.public_metrics['followers_count'],
                                  'tweets': user.public_metrics['tweet_count'],
                                  'description': user.description,
                                  'location': user.location
                                 }
        for tweet in response.data:
            # For each tweet, find the author's information
            author_info = user_dict[tweet.author_id]
            # Put all of the information we want to keep in a single dictionary for each tweet
            result.append({'user_id': tweet.author_id, 
                           'username': author_info['username'],
                           'follower_count': author_info['followers'],
                           'total_tweets': author_info['tweets'],
                           'description': author_info['description'],
                           'location': author_info['location'],
                           'tweet_id' : tweet.id,
                           'text': tweet.text,
                           'created_at': tweet.created_at,
                           'retweets_count': tweet.public_metrics['retweet_count'],
                           'replies_count': tweet.public_metrics['reply_count'],
                           'likes_count': tweet.public_metrics['like_count'],
                           'quote_count': tweet.public_metrics['quote_count']
                          })

    # Change this list of dictionaries into a dataframe
    df = pd.DataFrame(result)
    
    # Let's see how long it took to grab all tweets and how many were pulled
    print('Processed {} tweets'.format(len(df)))

    return(df)

In [6]:
def pullandprocess(query, start_time, end_time, response_perpg, num_pgs, audience_type):
    '''Combines the pull and the process functions into a single function that pulls, processes, and writes
    dataframe to a csv file stored locally'''
    
    tweet_list = pulltweets(query, start_time, end_time, response_perpg, num_pgs)
    
    tweetsdf = processtweets(tweet_list)
    
    # Obtain timestamp in a readable format
    to_csv_timestamp = datetime.date.today().strftime('%Y%m%d_%H%M%S')
    
    # Define working path
    path = os.getcwd()
    
    # define filename location, timestamp, and custom audience type
    filename = path + '/data/' + to_csv_timestamp + '_tweets_' + audience_type + '.csv'
    
    # Store dataframe in csv with creation date timestamp
    tweetsdf.to_csv(filename, index = False)
    
    return

In [7]:
pullandprocess(query, start_time, end_time, response_perpg, num_pgs, audience_type)

Pulled 20 pages and 10000 tweets
Pull time was 0.7666666666666667 minutes.
Processed 9983 tweets
