### Import packages & setup

In [2]:
import datetime
import tweepy
from tweepy import OAuthHandler
import json
import pandas as pd
import csv
import os
import time

In [3]:
# ACADEMIC ACCOUNT. Set connection to Tweepy.
# The bearer token is kept out of the notebook: it lives in a local module named
# my_api_keys.py and is imported from there.
from my_api_keys import bearer_token
# Module-level client used by pulltweets() below.
tweepyclient = tweepy.Client(bearer_token, wait_on_rate_limit=True)

In [63]:
# Set parameters

# Search window, passed unchanged to the search endpoint.
start_time = "2022-11-07T00:00:00Z"
end_time = "2022-11-08T00:00:00Z"
# The hashtags are grouped in parentheses because, in the v2 query language, a space
# (AND) binds tighter than OR. Without the grouping, "-is:retweet lang:en" would only
# filter the last hashtag and retweets / non-English tweets would leak in for the rest.
query = '(#trailrunning OR #running OR #run OR #trail OR #trailrunner OR #trailrun) -is:retweet lang:en'
response_perpg = 10   # tweets requested per page (sent as max_results)
num_pgs = 3           # number of pages to pull
audience_type = "trailrunning"   # label embedded in the output CSV filename

In [78]:
def pulltweets(query, start_time, end_time, response_perpg, num_pgs):
    """Pull up to ``num_pgs`` pages of tweets matching ``query``.

    Uses the module-level ``tweepyclient`` and its ``search_all_tweets`` method
    through ``tweepy.Paginator``.

    Parameters
    ----------
    query : str
        Search query, passed through unchanged.
    start_time, end_time
        Bounds of the search window, passed through unchanged.
    response_perpg : int
        Number of tweets requested per page (sent as ``max_results``).
    num_pgs : int
        Maximum number of pages (i.e. API requests) to make.

    Returns
    -------
    list
        The response objects yielded by the paginator, one per page, in order.
    """
    tweet_list = []
    func_start = time.time()

    # A response is like a "page" of tweets. `limit` stops the paginator after
    # num_pgs requests, so no extra page is fetched only to be thrown away.
    pages = tweepy.Paginator(tweepyclient.search_all_tweets,
                             query=query,
                             user_fields=['username', 'public_metrics', 'description', 'location'],
                             tweet_fields=['created_at', 'geo', 'public_metrics', 'text'],
                             expansions='author_id',
                             start_time=start_time,
                             end_time=end_time,
                             max_results=response_perpg,
                             limit=num_pgs)

    for response in pages:
        tweet_list.append(response)
        time.sleep(1)  # only 1 request per second allowed

    func_end = time.time()

    # Report what actually came back: a page can hold fewer than response_perpg
    # tweets, and the guard covers pages whose `data` is None.
    num_tweets = sum(len(response.data or []) for response in tweet_list)
    print(f'Pulled {len(tweet_list)} pages and {num_tweets} tweets')
    # Convert to minutes first, then round to two decimals.
    print('Pull time was {} minutes.'.format(round((func_end - func_start) / 60, 2)))

    return tweet_list

In [79]:
# Standalone pull using the parameters set above; keeps the raw response pages in `tweet_list`.
# Note: pullandprocess() performs its own pull, so running both cells hits the API twice.
tweet_list = pulltweets(query, start_time, end_time, response_perpg, num_pgs)

Pulled 3 pages and 30 tweets
Pull time was 0.06666666666666667 minutes.


In [80]:
def processtweets(tweet_list):
    """Flatten a list of search responses into one DataFrame row per tweet.

    Parameters
    ----------
    tweet_list : list
        Response objects as returned by ``pulltweets``. Each must expose
        ``.data`` (an iterable of tweets, or None) and ``.includes`` (a mapping
        that may contain a ``'users'`` list).

    Returns
    -------
    pandas.DataFrame
        One row per tweet, combining tweet fields with the author's profile
        fields. The columns are always present, even when no tweets were found.
        Author fields are None when the author is not in the responses' users.
    """
    # Fixed column order so an empty pull still produces a frame (and CSV header)
    # with the expected schema.
    columns = ['user_id', 'username', 'follower_count', 'total_tweets',
               'description', 'location', 'tweet_id', 'text', 'created_at',
               'retweets_count', 'replies_count', 'likes_count', 'quote_count']

    result = []
    user_dict = {}

    # Loop through each response object
    for response in tweet_list:
        # Index the users by id, keeping only the info we want. A page with no
        # matches may carry no 'users' expansion at all.
        includes = response.includes or {}
        for user in includes.get('users', []):
            user_dict[user.id] = {'username': user.username,
                                  'followers': user.public_metrics['followers_count'],
                                  'tweets': user.public_metrics['tweet_count'],
                                  'description': user.description,
                                  'location': user.location
                                 }

        # `data` is None (not an empty list) when a page has no tweets.
        for tweet in response.data or []:
            # Find the author's information; fall back to blanks instead of
            # raising KeyError if the author was not included.
            author_info = user_dict.get(tweet.author_id, {})
            # Put all of the information we want to keep in a single dictionary for each tweet
            result.append({'user_id': tweet.author_id,
                           'username': author_info.get('username'),
                           'follower_count': author_info.get('followers'),
                           'total_tweets': author_info.get('tweets'),
                           'description': author_info.get('description'),
                           'location': author_info.get('location'),
                           'tweet_id': tweet.id,
                           'text': tweet.text,
                           'created_at': tweet.created_at,
                           'retweets_count': tweet.public_metrics['retweet_count'],
                           'replies_count': tweet.public_metrics['reply_count'],
                           'likes_count': tweet.public_metrics['like_count'],
                           'quote_count': tweet.public_metrics['quote_count']
                          })

    # Change this list of dictionaries into a dataframe
    df = pd.DataFrame(result, columns=columns)

    # Report how many tweets were processed
    print('Processed {} tweets'.format(len(df)))

    return df

In [86]:
def pullandprocess(query, start_time, end_time, response_perpg, num_pgs, audience_type):
    """Pull tweets, flatten them to a DataFrame, and save it as a timestamped CSV.

    The file is written to ``<current working directory>/data/`` and named
    ``<YYYYMMDD_HHMMSS>_tweets_<audience_type>.csv``.

    Parameters
    ----------
    query, start_time, end_time, response_perpg, num_pgs
        Passed straight through to ``pulltweets``.
    audience_type : str
        Label embedded in the output filename.

    Returns
    -------
    None
    """
    tweet_list = pulltweets(query, start_time, end_time, response_perpg, num_pgs)

    tweetsdf = processtweets(tweet_list)

    # Use datetime.now(), not date.today(): a date has no time-of-day, so the
    # %H%M%S part would always be 000000 and same-day runs would overwrite each other.
    to_csv_timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

    # Output directory under the working path; create it on first use so
    # to_csv does not fail on a fresh checkout.
    data_dir = os.path.join(os.getcwd(), 'data')
    os.makedirs(data_dir, exist_ok=True)

    # Filename carries the timestamp and the custom audience type
    filename = os.path.join(data_dir, to_csv_timestamp + '_tweets_' + audience_type + '.csv')

    # Store dataframe in csv with creation date timestamp
    tweetsdf.to_csv(filename, index=False)

    return

In [87]:
# Run the full pipeline with the parameters set above: pull, flatten to a DataFrame,
# and write the CSV into the data/ folder under the current working directory.
pullandprocess(query, start_time, end_time, response_perpg, num_pgs, audience_type)

Pulled 3 pages and 30 tweets
Pull time was 0.06666666666666667 minutes.
Processed 30 tweets


## Ignore for now: testing adding followers & following to df

In [None]:
def get_followers_id(person):
    """Collect the follower IDs of the account with screen name ``person``.

    NOTE(review): relies on a module-level ``api`` object that is not created in
    the visible cells of this notebook — presumably a ``tweepy.API`` instance;
    define it before calling this function.

    Parameters
    ----------
    person : str
        Screen name of the account whose followers are listed.

    Returns
    -------
    list
        Every item yielded by the follower-ID cursor, in order.
    """
    status = tweepy.Cursor(api.get_follower_ids, screen_name=person, tweet_mode="extended").items()
    # Iterate the cursor until it is exhausted rather than calling next() a fixed
    # number of times based on the profile's follower count: if the cursor yields
    # fewer IDs than that count, next() would raise StopIteration.
    followersid = [follower for follower in status]
    return followersid

In [None]:
# Experimental call. NOTE(review): get_followers_id() uses a global `api` object that is
# not defined in the visible cells, so this raises NameError unless `api` is created first.
get_followers_id("37chandler")
