In [13]:
import requests
import os
import json
import pandas as pd
import csv
import datetime
import dateutil.parser
import unicodedata
import time

In [14]:
os.environ["TOKEN"] = "BEARER_TOKEN"

def auth(): 
    return os.getenv("TOKEN")

In [15]:
def create_headers(bearer_token): 
    headers = {"Authorization": "Bearer {}".format("BEARER_TOKEN")}
    return headers

In [16]:
def create_url(keyword, start_date, end_date, max_results = 10):
    
    search_url = "https://api.twitter.com/2/tweets/search/all" #Change to the endpoint you want to collect data from

    #change params based on the endpoint you are using
    query_params = {'query': keyword,
                    'start_time': start_date,
                    'end_time': end_date,
                    'max_results': max_results,
                    'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
                    'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                    'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                    'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                    'next_token': {}}
    return (search_url, query_params)

In [17]:
def connect_to_endpoint(url, headers, params, next_token = None):
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [20]:
def append_to_csv(json_response, fileName):

    #A counter variable
    counter = 0

    #Open OR create the target CSV file
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)

    #Loop through each tweet
    for tweet in json_response['data']:
        
        # We will create a variable for each since some of the keys might not exist for some tweets
        # So we will account for that

        # 1. Author ID
        author_id = tweet['author_id']

        # 2. Time created
        created_at = dateutil.parser.parse(tweet['created_at'])

        # 3. Geolocation
        if ('geo' in tweet):   
            geo = tweet['geo']['place_id']
        else:
            geo = " "

        # 4. Tweet ID
        tweet_id = tweet['id']

        # 5. Language
        lang = tweet['lang']

        # 6. Tweet metrics
        retweet_count = tweet['public_metrics']['retweet_count']
        reply_count = tweet['public_metrics']['reply_count']
        like_count = tweet['public_metrics']['like_count']
        quote_count = tweet['public_metrics']['quote_count']

        # 7. source
        source = tweet['source']

        # 8. Tweet text
        text = tweet['text']
        
        # Assemble all data in a list
        res = [author_id, created_at, geo, tweet_id, lang, like_count, quote_count, reply_count, retweet_count, source, text]
        
        # Append the result to the CSV file
        csvWriter.writerow(res)
        counter += 1

    # When done, close the CSV file
    csvFile.close()

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter) 

In [33]:
import datetime
from dateutil.parser import parse
from dateutil.rrule import rrule, DAILY, MO, TU, WE, TH, FR

start = datetime.datetime(2019,10,28);
end = datetime.datetime(2022,10,27);

daysWithoutWeekend = rrule(
  DAILY,
  byweekday=(MO,TU,WE,TH,FR),
  dtstart=start,
  until=end
)

start_list = []
end_list = []

for entry in daysWithoutWeekend:
    start_list.append(entry.strftime("%Y-%m-%d") + "T00:01:00.000Z")
    end_list.append(entry.strftime("%Y-%m-%d") + "T23:59:00.000Z")

for start, end in zip(start_list, end_list): 
    print(start)
    print(end)
    

2020-11-16T00:01:00.000Z
2020-11-16T23:59:00.000Z
2020-11-17T00:01:00.000Z
2020-11-17T23:59:00.000Z
2020-11-18T00:01:00.000Z
2020-11-18T23:59:00.000Z
2020-11-19T00:01:00.000Z
2020-11-19T23:59:00.000Z
2020-11-20T00:01:00.000Z
2020-11-20T23:59:00.000Z
2020-11-23T00:01:00.000Z
2020-11-23T23:59:00.000Z
2020-11-24T00:01:00.000Z
2020-11-24T23:59:00.000Z
2020-11-25T00:01:00.000Z
2020-11-25T23:59:00.000Z
2020-11-26T00:01:00.000Z
2020-11-26T23:59:00.000Z
2020-11-27T00:01:00.000Z
2020-11-27T23:59:00.000Z
2020-11-30T00:01:00.000Z
2020-11-30T23:59:00.000Z
2020-12-01T00:01:00.000Z
2020-12-01T23:59:00.000Z
2020-12-02T00:01:00.000Z
2020-12-02T23:59:00.000Z
2020-12-03T00:01:00.000Z
2020-12-03T23:59:00.000Z
2020-12-04T00:01:00.000Z
2020-12-04T23:59:00.000Z
2020-12-07T00:01:00.000Z
2020-12-07T23:59:00.000Z
2020-12-08T00:01:00.000Z
2020-12-08T23:59:00.000Z
2020-12-09T00:01:00.000Z
2020-12-09T23:59:00.000Z
2020-12-10T00:01:00.000Z
2020-12-10T23:59:00.000Z
2020-12-11T00:01:00.000Z
2020-12-11T23:59:00.000Z


In [34]:
#Inputs for tweets
bearer_token = auth()
headers = create_headers(bearer_token)
                
max_results = 500


def get_tweets(bearer_token, headers, keyword, start_list, end_list, max_results, csv_name):
    
    #Total number of tweets we collected from the loop
    total_tweets = 0
    # Create file
    csvFile = open(csv_name, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)

    #Create headers for the data you want to save, in this example, we only want save these columns in our dataset
    csvWriter.writerow(['author id', 'created_at', 'geo', 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet'])
    csvFile.close()

    for i in range(0,len(start_list)):

        # Inputs
        count = 0 # Counting tweets per time period
        max_count = 100 # Max tweets per time period
        flag = True
        next_token = None

        # Check if flag is true
        while flag:
            # Check if max_count reached
            if count >= max_count:
                break
            print("-------------------")
            print("Token: ", next_token)
            url = create_url(keyword, start_list[i],end_list[i], max_results)
            json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
            result_count = json_response['meta']['result_count']

            if 'next_token' in json_response['meta']:
                # Save the token to use for next call
                next_token = json_response['meta']['next_token']
                print("Next Token: ", next_token)
                if result_count is not None and result_count > 0 and next_token is not None:
                    print("Start Date: ", start_list[i])
                    append_to_csv(json_response, csv_name)
                    count += result_count
                    total_tweets += result_count
                    print("Total # of Tweets added: ", total_tweets)
                    print("-------------------")
                    time.sleep(5)                
            # If no next token exists
            else:
                if result_count is not None and result_count > 0:
                    print("-------------------")
                    print("Start Date: ", start_list[i])
                    append_to_csv(json_response, csv_name)
                    count += result_count
                    total_tweets += result_count
                    print("Total # of Tweets added: ", total_tweets)
                    print("-------------------")
                    time.sleep(5)

                #Since this is the final request, turn flag to false to move to the next time period.
                flag = False
                next_token = None
            time.sleep(5)
    print("Total number of results: ", total_tweets)

In [None]:
get_tweets(bearer_token, headers, "AAPL", start_list, end_list, max_results, "aapl_tweets.csv")

In [None]:
get_tweets(bearer_token, headers, "BTC", start_list, end_list, max_results, "btc_tweets.csv")

In [None]:
get_tweets(bearer_token, headers, "JNJ", start_list, end_list, max_results, "jnj_teets.csv")

In [None]:
get_tweets(bearer_token, headers, "MSFT", start_list, end_list, max_results, "msft_tweets.csv")

In [None]:
get_tweets(bearer_token, headers, "NFLX", start_list, end_list, max_results, "nflx_tweets.csv")

In [None]:
get_tweets(bearer_token, headers, "PFE", start_list, end_list, max_results, "pfe_tweets.csv")

In [None]:
get_tweets(bearer_token, headers, "TSLA", start_list, end_list, max_results, "tsla_tweets.csv")

In [None]:
get_tweets(bearer_token, headers, "TWR", start_list, end_list, max_results, "twr_tweets.csv")