In [1]:
import numpy as np
import pandas as pd
import tweepy
import requests
import time
from datetime import date
from datetime import timedelta

In [2]:
#import key json
file_name = "../keys.json"
with open(file_name, "r") as key_file:
    keys = json.load(key_file)

In [3]:
# asign keys
bearer_token = keys['bearer_token']
consumer_key = keys['consumer_key']
consumer_secret = keys['consumer_secret']
access_token = keys['access_token']
token_secret = keys['token_secret']

In [4]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, token_secret)
api = tweepy.API(auth)

### Get dates

In [5]:
# returns a dataframe with dates, having random hours to be used in retrieving tweets

def get_dates_df():
    
    ##### get number of days to loop through #####
    def get_days_delta():
        delta = date.today()-date(2016, 1, 1)
        return delta.days

    ##### generate creates a list of dates from today to the start day in get_days_delta() #####
    dates_to_check = [(date.today() - timedelta(i)).isoformat() for i in range(get_days_delta())] 

    ##### generate random times and concat to each date  #####
    random_time_dates = []
    for i in dates_to_check:
        h = '00' #random.randint(0, 24)
        m = '00' #random.randint(0, 60)
        s = '00' #random.randint(0, 60)

        i =  f'{i}T{h}:{m}:{s}.000Z'
        random_time_dates.append(i)
        
    return random_time_dates

In [6]:

tweet_dates = get_dates_df()[0:2]
tweet_dates

['2021-11-24T00:00:00.000Z', '2021-11-23T00:00:00.000Z']

### Authenticate and define func to connect to api

In [7]:
search_url = "https://api.twitter.com/2/tweets/search/all"
query_params = {'query': "us infrastructure","end_time": "2021-11-19T14:44:18.000Z", "max_results":10, "tweet.fields":"public_metrics"}

def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """

    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "CryptoTrading699"
    return r



def connect_to_endpoint(url, params):
    response = requests.request("GET", search_url, auth=bearer_oauth, params=params)
#     print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()



In [8]:

def get_tweets(api_response, max_results=51):
    list_tweets = []
    
    for i in range(0,max_results):
        text = api_response['data'][i]['text']
        list_tweets.append(text)
    return list_tweets 


def get_tweets_ids(api_response, max_results=51):
    list_tweets_ids = []
    
    for i in range(0,max_results):
        ids = api_response['data'][i]['id']
        list_tweets_ids.append(ids)
    return list_tweets_ids


def get_dates(tweet_date, max_results=51):
    list_dates = []

    for i in range(0,max_results):
        list_dates.append(tweet_date)
    return list_dates


def get_topic(topic, max_results=51):
    list_topics = [topic for i in range(0,max_results)]
    return list_topics


### pull data from api and asign to dict

In [9]:
max_results = 51
topics = ['US infrastructure', 'Federal Reserve', 'US economy', 'US company, stock', 'US, pension, retirement', 'US inequality', 'living cost', 'US inflation', 'US recession','US banking']


tweeter_data = {
    'tweet':[],
    'tweet_date':[],
    'topic':[],
    'tweet_id':[],
}


# loop through dates
for tweet_date in tweet_dates:
    # loop through topics
    for topic in topics:
        query_params = {'query':topic ,"end_time": tweet_date, "max_results":max_results, "tweet.fields":"public_metrics"}
        json_response = connect_to_endpoint(search_url, query_params)
        available_tweets= len(json_response['data'])-1 # get number of tweets returned by the request if 

        tweeter_data['tweet'] += get_tweets(json_response, max_results=available_tweets)
        tweeter_data['tweet_id'] += get_tweets_ids(json_response, max_results=available_tweets)
        tweeter_data['tweet_date'] += get_dates(tweet_date, max_results=available_tweets)
        tweeter_data['topic'] += get_topic(topic, max_results=available_tweets)

        time.sleep(5)


In [10]:
# query_params = {'query':topic ,"end_time": tweet_date, "max_results":20, "tweet.fields":"public_metrics"}
# json_response = connect_to_endpoint(search_url, query_params)
# json_response['data']

In [11]:
# df = pd.DataFrame(tweeter_data)

In [15]:
df =pd.DataFrame(tweeter_data)
len(df['tweet_id'].unique())

936

In [16]:
df.to_csv('tweets_subset.csv',index=False)