https://towardsdatascience.com/an-extensive-guide-to-collecting-tweets-from-twitter-api-v2-for-academic-research-using-python-3-518fcb71df2a

In [1]:
# For sending GET requests from the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For dealing with json responses we receive from the API
import json
# For displaying the data after
import pandas as pd
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
import datetime
import dateutil.parser
import unicodedata
#To add wait time between requests
import time

In [2]:
os.environ['TOKEN'] = 'AAAAAAAAAAAAAAAAAAAAAKVtZgEAAAAAl9ydWGRRbzGoJ%2BtX9BQX2nqJOnI%3DDEZ9oWPKslhrVmlTMxyKbiWybhABOyAjLgGGhdZwAK0N9xrm8W'

In [3]:
def auth():
    return os.getenv('TOKEN')

In [4]:
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

In [17]:
def create_url(keyword, start_date, end_date, max_results = 10):
    
    search_url = "https://api.twitter.com/2/tweets/search/recent" #Change to the endpoint you want to collect data from

    #change params based on the endpoint you are using
    query_params = {'query': keyword,
                    'max_results': max_results,
                    #'expansions': , #'author_id,in_reply_to_user_id,geo.place_id',
                    'tweet.fields': 'text,public_metrics,referenced_tweets,geo,created_at', #'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                    'user.fields': 'name,username,created_at,description,public_metrics,verified', #'id,name,username,created_at,description,public_metrics,verified',
                    'place.fields': 'full_name,country,country_code,geo,name,place_type', #'full_name,id,country,country_code,geo,name,place_type',
                    'next_token': {}}
    return (search_url, query_params)

In [18]:
def connect_to_endpoint(url, headers, params, next_token = None):
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [19]:
bearer_token = auth()
headers = create_headers(bearer_token)
keyword = "UCLA"
start_time = "2021-03-01T00:00:00.000Z"
end_time = "2021-03-31T00:00:00.000Z"
max_results = 15

In [20]:
url = create_url(keyword, start_time,end_time, max_results)
print(url)
json_response = connect_to_endpoint(url[0], headers, url[1])

('https://api.twitter.com/2/tweets/search/recent', {'query': 'UCLA', 'max_results': 15, 'tweet.fields': 'text,public_metrics,referenced_tweets,geo,created_at', 'user.fields': 'name,username,created_at,description,public_metrics,verified', 'place.fields': 'full_name,country,country_code,geo,name,place_type', 'next_token': {}})
Endpoint Response Code: 200


In [21]:
print(json.dumps(json_response, indent=4, sort_keys=True))

{
    "data": [
        {
            "created_at": "2022-03-02T06:46:50.000Z",
            "id": "1498912699446554624",
            "public_metrics": {
                "like_count": 0,
                "quote_count": 0,
                "reply_count": 0,
                "retweet_count": 0
            },
            "referenced_tweets": [
                {
                    "id": "1498891622699847684",
                    "type": "replied_to"
                }
            ],
            "text": "@zagafan2 @tucsontwitchy @glizzert00 @SethDavisHoops The win at ucla is impressive for sure. By 20 is a whoopin"
        },
        {
            "created_at": "2022-03-02T06:45:57.000Z",
            "id": "1498912474472583174",
            "public_metrics": {
                "like_count": 0,
                "quote_count": 0,
                "reply_count": 0,
                "retweet_count": 74
            },
            "referenced_tweets": [
                {
                    "id": "149882

In [22]:
# Create file
csvFile = open("data.csv", "a", newline="", encoding='utf-8')
csvWriter = csv.writer(csvFile)

#Create headers for the data you want to save, in this example, we only want save these columns in our dataset
#csvWriter.writerow(['author id', 'created_at', 'geo', 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet'])
csvFile.close()

In [23]:
def append_to_csv(json_response, fileName):

    #A counter variable
    counter = 0

    #Open OR create the target CSV file
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)

    #Loop through each tweet
    for tweet in json_response['data']:
        print(tweet)
    # When done, close the CSV file
    csvFile.close()

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)

In [None]:

        #'text,public_metrics,referenced_tweets,geo,created_at', #'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
         #           'user.fields': 'name,username,created_at,description,public_metrics,verified', #'id,name,username,created_at,description,public_metrics,verified',
          #          'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
        # We will create a variable for each since some of the keys might not exist for some tweets
        # So we will account for that

        # 1. Author ID
        #author_id = tweet['author_id']

        # 2. Time created
        #created_at = dateutil.parser.parse(tweet['created_at'])

        # 3. Geolocation
        if ('geo' in tweet):   
            geo = tweet['geo']['place_id']
        else:
            geo = " "

        # 4. Tweet ID
        tweet_id = tweet['id']

        # 5. Language
        #lang = tweet['lang']

        # 6. Tweet metrics
        retweet_count = tweet['public_metrics']['retweet_count']
        reply_count = tweet['public_metrics']['reply_count']
        like_count = tweet['public_metrics']['like_count']
        quote_count = tweet['public_metrics']['quote_count']

        # 7. source
        #source = tweet['source']

        # 8. Tweet text
        text = tweet['text']
        
        # Assemble all data in a list
        res = [ geo, tweet_id,  like_count, quote_count, reply_count, retweet_count, text]
        
        # Append the result to the CSV file
        csvWriter.writerow(res)
        counter += 1


In [26]:
print(json_response)

{'data': [{'referenced_tweets': [{'type': 'replied_to', 'id': '1498891622699847684'}], 'created_at': '2022-03-02T06:46:50.000Z', 'id': '1498912699446554624', 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'text': '@zagafan2 @tucsontwitchy @glizzert00 @SethDavisHoops The win at ucla is impressive for sure. By 20 is a whoopin'}, {'referenced_tweets': [{'type': 'retweeted', 'id': '1498823653022126084'}], 'created_at': '2022-03-02T06:45:57.000Z', 'id': '1498912474472583174', 'public_metrics': {'retweet_count': 74, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'text': 'RT @ComplexSneakers: Air Jordan 6 “UCLA” PE 👀 [🎥: @ginamconti] https://t.co/pnfSMuz04U'}, {'created_at': '2022-03-02T06:45:41.000Z', 'id': '1498912408613826562', 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'text': 'Go Toトラベル利用者の方が新型コロナウイルス感染症を示唆する症状をより多く経験していることが明らかに⇒「不正確」\nSFSSが東京大学／UCLAの疫学調査研究をファクトチェック！\n#ファクトチェック\nhttps:/

In [24]:
append_to_csv(json_response, "data.csv")

{'referenced_tweets': [{'type': 'replied_to', 'id': '1498891622699847684'}], 'created_at': '2022-03-02T06:46:50.000Z', 'id': '1498912699446554624', 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'text': '@zagafan2 @tucsontwitchy @glizzert00 @SethDavisHoops The win at ucla is impressive for sure. By 20 is a whoopin'}
{'referenced_tweets': [{'type': 'retweeted', 'id': '1498823653022126084'}], 'created_at': '2022-03-02T06:45:57.000Z', 'id': '1498912474472583174', 'public_metrics': {'retweet_count': 74, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'text': 'RT @ComplexSneakers: Air Jordan 6 “UCLA” PE 👀 [🎥: @ginamconti] https://t.co/pnfSMuz04U'}
{'created_at': '2022-03-02T06:45:41.000Z', 'id': '1498912408613826562', 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'text': 'Go Toトラベル利用者の方が新型コロナウイルス感染症を示唆する症状をより多く経験していることが明らかに⇒「不正確」\nSFSSが東京大学／UCLAの疫学調査研究をファクトチェック！\n#ファクトチェック\nhttps://t.co/a8Coon

In [25]:
import pandas as pd
df = pd.read_csv("data.csv")
df

Unnamed: 0,author id,created_at,geo,id,lang,like_count,quote_count,reply_count,retweet_count,source,tweet
0,author id,created_at,geo,id,lang,like_count,quote_count,reply_count,retweet_count,source,tweet
1,,1498910344315424768,0,0,0,84,RT @UCLAFootball: Make that 3️⃣7️⃣ unanswered ...,,,,
2,,1498910226753462272,0,0,0,72,RT @ComplexSneakers: Air Jordan 6 “UCLA” PE 👀 ...,,,,
3,,1498909950726320133,0,0,0,16,RT @schadenfraade: The Bel Air HOA is treating...,,,,
4,,1498909543715254279,0,0,1,0,"@pgz26 If it don't apply, let it fly...plus Lo...",,,,
5,,1498908653272776704,1,0,0,0,The NCAA tournament field better hope they cat...,,,,
6,,1498908637397336067,0,0,0,0,@BookingwWylie Fucking UCLA hit too. Texas Tec...,,,,
7,,1498908561954336769,1,0,0,0,@RyperiousPeople @equitybruin Can’t happen any...,,,,
8,,1498908520338526210,0,0,0,0,@Dawgman6_7 @JaedenJMoore @adamgorney @Mason_H...,,,,
9,,1498908170835542017,0,0,0,0,Hey @UCLA— your parking enforcement sucks.,,,,


In [13]:
pip install searchtweets

Collecting searchtweets
  Downloading searchtweets-1.7.6-py3-none-any.whl (29 kB)





Collecting tweet-parser
  Downloading tweet_parser-1.13.2-py3-none-any.whl (34 kB)
Installing collected packages: tweet-parser, searchtweets
Successfully installed searchtweets-1.7.6 tweet-parser-1.13.2


In [2]:
!curl -H "Authorization: rGIbhie1oukIY77d6iDSP23Nkk5YjkDC4tbLc14jFdAKNJsRK0" https://api.twitter.com/2/tweets/search/recent?query=from:ucla

{
  "title": "Unauthorized",
  "type": "about:blank",
  "status": 401,
  "detail": "Unauthorized"
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100    99  100    99    0     0    504      0 --:--:-- --:--:-- --:--:--   507


In [3]:
!curl -H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAAOX5ZQEAAAAA%2FO5lTq4JDWodZcwvVG15NS69ODs%3DMfOYyR9LzcEImALAAqcTDXhcsyOYESjzrSYJoJS7GwwTmhejb0" https://api.twitter.com/2/tweets/search/recent?query=ucla&tweet.fields=created_at

{"data":[{"id":"1498888865267994624","text":"RT @AfricanArchives: John Huggins and Bunchy Carter founded the Southern California chapter of the Black Panthers.\n\nThey were assassinatedâ€¦"},{"id":"1498888727510372352","text":"@JusCallMeLani My Texas Tech futures bet would 100% agree with you \uD83D\uDE02 Texas Tech is really similar to Auburn - their identity is defense, and if the shooting is there too then theyâ€™re unstoppable. UCLA is meh to me, theyâ€™ve been having trouble staying healthy, but Cronin knows what heâ€™s doing."},{"id":"1498888571817644040","text":"@DonMateo72 But USC is bad because they donâ€™t fucking try. UCLA is bad because they canâ€™t get it right."},{"id":"1498888346243850240","text":"RT @BruceFeldmanCFB: SOURCE: UCLA staffer Ramsen Golpashin, who been an analyst with the Bruins OL and DL, is expected to join the Packersâ€¦"},{"id":"1498888280846127107","text":"RT @UCLABaseball: B1 | Cody Schrier hits this one WAY out of here!\n\nIt's his first career home r

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  2548  100  2548    0     0  10167      0 --:--:-- --:--:-- --:--:-- 10192
'tweet.fields' is not recognized as an internal or external command,
operable program or batch file.


In [18]:
!curl -H "Authorization: Bearer AAAAAAAAAAAAAAAAAAAAAOX5ZQEAAAAA%2FO5lTq4JDWodZcwvVG15NS69ODs%3DMfOYyR9LzcEImALAAqcTDXhcsyOYESjzrSYJoJS7GwwTmhejb0" https://api.twitter.com/2/tweets/search/recent?query=cat%20has%3Amedia%20-grumpy&tweet.fields=created_at&max_results=100

{"data":[{"id":"1496975162448789510","text":"RT @JanRomes: Elaina, Tawny, Steph, and Grace have added Stony, the adorable Siberian Husky and Lula, the black cat to the mayhem. Fun andâ€¦"},{"id":"1496975161899237387","text":"RT @changkyuniesite: our cat stretching \uD83D\uDC08â€�â¬›â™¡ #ì•„ì�´ì—  #IM \n https://t.co/ZQ1gQb3Sp5"},{"id":"1496975160955527169","text":"RT @Simp_Cat_NFT: â™¦ï¸� #NFTGiveaway â™¦ï¸�\nGive away 1 cool  #NFTs~ 2 DAYS\n (Worth 1 ETH / $2500 \uD83D\uDCB8 )\n\nTo enter: \nâœ…Follow me @Simp_Cat_NFT\nâœ…RT &amp;â€¦"},{"id":"1496975160540286977","text":"cat time https://t.co/WrGYU6LIi1"},{"id":"1496975160359985162","text":"RT @xmfers: xmfer pets mint ends in 4  hours. \nWe are giving away 2 pets (winners choose dog or a cat just like in our mint)\nPets are fullyâ€¦"},{"id":"1496975159558868994","text":"RT @artofotaku: Ukrayna'da tanÄ±ÅŸtÄ±ÄŸÄ±m ve o zamandan beri benden hoÅŸlanan ukraynalÄ± arkadaÅŸÄ±m yarÄ±n savaÅŸa Ã§aÄŸrÄ±lÄ±nca Ã§at pat ingilizcesiyleâ€¦"},{"id":

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  1984  100  1984    0     0   4660      0 --:--:-- --:--:-- --:--:--  4679
'tweet.fields' is not recognized as an internal or external command,
operable program or batch file.
'max_results' is not recognized as an internal or external command,
operable program or batch file.





In [8]:
import requests
import os
import json

# To set your enviornment variables in your terminal run the following line:
# export 'BEARER_TOKEN'='<your_bearer_token>'
bearer_token = os.environ.get("AAAAAAAAAAAAAAAAAAAAAOX5ZQEAAAAA%2FO5lTq4JDWodZcwvVG15NS69ODs%3DMfOYyR9LzcEImALAAqcTDXhcsyOYESjzrSYJoJS7GwwTmhejb0")


def create_url():
    tweet_fields = "tweet.fields=lang,author_id"
    # Tweet fields are adjustable.
    # Options include:
    # attachments, author_id, context_annotations,
    # conversation_id, created_at, entities, geo, id,
    # in_reply_to_user_id, lang, non_public_metrics, organic_metrics,
    # possibly_sensitive, promoted_metrics, public_metrics, referenced_tweets,
    # source, text, and withheld
    ids = "ids=1278747501642657792,1255542774432063488"
    # You can adjust ids to include a single Tweets.
    # Or you can add to up to 100 comma-separated IDs
    url = "https://api.twitter.com/2/tweets?{}&{}".format(ids, tweet_fields)
    return url


def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """

    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2TweetLookupPython"
    return r


def connect_to_endpoint(url):
    response = requests.request("GET", url, auth=bearer_oauth)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )
    return response.json()


def main():
    url = create_url()
    json_response = connect_to_endpoint(url)
    print(json.dumps(json_response, indent=4, sort_keys=True))

print(bearer_token)

None


In [10]:
import urllib
import json                 # Used to load data into JSON format
from pprint import pprint   # pretty-print

url = "https://api.twitter.com/2/tweets/search/recent?query=from:ucla"
response = urllib.request.urlopen(url)
print(response)

HTTPError: HTTP Error 401: Unauthorized