In [1]:
import requests
import os
import json
import time
import pandas as pd

from datetime import datetime
from pytz import timezone


#Academic申請後に生成される"Bearer Token(BT)"
BT = '###############################'
search_url = "https://api.twitter.com/2/tweets/search/all"

#与えられたパラメータからツイート取得条件(query_params)を生成する関数
def make_parm(keyword, start_time, end_time):

    #検索条件(今回はキーワード、開始時刻、修了時刻のみ)
    query_params = {'query': keyword ,
                    'tweet.fields': 'created_at',
                    #'expansions': 'author_id',
                    'start_time': start_time,
                    'end_time': end_time,
                    #'user.fields': 'description', # profile information of author
                    'max_results':500,  #一回のqueryは５００で上限らしい
                    'next_token' : {} #次のページにいくためのparam
                   }
    return query_params


def create_headers(BT):
    headers = {"Authorization": "Bearer {}".format(BT)}
    return headers


def connect_to_endpoint(url, headers, params):
    response = requests.request("GET", search_url, headers=headers, params=params)
    #print(response.status_code)
    if response.status_code != 200:
        #raise Exception(response.status_code, response.text)
    
        print(response.status_code, response.text)
        
    return response   ######## to make use of  header information, return response instead of response.json()

def convert_timezone(normalized_data):
    normalized_data.created_at = pd.to_datetime(normalized_data.created_at, utc=True)
    normalized_data.index = pd.DatetimeIndex(normalized_data.created_at, name='created_at')
    normalized_data.index = normalized_data.index.tz_convert('Asia/Tokyo')
    
    # sort dataframe by created time
    
    normalized_data.created_at = normalized_data.index
    normalized_data = normalized_data.reset_index(drop=True)
    
    # rename index
    
    print('timezone convertion finished')
    return normalized_data

def store_checkpoint(normalized_data, keyword):
    
    # store collected data to csv in case of error occurrence
    
    savedir = './' # set your savedir here
    
    kwindex = keyword
        
    filename = kwindex + normalized_data.iloc[-1]['created_at'].strftime('%Y%m%d%H%M') + '-' + \
               normalized_data.iloc[0]['created_at'].strftime('%Y%m%d%H%M') + \
                '.csv'
        
    savepath = savedir + filename    
    
    normalized_data.to_csv(path_or_buf = savepath)
    print('successfully saved at' + savepath + ' at ' + \
              datetime.today().strftime("%Y/%m/%d-%H:%M"))
    
    


#実際にツイート取得をする関数(上の5つの関数を呼び出しながら)
def main(keyword, start_time, end_time):

    count = 0
    flag = True
    TWEET_LIMIT = 1000000
    cols = ['created_at', 'id', ' text']
    normalized_data_old = pd.DataFrame(index=[], columns=cols)#create checkpoint dataframe
    
    query_params = make_parm(keyword, start_time, end_time)#ここで取得条件が返される
    
    while flag:
        if count >= TWEET_LIMIT:
            break
    
        headers = create_headers(BT)
        time.sleep(1)
        
        response = connect_to_endpoint(search_url, headers, query_params)
        json_response = response.json()  
        
        normalized_data = pd.json_normalize(json_response['data']) 
        normalized_data_new = pd.concat([normalized_data_old, normalized_data])
        normalized_data_old = normalized_data_new
        print("total:" + str(len(normalized_data_old)) + "tweets")
    

        result_count = json_response['meta']['result_count']
        if 'next_token' in json_response['meta']:
            next_token = json_response['meta']['next_token']
            query_params['next_token'] = next_token
            count += result_count
            
            time.sleep(3)
            response = connect_to_endpoint(search_url, headers, query_params)
            time.sleep(3)
            
            
            if response.status_code == 200:
                
                continue
            
            elif response.status_code == 429:
                
                # store checkpoints when error occurs
                
                normalized_data_last = convert_timezone(normalized_data_old)
                store_checkpoint(normalized_data_last, keyword)
                
                # from https://github.com/mammalofski/Twitter-Scraper/blob/main/Scraper.py
                
                print('too many requests ... ')
                #print('the header is ', response.headers)
                throttle_end_timestamp = int(response.headers.get('x-rate-limit-reset'))
                throttle_end_time = datetime.strftime(datetime.fromtimestamp(throttle_end_timestamp), "%H:%M:%S")
                time_to_wait = int(throttle_end_timestamp - datetime.now().timestamp()) + 5
                print('lets rest for', time_to_wait, 'seconds and wake up at', throttle_end_time)
                print('sleeping ...')
                time.sleep(time_to_wait)
                response = connect_to_endpoint(search_url, headers, query_params)
                
            else:
                print(response.status_code, response.text)
                time.sleep( 15 * 60 )  # rest 15min 
                
                response = connect_to_endpoint(search_url, headers, query_params)
             
            time.sleep(3)  # rate limit = 1 request/1 sec
            
            json_response = response.json()
            normalized_data = pd.json_normalize(json_response['data'])
            normalized_data_new = pd.concat([normalized_data_old, normalized_data])
            normalized_data_old = normalized_data_new
            print("total:" + str(len(normalized_data_old)) + "tweets")

        else:
            flag = False
            print('Last page of query results')
    
    # convert UTC +0 timezone to UTC +9 timezone
    normalized_data_last = convert_timezone(normalized_data_old)
    
    # store checkpoint to csv
    
    store_checkpoint(normalized_data_last, keyword)
    
    #normalized_data_last.to_csv('AcademicAPI_tweet.csv')#csvファイルに保存

In [2]:
main(keyword="テニス", start_time="2021-02-15T00:00:00Z", end_time="2021-02-15T09:00:00Z")

total:447tweets
total:908tweets
total:1369tweets
total:1828tweets
total:2287tweets
total:2759tweets
total:3231tweets
total:3706tweets
total:4181tweets
total:4654tweets
total:5127tweets
total:5598tweets
total:6069tweets
total:6493tweets
total:6917tweets
total:7380tweets
total:7843tweets
total:7938tweets
total:8033tweets
