In [27]:
import requests
import os
import json
import time
import pandas as pd
import numpy as np

from datetime import datetime
from pytz import timezone


#"Bearer Token (BT)" issued once the Academic Research application has been approved.
#The token is read from the TWITTER_BEARER_TOKEN environment variable when it is set,
#so the real secret never has to be written into the notebook; otherwise the
#placeholder below is used and must be replaced by hand.
BT = os.environ.get('TWITTER_BEARER_TOKEN', '###############################')


#Endpoint used for every search request in this notebook
search_url = "https://api.twitter.com/2/tweets/search/all"

#Build the tweet search conditions (query_params) from the given parameters
def make_parm(keyword, start_time, end_time):
    """Return the query-parameter dict for one search request.

    Parameters
    ----------
    keyword : value sent as the ``query`` parameter.
    start_time, end_time : values sent as ``start_time`` / ``end_time``.

    Returns
    -------
    dict
        Parameters for the search request. ``next_token`` starts as ``None``
        (requests leaves ``None``-valued parameters out of the query string);
        the caller overwrites it with the token of the next page.
    """

    #Search conditions (here only the keyword, start time and end time vary)
    query_params = {'query': keyword,
                    'tweet.fields': 'created_at',
                    'expansions': 'author_id,geo.place_id',
                    'start_time': start_time,
                    'end_time': end_time,
                    'user.fields': 'created_at,description,location,username',
                    'max_results': 500,  #500 is reportedly the upper limit for a single query
                    'next_token': None   #pagination token; None means "first page"
                   }
    return query_params


def create_headers(BT):
    """Return the request headers that carry ``BT`` as a bearer token."""
    authorization_value = f"Bearer {BT}"
    return {"Authorization": authorization_value}


def connect_to_endpoint(url, headers, params):
    """Send a GET request to ``url`` and return the raw response.

    Parameters
    ----------
    url : endpoint to query.
    headers : HTTP headers to send with the request.
    params : query parameters to send with the request.

    Returns
    -------
    The response object itself (not its decoded JSON), so that callers can
    inspect the status code and the response headers as well as the body.
    A non-200 status is printed but not raised.
    """
    # Use the url argument rather than the module-level search_url
    response = requests.request("GET", url, headers=headers, params=params)
    if response.status_code != 200:
        # Report the failure and leave the decision about retrying to the caller
        print(response.status_code, response.text)

    return response

def convert_timezone(normalized_data):
    """Return a copy of ``normalized_data`` with ``created_at`` in Asia/Tokyo time.

    The ``created_at`` column is parsed as UTC, converted to the Asia/Tokyo
    timezone, and the rows are sorted by that time. The returned frame has a
    fresh 0..n-1 index. The frame passed in is not modified.

    Parameters
    ----------
    normalized_data : pandas.DataFrame with a ``created_at`` column.

    Returns
    -------
    pandas.DataFrame
        New frame, sorted by ``created_at``.
    """
    # Work on a copy so the caller's frame keeps its original column and index
    converted = normalized_data.copy()

    converted['created_at'] = pd.to_datetime(converted['created_at'], utc=True)
    tokyo_index = pd.DatetimeIndex(converted['created_at'], name='created_at').tz_convert('Asia/Tokyo')
    converted.index = tokyo_index

    # Sort the rows by creation time, then drop the datetime index again
    converted['created_at'] = converted.index
    converted = converted.sort_index()
    converted = converted.reset_index(drop=True)

    print('Timezone convertion finished')
    return converted

def store_checkpoint(normalized_data, keyword, savedir='./'):
    """Save the collected data to a CSV file so it survives a later error.

    The file name is ``keyword`` followed by the ``created_at`` of the last row
    and the ``created_at`` of the first row (both formatted ``%Y%m%d%H%M`` and
    joined by ``-``), with a ``.csv`` extension.

    Parameters
    ----------
    normalized_data : pandas.DataFrame whose ``created_at`` values support
        ``strftime``.
    keyword : string used as the file-name prefix.
    savedir : directory the file is written to (defaults to the current
        directory, as before).
    """
    # An empty frame has no first/last row to name the file after
    if len(normalized_data) == 0:
        print('No data to save, checkpoint skipped')
        return

    last_stamp = normalized_data.iloc[-1]['created_at'].strftime('%Y%m%d%H%M')
    first_stamp = normalized_data.iloc[0]['created_at'].strftime('%Y%m%d%H%M')
    filename = keyword + last_stamp + '-' + first_stamp + '.csv'

    savepath = os.path.join(savedir, filename)

    normalized_data.to_csv(path_or_buf=savepath)
    print('Successfully saved at ' + savepath + ' at ' +
          datetime.today().strftime("%Y/%m/%d-%H:%M"))
    
def json_to_df(json_response):
    """Flatten one page of search results into a DataFrame.

    Tweets are taken from ``json_response['data']`` and joined with the user
    records in ``json_response['includes']['users']`` on ``author_id``. User
    columns whose names clash with tweet columns get a ``_user`` suffix. In
    ``text``, newlines are replaced by spaces and a space is inserted before
    every ``@``.

    Parameters
    ----------
    json_response : dict decoded from the search response.

    Returns
    -------
    pandas.DataFrame
        One frame for the page; an empty frame when the page has no ``data``.
    """
    tweets = json_response.get('data')
    if not tweets:
        # A page without results carries no 'data' key
        return pd.DataFrame()

    normalized_data = pd.json_normalize(tweets)
    normalized_data = normalized_data.rename({'geo.place_id': 'place_id'}, axis=1)

    users = json_response.get('includes', {}).get('users')
    if users:
        # The user id becomes the index so it can be matched against author_id
        normalized_users = pd.json_normalize(users).set_index('id')
        normalized = normalized_data.join(normalized_users, on='author_id', how='outer', rsuffix='_user')
    else:
        normalized = normalized_data

    normalized['text'] = normalized['text'].str.replace('\n', ' ', regex=False)     #replace \n inside text with a space
    normalized['text'] = normalized['text'].str.replace('@', ' @', regex=False)     #add a space before @

    return normalized
    


def _merge_pages(pages):
    """Concatenate the per-page frames behind an empty frame that fixes the leading column order."""
    template = pd.DataFrame(columns=['created_at','id', 'text', 'author_id',
                                     'place_id', 'description',
                                     'name', 'verified', 'username', 'created_at_user',
                                     'location'])
    return pd.concat([template] + pages, ignore_index=True)


def _save_checkpoint(pages, keyword):
    """Write everything collected so far to CSV; does nothing when no page has been collected."""
    if not pages:
        print('no tweets collected yet, nothing to save')
        return
    store_checkpoint(convert_timezone(_merge_pages(pages)), keyword)


def _seconds_until_rate_limit_reset(response, default_wait=15 * 60 + 10):
    """Seconds to wait (never negative) until the time given by the x-rate-limit-reset header."""
    try:
        throttle_end_timestamp = int(response.headers.get('x-rate-limit-reset'))
    except (TypeError, ValueError):
        # Header missing or not a number: fall back to a fixed wait
        return default_wait
    return max(int(throttle_end_timestamp - datetime.now().timestamp()) + 5, 0)


#Function that actually collects the tweets (calling the helper functions defined above)
def main(keyword, start_time, end_time):
    """Collect tweets matching ``keyword`` page by page and save them as CSV.

    Each loop iteration sends exactly one request. A successful page is added
    to the collected data and its ``next_token`` is used for the next request.
    On a 429 or any other non-200 status the data collected so far is saved,
    the function sleeps, and the *same* page is requested again.

    Collection stops when a page has no ``next_token``, when ``count`` (the
    summed ``result_count`` of the pages that had a following page) reaches
    ``TWEET_LIMIT``, or after ``MAX_CONSECUTIVE_ERRORS`` non-429 failures in a
    row. Because the limit is only checked between pages, the number of saved
    tweets can exceed ``TWEET_LIMIT``.

    Parameters
    ----------
    keyword : search query; also used as the CSV file-name prefix.
    start_time, end_time : time window passed to the search request.
    """
    TWEET_LIMIT = 1000
    MAX_CONSECUTIVE_ERRORS = 3

    count = 0
    total = 0                 # number of rows collected so far
    consecutive_errors = 0
    pages = []                # one DataFrame per successfully fetched page

    query_params = make_parm(keyword, start_time, end_time)  #the search conditions are built here
    headers = create_headers(BT)

    while count < TWEET_LIMIT:
        time.sleep(1)  # rate limit = 1 request/1 sec
        response = connect_to_endpoint(search_url, headers, query_params)

        if response.status_code == 429:
            # store checkpoint before waiting for the rate limit to reset
            _save_checkpoint(pages, keyword)

            # from https://github.com/mammalofski/Twitter-Scraper/blob/main/Scraper.py
            print('too many requests ... ')
            print('the header is ', response.headers)
            time_to_wait = _seconds_until_rate_limit_reset(response)
            throttle_end_time = datetime.strftime(
                datetime.fromtimestamp(time.time() + time_to_wait), "%H:%M:%S")
            print('lets rest for', time_to_wait, 'seconds and wake up at', throttle_end_time)
            print('sleeping ...')
            time.sleep(time_to_wait)
            continue  # request the same page again

        if response.status_code != 200:
            consecutive_errors += 1
            _save_checkpoint(pages, keyword)

            print('un expected error : ')
            print(response.status_code, response.text)
            if consecutive_errors >= MAX_CONSECUTIVE_ERRORS:
                print('giving up after', consecutive_errors, 'consecutive errors')
                break
            print('sleep for 15 min')
            time.sleep(15 * 60 + 10)  # rest 15min
            print('restart at ' + datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S"))
            continue  # request the same page again

        consecutive_errors = 0
        json_response = response.json()
        meta = json_response['meta']
        result_count = meta['result_count']

        # A page with no results has nothing to flatten
        if result_count > 0:
            page = json_to_df(json_response)
            pages.append(page)
            total += len(page)

        print("total:" + str(total) + "tweets")

        if 'next_token' not in meta:
            print('Last page of query results')
            break

        # Advance to the next page only after this one has been stored
        query_params['next_token'] = meta['next_token']
        count += result_count
        time.sleep(3)

    if not pages:
        print('No tweets were collected')
        return

    # convert UTC +0 timezone to UTC +9 timezone
    normalized_data_last = convert_timezone(_merge_pages(pages))

    if count % 500 <= 100:
        # .iloc is required: after the index reset, label -1 does not exist
        print('Finished collecting tweets at ' + str(normalized_data_last['created_at'].iloc[-1]))

    # store the final result to csv
    store_checkpoint(normalized_data_last, keyword)
    
    #normalized_data_last.to_csv('../FullArchiveData_2021Apr/T20200331-0501.csv')#csvファイルに保存

In [33]:
# filter example #

# keyword + ' lang:ja -診断メーカー -募集中 -みんなからの匿名質問を募集中！ -DM -エロ -ホワイトライオン -日経新聞 -東京新聞 -朝日新聞デジタル -日本経済新聞 -毎日新聞 -産経新聞 -埼玉新聞 -@Sankei_news -青森ニュース -産経ニュース -#linenews -is:reply -is:retweet'

In [28]:
# Collect tweets matching the keyword "テニス" (tennis) for the window
# 2021-02-15 00:00 to 09:00 UTC.
main(keyword="テニス", start_time="2021-02-15T00:00:00Z", end_time="2021-02-15T09:00:00Z")

total:478tweets
total:947tweets
total:1423tweets
Timezone convertion finished
Successfully saved at ./テニス202102151759-202102151514.csv at 2021/05/06-13:48


In [29]:
# Read the saved CSV back, keeping only four columns; `id` is loaded as a string
# rather than being parsed as a number.
test = pd.read_csv('./テニス202102151759-202102151514.csv',usecols=['created_at','id', 'text', 'author_id'],dtype={'id':str})

In [30]:
# Display the frame that was just read back from the CSV
test

Unnamed: 0,created_at,id,text,author_id
0,2021-02-15 15:14:14+09:00,1361197105105178630,ＴＨＥ ＳとＭワンセットマッチ 性感サービスプレイ❤　‹あいつこそがテニスの王子様›より,519225079
1,2021-02-15 15:14:18+09:00,1361197122972831745,【NEWラケット】S-MACHPRO 97 295 Ver.2.0【トアルソン】 https...,115589833
2,2021-02-15 15:14:23+09:00,1361197143449460738,神がテニス部を作るとき　誰の投稿かわからんけど https://t.co/qUATA7ih...,2477696780
3,2021-02-15 15:14:58+09:00,1361197291219128322,@PT_shiba テニス肘が緩和される方法あれば教えてほしいです💧,952556141023715328
4,2021-02-15 15:15:03+09:00,1361197309896372226,国枝２連覇ならず、上地は決勝進出…全豪車いすテニス https://t.co/8M73n8Q...,109488235
...,...,...,...,...
1419,2021-02-15 17:59:46+09:00,1361238762752077827,@tae_kumachan ちょこちゃんテニス部だったんだね🎾 私は帰宅部🏡笑,1354031688435658753
1420,2021-02-15 17:59:50+09:00,1361238782192754694,いつでも地味なテニス部,1364098464
1421,2021-02-15 17:59:51+09:00,1361238785002905603,@namseokdiary だよねwww迷わず会員になる😭 テニス教室のために働くと言って...,1105812375255474177
1422,2021-02-15 17:59:56+09:00,1361238806276362241,RT @nhk_chubu: あす16日(火)の #さらさらサラダ は 全豪オープンテニス...,191710487
