In [1]:
import ssl
import yaml

# SECURITY: this swaps the stdlib's default HTTPS context factory for the
# unverified one, so TLS certificates are no longer validated for HTTPS
# connections that use the default context, for the rest of the kernel session.
# NOTE(review): presumably a workaround for a local certificate problem —
# confirm, and prefer fixing the CA bundle so this line can be deleted.
ssl._create_default_https_context = ssl._create_unverified_context

import twitter

In [2]:
def load_config(path):
    """Read the YAML file at `path` and return its parsed content.

    # Arguments
        path: path of the YAML file to read.

    # Returns
        The object produced by `yaml.load` for the file content.
    """
    # NOTE(review): FullLoader is not PyYAML's restricted loader; if `path` can
    # ever point at an untrusted file, consider yaml.safe_load instead.
    with open(path, 'r') as stream:
        return yaml.load(stream, Loader=yaml.FullLoader)

In [3]:
def build_api(token_path):
    """Build a Twitter API client from the credentials in a token config file.

    # Arguments
        token_path: path of the config file (read with `load_config`) holding
            the keys `ConsumerKey`, `ConsumerSecret`, `AccessToken` and
            `AccessTokenSecret`.

    # Returns
        The `twitter.Twitter` client created with OAuth credentials and
        `retry=True`.

    # Raises
        KeyError: if one or more of the four credential keys are missing.
    """
    token_config = load_config(token_path)

    # Report every missing credential at once, with the file name, instead of
    # failing on the first lookup with a bare key name.
    required_keys = ('ConsumerKey', 'ConsumerSecret', 'AccessToken', 'AccessTokenSecret')
    missing_keys = [key for key in required_keys if key not in token_config]
    if missing_keys:
        raise KeyError('missing key(s) in {}: {}'.format(token_path, ', '.join(missing_keys)))

    # Credentials are handed to twitter.OAuth in this order: access token,
    # access token secret, consumer key, consumer secret.
    auth = twitter.OAuth(token_config['AccessToken'],
                         token_config['AccessTokenSecret'],
                         token_config['ConsumerKey'],
                         token_config['ConsumerSecret'])
    return twitter.Twitter(auth=auth, retry=True)

In [4]:
class TweetsGetter(object):
    """Base class for objects that collect tweets through the Twitter API.

    # Arguments
        token_path: path of the Twitter API token config file.
        max_count: number of tweets requested per API call.

    # How to use
        Subclass it and override `get_tweets`, then:

        getter = SomeTweetsGetter(token_path)
        tweets = getter.get_tweets()

    """
    def __init__(self, token_path, max_count=100):
        self.max_count = max_count
        self.api = build_api(token_path)

    def get_tweets(self, since_id=None):
        """Return the collected tweets; this base version returns an empty list.

        Subclasses are expected to override this and fetch data with `self.api`.
        """
        return []


class UserTweetsGetter(TweetsGetter):
    """Collect the tweets of one user, identified by screen name.

    # Arguments
        token_path: path of the Twitter API token config file.
        max_count: number of tweets requested per API call.

    # How to use
        getter = UserTweetsGetter(token_path)
        getter.set_user('peachgan_r6s')
        tweets = getter.get_tweets()
        ==> returns a list of tweets and stores the largest fetched tweet id in
            `self.since_id_dict[screen_name]`. Calling `set_user` with the same
            name again loads that id, and the next `get_tweets` passes it to
            the API as `since_id`.

    """
    def __init__(self, token_path, max_count=100):
        # screen_name -> largest tweet id fetched so far for that user
        self.since_id_dict = {}
        # Defined up front so get_tweets() can detect a missing set_user().
        self.screen_name = None
        self.since_id = None
        super(UserTweetsGetter, self).__init__(token_path, max_count)

    def set_user(self, screen_name):
        """Select the user to fetch and load the since_id remembered for them."""
        self.screen_name = screen_name
        self.since_id = self.since_id_dict.get(self.screen_name, None)

    def _set_since_id(self, since_id):
        """Set the since_id that is passed to the API on the next fetch."""
        self.since_id = since_id

    def get_tweets(self):
        """Fetch the selected user's timeline page by page.

        # Returns
            List of tweets as returned by the API, in the order the pages were
            received. When at least one tweet was fetched,
            `self.since_id_dict[screen_name]` is set to the largest tweet id.

        # Raises
            AttributeError: if `set_user` has not been called.
        """
        if self.screen_name is None:
            # Same exception type as before (the attribute used to be missing),
            # but with an actionable message.
            raise AttributeError('screen_name is not set; call set_user() first')

        max_id = None
        output_list = []
        while True:
            tweets, min_id, tweets_num = self._get_tweets_core(max_id)
            # Stop only on an empty page. The endpoint applies `count` before
            # removing suspended/deleted tweets, so a page shorter than
            # max_count does not mean the timeline is exhausted.
            if tweets_num == 0:
                break
            output_list.extend(tweets)
            # Next page: everything strictly older than what was just received.
            max_id = min_id - 1

        if output_list:
            self.since_id_dict[self.screen_name] = max(tweet['id'] for tweet in output_list)

        return output_list

    def _get_tweets_core(self, max_id=None):
        """Fetch one page and return `(tweets, min_id, tweets_num)`.

        `min_id` is the smallest tweet id on the page, or 0 for an empty page.
        """
        tweets = self._get_user_timeline(max_id)
        tweets_num = len(tweets)
        min_id = min(tweet['id'] for tweet in tweets) if tweets_num else 0

        return tweets, min_id, tweets_num

    def _get_user_timeline(self, max_id):
        """Call `statuses.user_timeline`, sending only the bounds that are set."""
        params = {'screen_name': self.screen_name, 'count': self.max_count}
        if self.since_id is not None:  # a previous fetch exists for this user
            params['since_id'] = self.since_id
        if max_id is not None:  # second or later page of this fetch
            params['max_id'] = max_id
        return self.api.statuses.user_timeline(**params)


class ReplyTweetsGetter(TweetsGetter):
    """Collect the replies to one tweet, plus the replies to those replies.

    # Arguments
        token_path: path of the Twitter API token config file.
        max_count: number of tweets requested per API call.

    # How to use
        getter = ReplyTweetsGetter(token_path)
        getter.set_root_tweet(1115797923499892736)
        replies = getter.get_tweets()
        ==> returns a list of tweets and stores the largest collected reply id
            in `self.since_id_dict[root_tweet_id]`. Calling `set_root_tweet`
            with the same id again loads that id, and the next `get_tweets`
            uses it as the lower bound of its searches.

    """
    def __init__(self, token_path, max_count=100):
        # root tweet id -> largest reply id collected so far for that tweet
        self.since_id_dict = {}
        # Defined up front so get_tweets() can detect a missing set_root_tweet().
        self.root_tweet_id = None
        self.since_id = None
        super(ReplyTweetsGetter, self).__init__(token_path, max_count)

    def set_root_tweet(self, tweet_id):
        """Select the root tweet and load the since_id remembered for it."""
        self.root_tweet_id = tweet_id
        self.since_id = self.since_id_dict.get(self.root_tweet_id, None)

    def get_tweets(self):
        """Collect direct replies to the root tweet and replies to each of them.

        # Returns
            List of tweets: the direct replies first, then the replies to each
            direct reply. When the list is not empty,
            `self.since_id_dict[root_tweet_id]` is set to the largest tweet id.

        # Raises
            AttributeError: if `set_root_tweet` has not been called.
        """
        if self.root_tweet_id is None:
            # Same exception type as before (the attribute used to be missing),
            # but with an actionable message.
            raise AttributeError('root_tweet_id is not set; call set_root_tweet() first')

        root_tweet = self.api.statuses.show(id=self.root_tweet_id)
        reply_list = self._get_replys(root_tweet)
        output_list = list(reply_list)

        # Only one more level is followed: replies to each direct reply.
        for reply in reply_list:
            output_list.extend(self._get_replys(reply))

        if output_list:
            self.since_id_dict[self.root_tweet_id] = max(tweet['id'] for tweet in output_list)

        return output_list

    def _get_replys(self, tweet):
        """Search tweets addressed to the author of `tweet` and keep its replies.

        # Arguments
            tweet: tweet dict; `tweet['user']['screen_name']` and `tweet['id']`
                are read.

        # Returns
            List of found tweets whose `in_reply_to_status_id` equals
            `tweet['id']`.
        """
        query = "to:" + tweet['user']['screen_name']
        tweet_id = tweet['id']
        # The lower bound is never below the tweet being replied to.
        since_id = tweet_id if self.since_id is None else max(self.since_id, tweet_id)

        reply_list = []
        max_id = None
        while True:
            response_list = self._get_reply_core(query, since_id, max_id)
            if len(response_list) == 0:
                break
            reply_list.extend(response for response in response_list
                              if response['in_reply_to_status_id'] == tweet_id)
            # NOTE(review): a page shorter than max_count is treated as the last
            # one. Confirm the search endpoint never returns short pages
            # mid-result; otherwise drop this check and stop on the empty page.
            if len(response_list) < self.max_count:
                break
            # min() rather than the last element, so the paging cursor does not
            # depend on the order in which results are returned.
            max_id = min(response['id'] for response in response_list) - 1

        return reply_list

    def _get_reply_core(self, query, since_id, max_id=None):
        """Run one `search.tweets` request and return its `'statuses'` list."""
        params = {'q': query, 'since_id': since_id, 'count': self.max_count}
        if max_id is not None:  # second or later page of this search
            params['max_id'] = max_id
        return self.api.search.tweets(**params)['statuses']



In [5]:
# Demo: collect the replies to one root tweet with a fresh ReplyTweetsGetter.
token_path = 'twitter_token.yml'
getter = ReplyTweetsGetter(token_path)
getter.set_root_tweet(1115797923499892736)
first_output = getter.get_tweets()
print ('data_num:{}'.format(len(first_output)))
# Display the first collected tweet; raises IndexError when nothing was found.
first_output[0]

data_num:5


{'created_at': 'Wed Apr 10 02:06:43 +0000 2019',
 'id': 1115798231437307904,
 'id_str': '1115798231437307904',
 'text': '@lALpMD6hnfSXlp2 今日のランチはなにかなー',
 'truncated': False,
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [{'screen_name': 'lALpMD6hnfSXlp2',
    'name': 'まなびちゃん',
    'id': 1051429642110791681,
    'id_str': '1051429642110791681',
    'indices': [0, 16]}],
  'urls': []},
 'metadata': {'iso_language_code': 'ja', 'result_type': 'recent'},
 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',
 'in_reply_to_status_id': 1115797923499892736,
 'in_reply_to_status_id_str': '1115797923499892736',
 'in_reply_to_user_id': 1051429642110791681,
 'in_reply_to_user_id_str': '1051429642110791681',
 'in_reply_to_screen_name': 'lALpMD6hnfSXlp2',
 'user': {'id': 780396177606873088,
  'id_str': '780396177606873088',
  'name': 'Peach-gan(r6s初心者)',
  'screen_name': 'peachgan_r6s',
  'location': '',
  'description': 'youtubeで配信してます

In [6]:
# Re-select the same root tweet so the since_id stored by the previous
# get_tweets() call is loaded; this second fetch then only searches above it.
getter.set_root_tweet(1115797923499892736)
second_output = getter.get_tweets()
print ('data_num:{}'.format(len(second_output)))
second_output

data_num:0


[]

In [7]:
# Demo: fetch one user's timeline. `getter` and `first_output` are rebound
# here, replacing the ReplyTweetsGetter and its result from the cells above.
getter = UserTweetsGetter(token_path)
getter.set_user('peachgan_r6s')
first_output = getter.get_tweets()
print ('data_num:{}'.format(len(first_output)))
# Display the first fetched tweet; raises IndexError when nothing was fetched.
first_output[0]

data_num:808


{'created_at': 'Tue Apr 16 00:00:40 +0000 2019',
 'id': 1117940835079712768,
 'id_str': '1117940835079712768',
 'text': 'こんだけすばらしい\n歴史的建造物が\n一瞬にして燃えるの悲しいが\n行っておいてよかった。\n\nかなり神秘的な空間だった。 https://t.co/CtrEBXG5wb',
 'truncated': False,
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [],
  'urls': [],
  'media': [{'id': 1117940798077595648,
    'id_str': '1117940798077595648',
    'indices': [59, 82],
    'media_url': 'http://pbs.twimg.com/media/D4O5O5VU8AAUNKt.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/D4O5O5VU8AAUNKt.jpg',
    'url': 'https://t.co/CtrEBXG5wb',
    'display_url': 'pic.twitter.com/CtrEBXG5wb',
    'expanded_url': 'https://twitter.com/peachgan_r6s/status/1117940835079712768/photo/1',
    'type': 'photo',
    'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
     'small': {'w': 680, 'h': 510, 'resize': 'fit'},
     'large': {'w': 2048, 'h': 1536, 'resize': 'fit'},
     'medium': {'w': 1200, 'h': 900, 'resize': 'fit'}}}]},
 'extended_e

In [8]:
# Re-select the same user so the since_id stored by the previous get_tweets()
# call is loaded and passed to the API on this second fetch.
getter.set_user('peachgan_r6s')
second_output = getter.get_tweets()
print ('data_num:{}'.format(len(second_output)))
second_output

data_num:0


[]

In [11]:
# Rebinds UserTweetsGetter / ReplyTweetsGetter to the versions in the get_tweet
# module, shadowing the classes defined earlier in this notebook.
# NOTE(review): presumably get_tweet.py is this notebook's code exported to a
# module — confirm, and move this import into the top import cell.
from get_tweet import UserTweetsGetter, ReplyTweetsGetter

In [13]:
# Same reply-collection demo as before, now using the ReplyTweetsGetter
# imported from the get_tweet module.
token_path = 'twitter_token.yml'
getter = ReplyTweetsGetter(token_path)
getter.set_root_tweet(1115797923499892736)
first_output = getter.get_tweets()
print ('data_num:{}'.format(len(first_output)))
# Display the first collected tweet; raises IndexError when nothing was found.
first_output[0]

data_num:5


{'created_at': 'Wed Apr 10 02:06:43 +0000 2019',
 'id': 1115798231437307904,
 'id_str': '1115798231437307904',
 'text': '@lALpMD6hnfSXlp2 今日のランチはなにかなー',
 'truncated': False,
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [{'screen_name': 'lALpMD6hnfSXlp2',
    'name': 'まなびちゃん',
    'id': 1051429642110791681,
    'id_str': '1051429642110791681',
    'indices': [0, 16]}],
  'urls': []},
 'metadata': {'iso_language_code': 'ja', 'result_type': 'recent'},
 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',
 'in_reply_to_status_id': 1115797923499892736,
 'in_reply_to_status_id_str': '1115797923499892736',
 'in_reply_to_user_id': 1051429642110791681,
 'in_reply_to_user_id_str': '1051429642110791681',
 'in_reply_to_screen_name': 'lALpMD6hnfSXlp2',
 'user': {'id': 780396177606873088,
  'id_str': '780396177606873088',
  'name': 'Peach-gan(r6s初心者)',
  'screen_name': 'peachgan_r6s',
  'location': '',
  'description': 'youtubeで配信してます

In [14]:
# Same user-timeline demo as before, now using the UserTweetsGetter imported
# from the get_tweet module; `getter` and `first_output` are rebound again.
getter = UserTweetsGetter(token_path)
getter.set_user('peachgan_r6s')
first_output = getter.get_tweets()
print ('data_num:{}'.format(len(first_output)))
# Display the first fetched tweet; raises IndexError when nothing was fetched.
first_output[0]

data_num:808


{'created_at': 'Tue Apr 16 00:00:40 +0000 2019',
 'id': 1117940835079712768,
 'id_str': '1117940835079712768',
 'text': 'こんだけすばらしい\n歴史的建造物が\n一瞬にして燃えるの悲しいが\n行っておいてよかった。\n\nかなり神秘的な空間だった。 https://t.co/CtrEBXG5wb',
 'truncated': False,
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [],
  'urls': [],
  'media': [{'id': 1117940798077595648,
    'id_str': '1117940798077595648',
    'indices': [59, 82],
    'media_url': 'http://pbs.twimg.com/media/D4O5O5VU8AAUNKt.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/D4O5O5VU8AAUNKt.jpg',
    'url': 'https://t.co/CtrEBXG5wb',
    'display_url': 'pic.twitter.com/CtrEBXG5wb',
    'expanded_url': 'https://twitter.com/peachgan_r6s/status/1117940835079712768/photo/1',
    'type': 'photo',
    'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
     'small': {'w': 680, 'h': 510, 'resize': 'fit'},
     'large': {'w': 2048, 'h': 1536, 'resize': 'fit'},
     'medium': {'w': 1200, 'h': 900, 'resize': 'fit'}}}]},
 'extended_e