In [None]:
# Imports, Setup, Functions
from TwitterAPI import TwitterAPI, TwitterPager
import yaml
import time
import json_lines
import json
import pandas as pd
import datetime

# Load the notebook configuration from config.yaml in the working directory.
# The code below reads config['twitter']['api_key'] and
# config['twitter']['api_secret_key'] from it, so credentials stay out of the notebook.
with open("config.yaml", 'r') as ymlfile:
    config = yaml.safe_load(ymlfile)

# Module-level TwitterAPI client, created with auth_type='oAuth2'.
# The collection helpers defined in this cell use it through the global name `api`.
api = TwitterAPI(config['twitter']['api_key'], 
                 config['twitter']['api_secret_key'],
                 auth_type='oAuth2'
                )


# Collect full Tweets
def collect_tweets(search_term, since='', since_id='', filename='', max_id=''):
    """Page through ``search/tweets`` and append every full tweet to a JSONL file.

    Parameters
    ----------
    search_term : str
        Search query. It is printed, sent as ``q`` and used for the default filename.
    since : str, optional
        When non-empty, appended to the query as ``since:<value>``.
    since_id, max_id : optional
        Passed through unchanged as the request's ``since_id`` / ``max_id`` parameters.
    filename : str, optional
        Output path, opened in append mode. Defaults to ``<search_term>.jsonl``.

    Returns
    -------
    list
        Always an empty list. Tweets are written to disk as they arrive and are
        not kept in memory.
    """
    print(search_term)
    if since != '':
        query = '{0} since:{1}'.format(search_term, since)
    else:
        query = search_term
    pager = TwitterPager(api, 'search/tweets', {'q': query,
                                                'count': 100,
                                                'tweet_mode': 'extended',
                                                'since_id': since_id,
                                                'max_id': max_id,
                                                'result_type': 'recent'})
    if filename == '':
        filename = search_term + '.jsonl'

    n = 0
    with open(filename, 'a', encoding='utf-8') as f:
        for item in pager.get_iterator(wait=2):
            n += 1
            if n % 1000 == 0:
                # The item may be an API error object without 'created_at',
                # so the progress line must not index it directly.
                print('{0} {1}'.format(item.get('created_at', 'n/a'), n))
            if 'full_text' in item:
                json.dump(item, f)
                f.write('\n')
            elif 'message' in item:
                if item.get('code') == 88:
                    print('SUSPEND, RATE LIMIT EXCEEDED: %s\n' % item['message'])
                    break
                # Report other API errors instead of dropping them silently.
                print('API ERROR %s: %s' % (item.get('code'), item['message']))
    return []

# Collect only IDs and meta data of Tweets
def collect_tweet_ids(search_term, since='', since_id='', filename='', max_id=''):
    """Page through ``search/tweets`` and append a compact ID record per tweet to a JSONL file.

    Each written line is a JSON object with the keys ``created_at``, ``id``,
    ``user_id`` (from ``item['user']['id']``), ``at_id`` (``in_reply_to_status_id``),
    ``rt_id`` (``retweeted_status.id``) and ``qt_id`` (``quoted_status.id``).
    The last three are ``None`` when the tweet does not carry that field.

    Parameters
    ----------
    search_term : str
        Search query. It is printed, sent as ``q`` and used for the default filename.
    since : str, optional
        When non-empty, appended to the query as ``since:<value>``.
    since_id, max_id : optional
        Passed through unchanged as the request's ``since_id`` / ``max_id`` parameters.
    filename : str, optional
        Output path, opened in append mode. Defaults to ``<search_term>.jsonl``.

    Returns
    -------
    tuple
        Always an empty tuple. Results are only written to disk.
    """
    print(search_term)
    if since != '':
        query = '{0} since:{1}'.format(search_term, since)
    else:
        query = search_term
    pager = TwitterPager(api, 'search/tweets', {'q': query,
                                                'count': 100,
                                                'since_id': since_id,
                                                'max_id': max_id,
                                                'result_type': 'recent'})
    if filename == '':
        filename = search_term + '.jsonl'

    n = 0
    with open(filename, 'a', encoding='utf-8') as f:
        for item in pager.get_iterator(wait=2.2):
            n += 1
            if n % 1000 == 0:
                # The item may be an API error object without 'created_at',
                # so the progress line must not index it directly.
                print('{0} {1}'.format(item.get('created_at', 'n/a'), n))
            if 'text' in item:
                retweeted = item.get('retweeted_status')
                quoted = item.get('quoted_status')
                json.dump({'created_at': item['created_at'],
                           'id': item['id'],
                           'user_id': item['user']['id'],
                           'at_id': item.get('in_reply_to_status_id'),
                           'rt_id': item['retweeted_status']['id'] if 'retweeted_status' in item else None,
                           'qt_id': item['quoted_status']['id'] if 'quoted_status' in item else None
                           }, f)
                f.write('\n')
            elif 'message' in item:
                if item.get('code') == 88:
                    print('SUSPEND, RATE LIMIT EXCEEDED: %s\n' % item['message'])
                    break
                # Report other API errors instead of dropping them silently.
                print('API ERROR %s: %s' % (item.get('code'), item['message']))
    return ()
    

def hydrate(ids, filename):
    """Look up full tweets for the given IDs and append them to a JSONL file.

    The IDs are split into batches of 100 and each batch is sent to
    ``statuses/lookup`` with ``tweet_mode='extended'``. Every returned item that
    contains ``full_text`` is appended to ``filename`` as one JSON document per line.
    Batches are taken from the end of the list first.

    Parameters
    ----------
    ids : iterable
        Tweet IDs. Any iterable is accepted, including the set returned by
        ``load_tweet_ids_from_jsonl``.
    filename : str
        Output path, opened in append mode once per batch.

    Returns
    -------
    None
        Returns early, without requesting the remaining batches, as soon as the
        API reports error code 88 (rate limit exceeded).
    """
    # Sets cannot be sliced, so materialise the IDs as a list first.
    id_list = list(ids)
    chunks = [id_list[x:x + 100] for x in range(0, len(id_list), 100)]
    n = 0
    while chunks:
        time.sleep(3)  # 300 requests per 15min
        chunk = chunks.pop()
        r = api.request('statuses/lookup', {'id': ','.join(map(str, chunk)),
                                            'count': 100,
                                            'tweet_mode': 'extended'
                                            })

        with open(filename, 'a', encoding='utf-8') as f:
            for item in r:
                n += 1
                if n % 100 == 0:
                    # The item may be an API error object without 'created_at'.
                    print('{0} {1}'.format(item.get('created_at', 'n/a'), n))
                if 'full_text' in item:
                    json.dump(item, f)
                    f.write('\n')
                elif 'message' in item:
                    if item.get('code') == 88:
                        print('SUSPEND, RATE LIMIT EXCEEDED: %s\n' % item['message'])
                        # Stop completely: further requests would also be
                        # rejected and their batches lost.
                        return
                    # Report other API errors instead of dropping them silently.
                    print('API ERROR %s: %s' % (item.get('code'), item['message']))
    return




def save_tweets(tweets, file):
    """Append every tweet in ``tweets`` to ``file`` as one JSON document per line.

    The file is opened in append mode with UTF-8 encoding. For every 1000th
    tweet its ``created_at`` value and the running count are printed as progress.
    """
    with open(file, 'a', encoding='utf-8') as out:
        for count, record in enumerate(tweets, start=1):
            if count % 1000 == 0:
                print(record['created_at'] + ' ' + str(count))
            json.dump(record, out)
            out.write('\n')
            

def load_tweet_ids_from_jsonl(files):
    """Return the set of ``id`` values found in the given JSONL files.

    Each path in ``files`` is opened in binary mode and read with
    ``json_lines.reader(..., broken=True)``. The ``id`` of every record read is
    added to one shared set, so IDs that occur more than once are kept once.
    """
    collected = set()
    for path in files:
        with open(path, 'rb') as handle:
            records = json_lines.reader(handle, broken=True)
            collected.update(record['id'] for record in records)
    return collected



In [None]:
# Get highest ID of last collection
ids = load_tweet_ids_from_jsonl(['lang_de-2020-11-28_IDs.jsonl'])

# Fail with a clear message rather than an opaque error if the previous
# collection file yielded no IDs.
if not ids:
    raise ValueError('No tweet IDs found in lang_de-2020-11-28_IDs.jsonl')

# max() finds the largest ID in one pass, without building and sorting a
# full copy of the set.
highest_id = max(ids)
print(highest_id)

# Free the ID set; only highest_id is used by the following collection cell.
del ids

In [None]:
# Start new collection
# Write a start line to the log, collect the IDs of all 'lang:de' tweets newer
# than highest_id, then write a finish line.
# datetime.now() gives the same local timestamp as fromtimestamp(time.time()).
with open('collection_log.txt', 'a') as f:
    f.write(f'{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")} Collection started \n')

collect_tweet_ids('lang:de',
                  filename='lang_de-2020-11-30_IDs.jsonl',
                  since_id=highest_id
                 )

# This line is only reached when collect_tweet_ids returns without raising.
with open('collection_log.txt', 'a') as f:
    f.write(f'{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")} Collection finished\n')