# Example of code to acquire data from Twitter
See the [Twitter Developer Program](https://developer.twitter.com/en/docs/developer-portal/overview)

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import requests
import os
import json

## Access tokens
The bearer is the user specific key for accessing the API. Substitute the path here with your own token.

In [3]:
with open('/Users/flint/Data/twitter/tokens/bearer', 'r') as bfile:
    bearer_token = bfile.read().rstrip('\n')
search_url = "https://api.twitter.com/2/tweets/search/all"

## Query composition
This is an example of a query for getting data from a timeline with time bounds.

In [4]:
from datetime import datetime, timedelta

In [5]:
time2str = lambda x: x.strftime("%Y-%m-%dT%H:%M:%S.00Z")

In [6]:
def query(timeline, starting_time, hour_delta=4, max_results=100):
    return {
        'query': 'from:{}'.format(timeline),
        'tweet.fields': 'author_id,public_metrics,created_at,context_annotations,conversation_id,geo',
        'start_time': time2str(starting_time),
        'end_time': time2str(starting_time + timedelta(hours=hour_delta)),
        'max_results': '{}'.format(max_results)
    }

In [7]:
query('guardian', datetime.now(), hour_delta=4)

{'query': 'from:guardian',
 'tweet.fields': 'author_id,public_metrics,created_at,context_annotations,conversation_id,geo',
 'start_time': '2022-03-04T14:01:12.00Z',
 'end_time': '2022-03-04T18:01:12.00Z',
 'max_results': '100'}

## Request

In [8]:
def bearer_oauth(r):
    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2FullArchiveSearchPython"
    return r

def connect_to_endpoint(url, params):
    response = requests.request("GET", search_url, auth=bearer_oauth, params=params)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def timeline_search(timeline, starting_time, hour_delta=4, max_results=100):
    return connect_to_endpoint(search_url, query(
        timeline, starting_time, hour_delta=hour_delta, max_results=max_results))

In [9]:
from_date = datetime.now() - timedelta(hours=24)
sample = timeline_search('guardian', from_date, hour_delta=2, max_results=10)
sample['data'][0]

{'created_at': '2022-03-03T15:58:00.000Z',
 'conversation_id': '1499413791297228809',
 'text': 'Have your photos published in the Guardian’s letters pages https://t.co/Cl9EnBzgQ0',
 'public_metrics': {'retweet_count': 3,
  'reply_count': 2,
  'like_count': 19,
  'quote_count': 0},
 'id': '1499413791297228809',
 'context_annotations': [{'domain': {'id': '47',
    'name': 'Brand',
    'description': 'Brands and Companies'},
   'entity': {'id': '1066122339568386048',
    'name': 'The Guardian',
    'description': 'The Guardian'}}],
 'author_id': '87818409'}

## Download
Massive download for multiple timelines. Saves data into MongoDb. Use sleep to take care of the query rate for twitter (120 queries per minute).

**For this sample dataset we collect the last 100 days of news from the top UK newspapers by number of followers, plus BBC**

In [10]:
import time
import dateutil.parser

In [11]:
uk_news = ['FinancialTimes', 'guardiannews', 'MailOnline', 'DailyMailUK', 'Telegraph', 
           'TheSun', 'thetimes', 'DailyMirror', 'Daily_Express', 'BBC']
recipes = ['RecipesIdeas', 'OldFashRecipes', 'simplyrecipes', 'GTKrecipes', 'My_Recipes',
          'RecipeGirl', 'BestRecipesUK', 'EclecticRecipes']

In [12]:
starting_date = datetime.now() - timedelta(hours=24*100)

In [13]:
delta = 24
failed_queries = []
records = []
time_limit, query_count = 100, 0
run = list(range(0, 24*100 + 1, delta))
timelines = recipes

for deltas in tqdm(run):
    search_date = starting_date + timedelta(hours=deltas)
    for timeline in timelines:
        query_count += 1
        try:
            data = timeline_search(timeline, search_date, hour_delta=delta, max_results=100)
            for record in data['data']:
                r = record
                r['created_at'] = dateutil.parser.parse(record['created_at'])
                r['timeline'] = timeline
                records.append(r)
        except:
            failed_queries.append((timeline, search_date, delta))
            pass
        if query_count == time_limit:
            query_count = 0
            time.sleep(120)

  0%|          | 0/101 [00:00<?, ?it/s]

In [14]:
len(records), len(failed_queries)

(1166, 731)

### Save and re-run failed queries

In [15]:
import pymongo

In [16]:
db = pymongo.MongoClient()['twitter']['tweets']

In [17]:
db.insert_many(records)

<pymongo.results.InsertManyResult at 0x7fdb5801a280>

In [18]:
time_limit, query_count = 100, 0
records = []
for timeline, search_date, delta in tqdm(failed_queries):
    query_count += 1
    try:
        data = timeline_search(timeline, search_date, hour_delta=delta, max_results=100)
        for record in data['data']:
            r = record
            r['created_at'] = dateutil.parser.parse(record['created_at'])
            r['timeline'] = timeline
            records.append(r)
    except:
        pass
    if query_count == time_limit:
        query_count = 0
        time.sleep(120)

  0%|          | 0/731 [00:00<?, ?it/s]

In [19]:
len(records)

707

In [20]:
db.insert_many(records)

<pymongo.results.InsertManyResult at 0x7fdbdaacbc00>