In [1]:
# Make sure that we reload custom-written modules each time to facilitate development
%reload_ext autoreload
%autoreload 2

In [11]:
from utils.meta_credentials import META_TEMP_TOKEN
import requests
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import time
import datetime
from loguru import logger
import pickle

In [None]:
# Register a log file as an additional loguru sink for this session.
# NOTE(review): "{time}" is presumably expanded by loguru into a timestamp,
# giving one log file per run — confirm against the loguru documentation.
logger.add("logs/meta_ads_api_calls_{time}.log")

In [3]:
# Names of the ad fields requested from the API. get_meta_ads interpolates
# this list into the `fields` query parameter of the request URL.
return_fields = [
    'page_id',
    'page_name',
    'bylines',
    'ad_creative_bodies',
    'ad_creative_link_captions',
    'ad_creative_link_descriptions',
    'ad_creative_link_titles',
    'ad_delivery_start_time',
    'ad_delivery_stop_time',
    'ad_snapshot_url',
    'demographic_distribution',
    'delivery_by_region',
    'estimated_audience_size',
    'publisher_platforms',
    'spend',
    'currency',
    'impressions'
]

In [4]:
def update_page_status(pages):
    """Increment the page counter, show it on the console, and return it.

    Args:
        pages: Number of pages collected before this call.

    Returns:
        ``pages + 1``.
    """
    collected = pages + 1
    # The carriage return lets the next status line overwrite this one.
    print(f'Pages collected: {collected}', end='\r')
    return collected

In [5]:
class ResponseError(Exception):
    """Error for an API response that does not contain the expected data."""

In [6]:
def request_ads_handle_errors(base_url, cursor, wait_on_rate_limit, max_retries = 3):
    """Fetch the page that follows ``cursor``, sleeping and retrying on API errors.

    Args:
        base_url: Request URL (query string included) without a paging cursor.
        cursor: Value sent as the ``after`` query parameter.
        wait_on_rate_limit: Seconds to sleep before each retry.
        max_retries: Maximum number of retries after the initial request.

    Returns:
        The first response whose JSON body contains a ``'data'`` key.

    Raises:
        ResponseError: If no response containing ``'data'`` could be obtained,
            either because every attempt returned an ``'error'`` body or
            because a body contained neither ``'data'`` nor ``'error'``.
    """
    import re  # function-scope import: only needed here, for log redaction

    # Construct URL for next page
    next_url = base_url + f"&after={cursor}"

    # The URL embeds the access token, which must not end up in the log
    # files, so a redacted copy is used for every log message.
    loggable_url = re.sub(r'(access_token=)[^&]*', r'\1<redacted>', next_url)

    # Get contents of next page
    response = requests.request("GET", next_url)
    payload = response.json()
    retries = 0

    while 'data' not in payload:

        # A body with neither 'data' nor 'error' is not something a retry
        # is known to fix, so stop right away.
        if 'error' not in payload:
            logger.debug(f'No response. Retry from {loggable_url} next time.')
            raise ResponseError(f'Data not in response after {retries} retries. Quitting.')

        # If we hit the (unknown) rate limit, the API will return a JSON
        # containing an error message. Read the message defensively so that
        # a malformed error body cannot raise KeyError from here.
        error = payload['error']
        error_message = error.get('message', error) if isinstance(error, dict) else error

        if retries >= max_retries:
            logger.debug(f'ERROR: {error_message} at {loggable_url}.\nGiving up after {retries} retries.')
            raise ResponseError(f'Data not in response after {retries} retries. Quitting.')

        logger.debug(f'ERROR: {error_message} at {loggable_url}.\nSleeping for {wait_on_rate_limit/60} mins. Retries: {retries}')

        # Sleep, then attempt to get the contents of the same page again
        time.sleep(wait_on_rate_limit)
        response = requests.request("GET", next_url)
        payload = response.json()
        retries += 1
        logger.debug(f'Retries: {retries}')

    return response

In [7]:
def json_responses2df(chunks: 'list'):
    """Combine a list of record chunks into a single DataFrame.

    Args:
        chunks: List of chunks; each chunk is passed to ``pd.DataFrame``
            as-is (in this notebook, the ``'data'`` list of one API page).

    Returns:
        The per-chunk DataFrames concatenated row-wise. Each chunk keeps its
        own index, so index labels repeat across chunks. An empty DataFrame
        is returned when ``chunks`` is empty.
    """
    df_list = [pd.DataFrame(chunk) for chunk in chunks]

    # pd.concat raises ValueError on an empty list, so handle that case here.
    if not df_list:
        return pd.DataFrame()

    return pd.concat(df_list)

In [8]:
# Load the table of MPs; later cells read its 'name' and 'page_id' columns.
mps = pd.read_csv('data/raw/parliament/MP_names_15_19.csv')

In [9]:
# Keep only the MPs that have a page ID. The explicit .copy() makes name2id
# an independent frame, so the column assignment below cannot trigger
# pandas' SettingWithCopyWarning or touch `mps`.
name2id = mps.loc[pd.notna(mps['page_id']), ['name', 'page_id']].copy()

# Cast via int so the IDs become plain digit strings.
# NOTE(review): presumably the column is read as float because of the
# missing values — confirm against the CSV.
name2id['page_id'] = name2id['page_id'].astype(int).astype(str)

In [12]:
# Partition the page IDs into 23 roughly equal batches (each a plain list)
# that are sent to the API one batch per call.
# NOTE(review): 23 is presumably chosen to stay under a per-call limit on
# the number of IDs — confirm and consider naming the constant.
page_id_splits = [list(split) for split in np.array_split(name2id['page_id'], 23)]

In [16]:
def get_meta_ads(page_ids, return_fields, access_token, wait_on_rate_limit = 3600, limit = 1000):
    """Collect every result page of an ``ads_archive`` query for the given pages.

    Args:
        page_ids: Page IDs to search. Interpolated into the URL with ``str()``,
            so a list is sent in its Python list representation.
        return_fields: Field names to request; interpolated the same way.
        access_token: API access token sent with every request.
        wait_on_rate_limit: Seconds to sleep before retrying a follow-up page
            that returned an error.
        limit: Value of the ``limit`` query parameter.

    Returns:
        A list with the parsed JSON body of every page collected, in order.

    Raises:
        ResponseError: If the first response contains no ``'data'`` key, or if
            a follow-up page cannot be retrieved.
    """
    chunks = []
    pages = 0

    base_url = (
        "https://graph.facebook.com/v15.0/ads_archive?"
        # Use the token the caller passed in, not the module-level constant.
        f"access_token={access_token}&"
        f"search_page_ids={page_ids}&"
        #f"search_terms=''&"
        "ad_type=POLITICAL_AND_ISSUE_ADS&"
        "ad_reached_countries=['DK']&"
        "ad_delivery_date_min=2018-05-07&"
        f"fields={return_fields}&"
        f"limit={limit}"
    )

    # First run to retrieve cursor
    logger.info(f'Calling Meta API...')
    response = requests.request("GET", base_url)
    payload = response.json()

    # Fail here with the API's own message rather than handing the caller a
    # chunk that has no 'data' key.
    if 'data' not in payload:
        error = payload.get('error', payload) if isinstance(payload, dict) else payload
        error_message = error.get('message', error) if isinstance(error, dict) else error
        raise ResponseError(f'Data not in first response: {error_message}')

    chunks.append(payload)
    pages = update_page_status(pages)

    # Keep looping as long as the latest page carries a cursor
    while True:

        # Only the cursor lookup is guarded: a missing key here means there
        # are no further pages. A KeyError raised anywhere else is a real
        # problem and must not be mistaken for the end of the results.
        try:
            cursor = payload['paging']['cursors']['after']
        except KeyError:
            logger.info('Last page reached. All done!')
            break

        response = request_ads_handle_errors(
            base_url,
            cursor,
            wait_on_rate_limit,
            max_retries = 3
        )

        # The error-handled API call ensures that we
        # only append responses with valid data fields
        payload = response.json()
        chunks.append(payload)
        pages = update_page_status(pages)

        # Sleep just in case -- multiple consecutive
        # requests empirically result in strange API
        # behaviour (undocumented burst limit)?
        time.sleep(2)

    return chunks

In [919]:
# TODO:
# Get parties and MPs inducted during period!!!!!!
# Make sure to get alternative text fields for carousel ads...
# Check if we get larger payloads when not specifying an empty search term

# Below we go on combining the collected splits

In [28]:
# Query the API once per batch of page IDs and gather the 'data' payload of
# every page returned, across all batches, in one flat list.
all_politicians = []

for i, page_ids in enumerate(page_id_splits):

    chunks = get_meta_ads(
        page_ids = page_ids,
        return_fields = return_fields,
        access_token = META_TEMP_TOKEN,
        wait_on_rate_limit = 3600,
        limit = 1000,
    )

    # Keep only the ad records of each page, dropping the paging metadata
    chunks_unpacked = [page['data'] for page in chunks]
    all_politicians += chunks_unpacked

2022-10-23 14:46:44.354 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 3

2022-10-23 14:46:55.028 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:46:55.030 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:47:00.013 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:47:00.015 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:47:06.999 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:47:07.000 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:47:11.082 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:47:11.083 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:47:16.727 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:47:16.729 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:47:20.858 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:47:20.859 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 3

2022-10-23 14:47:30.893 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:47:30.894 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:47:34.647 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:47:34.648 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:47:37.517 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:47:37.519 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:47:44.171 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:47:44.172 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 3

2022-10-23 14:47:53.815 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:47:53.817 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:48:00.136 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:48:00.137 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:48:05.055 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:48:05.058 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:48:09.459 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:48:09.461 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:48:15.659 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:48:15.660 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 3

2022-10-23 14:48:26.294 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:48:26.296 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:48:31.798 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:48:31.800 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 3

2022-10-23 14:48:47.041 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:48:47.042 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 3

2022-10-23 14:48:57.540 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:48:57.543 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:49:01.375 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:49:01.377 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:49:05.575 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:49:05.577 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:49:10.184 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-23 14:49:10.185 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 2

2022-10-23 14:49:16.121 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!


In [33]:
politician_ads_df = json_responses2df(all_politicians)

In [43]:
politician_ads_df.shape

(16065, 18)

In [47]:
politician_ads_df.iloc[197]['ad_snapshot_url']

'https://www.facebook.com/ads/archive/render_ad/?id=478938860954115&access_token=EAB0JThpjjzABADZB590C8OiCR1DGXReMiztOZCJBoZCVuZA7yDKk3TEEqH86zL4zjEaiEl63nuJXJbJycdUH1dFTVU2S5otBsZA1ZBEO3q9ZBl0dj1BUYsEfezZAA0YWomq7EHGsjOk1Dkm7yeEr9cE2G0hbmYBAtHgNFeIddWIYlAZDZD'

In [48]:
# Persist the combined politician ads table as a pickle file
with open('data/clean/meta/politician_ads_v1.p', mode='wb') as p:
    pickle.dump(politician_ads_df, p)

In [929]:
# Party name -> page ID (kept as a string). The values are split into
# batches and passed to get_meta_ads as `page_ids` further down.
party_dict = {
    'Alternativet': '676061179094769',
    'Danmarksdemokraterne': '108041771950114',
    'Dansk Folkeparti': '520449347983427',
    'Enhedslisten': '223040066022',
    'Frie Grønne': '104695404710124',
    'Konservative': '39233495912',
    'Kristendemokraterne': '180658048629404',
    'Liberal Alliance': '106952222676974',
    'Moderaterne': '108273171366317',
    'Nye Borgerlige': '646746285466931',
    'Radikale Venstre': '12458150929',
    'SF': '74796954245',
    'Socialdemokratiet': '41459763029',
    'Venstre': '21465928829'
}

In [None]:
# Split the party page IDs into 2 batches (each a plain list) so that no
# single API call exceeds the permitted number of IDs.
party_id_splits = [list(v) for v in np.array_split(list(party_dict.values()), 2)]

In [960]:
# Build one DataFrame of ads per batch of party page IDs
both_tables = []

for split in party_id_splits:

    # Defaults apply for wait_on_rate_limit and limit
    chunks = get_meta_ads(split, return_fields, META_TEMP_TOKEN)

    # Keep only the ad records of each page, then stack them into one table
    chunks_unpacked = [page['data'] for page in chunks]
    df = json_responses2df(chunks_unpacked)

    both_tables += [df]

2022-10-22 23:10:07.909 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 5

2022-10-22 23:10:35.928 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!
2022-10-22 23:10:36.005 | INFO     | __main__:get_meta_ads:20 - Calling Meta API...


Pages collected: 9

2022-10-22 23:11:35.423 | INFO     | __main__:get_meta_ads:55 - Last page reached. All done!


In [965]:
party_ads = pd.concat(both_tables)

In [966]:
party_ads

Unnamed: 0,page_id,page_name,bylines,ad_creative_bodies,ad_delivery_start_time,ad_delivery_stop_time,ad_snapshot_url,demographic_distribution,delivery_by_region,estimated_audience_size,publisher_platforms,spend,currency,impressions,id,ad_creative_link_captions,ad_creative_link_titles,ad_creative_link_descriptions
0,39233495912,Det Konservative Folkeparti,Det Konservative Folkeparti,[Vi skylder vores børn en grønnere verden! 💚\n...,2022-10-12,2022-10-17,https://www.facebook.com/ads/archive/render_ad...,"[{'percentage': '0.000113', 'age': '18-24', 'g...","[{'percentage': '0.25926', 'region': 'Central ...",{'lower_bound': '1000001'},[facebook],"{'lower_bound': '5000', 'upper_bound': '5999'}",DKK,"{'lower_bound': '150000', 'upper_bound': '1749...",612062180591821,,,
1,39233495912,Det Konservative Folkeparti,Det Konservative Folkeparti,[Det her har vi diskuteret alt for længe 👇\n\n...,2022-10-07,2022-10-14,https://www.facebook.com/ads/archive/render_ad...,"[{'percentage': '0.001011', 'age': '25-34', 'g...","[{'percentage': '0.241245', 'region': 'Central...","{'lower_bound': '500001', 'upper_bound': '1000...",[facebook],"{'lower_bound': '5000', 'upper_bound': '5999'}",DKK,"{'lower_bound': '150000', 'upper_bound': '1749...",617047266824389,,,
2,39233495912,Det Konservative Folkeparti,Det Konservative Folkeparti,[Vores ældre fortjener en værdig alderdom.\n\n...,2022-10-05,2022-10-14,https://www.facebook.com/ads/archive/render_ad...,"[{'percentage': '0.003019', 'age': '65+', 'gen...","[{'percentage': '0.244393', 'region': 'Central...","{'lower_bound': '500001', 'upper_bound': '1000...",[facebook],"{'lower_bound': '500', 'upper_bound': '599'}",DKK,"{'lower_bound': '25000', 'upper_bound': '29999'}",446322100749756,[konservative.dk],[Du fortjener en værdig alderdom efter et lang...,
3,39233495912,Det Konservative Folkeparti,Det Konservative Folkeparti,[Vores ældre fortjener en værdig alderdom.\n\n...,2022-10-05,2022-10-14,https://www.facebook.com/ads/archive/render_ad...,"[{'percentage': '0.023848', 'age': '55-64', 'g...","[{'percentage': '0.25044', 'region': 'Central ...","{'lower_bound': '500001', 'upper_bound': '1000...",[facebook],"{'lower_bound': '400', 'upper_bound': '499'}",DKK,"{'lower_bound': '20000', 'upper_bound': '24999'}",871921477117885,[konservative.dk],[Du fortjener en værdig alderdom efter et lang...,
4,39233495912,Det Konservative Folkeparti,Det Konservative Folkeparti,[Vores børn skal gå en røgfri fremtid i møde.\...,2022-10-05,2022-10-14,https://www.facebook.com/ads/archive/render_ad...,"[{'percentage': '0.004535', 'age': '65+', 'gen...","[{'percentage': '0.255952', 'region': 'Central...","{'lower_bound': '500001', 'upper_bound': '1000...",[facebook],"{'lower_bound': '0', 'upper_bound': '99'}",DKK,"{'lower_bound': '1000', 'upper_bound': '1999'}",1119192425343430,[konservative.dk],"[Der findes ingen sygdomme i Danmark, som slår...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736,74796954245,SF,,[Den danske natur er fantastisk! Men vi skal h...,2018-05-17,2018-05-23,https://www.facebook.com/ads/archive/render_ad...,"[{'percentage': '0.002937', 'age': '18-24', 'g...","[{'percentage': '0.000528', 'region': 'Unknown...",,,"{'lower_bound': '100', 'upper_bound': '199'}",DKK,"{'lower_bound': '4000', 'upper_bound': '4999'}",366735390851431,,,
737,74796954245,SF,,[Den danske natur er fantastisk. Men vi skal h...,2018-05-17,2018-05-23,https://www.facebook.com/ads/archive/render_ad...,"[{'percentage': '0.003027', 'age': '18-24', 'g...","[{'percentage': '0.000341', 'region': 'Unknown...",,,"{'lower_bound': '200', 'upper_bound': '299'}",DKK,"{'lower_bound': '3000', 'upper_bound': '3999'}",309108026436856,,,
738,74796954245,SF,,[Den danske natur er fantastisk! Men vi skal h...,2018-05-17,2018-05-23,https://www.facebook.com/ads/archive/render_ad...,"[{'percentage': '0.051916', 'age': '18-24', 'g...","[{'percentage': '0.21201', 'region': 'Central ...",,,"{'lower_bound': '0', 'upper_bound': '99'}",DKK,"{'lower_bound': '0', 'upper_bound': '999'}",2097782130298398,,,
739,74796954245,SF,,[Den danske natur er fantastisk. Men vi skal h...,2018-05-17,2018-05-23,https://www.facebook.com/ads/archive/render_ad...,"[{'percentage': '0.000985', 'age': '18-24', 'g...","[{'percentage': '0.294747', 'region': 'Central...",,,"{'lower_bound': '200', 'upper_bound': '299'}",DKK,"{'lower_bound': '5000', 'upper_bound': '5999'}",404462130101559,,,


In [850]:
# Read the 23 previously saved split files (split0.p ... split22.p) back in.
# pickle.load can execute code embedded in the file, so only load files
# produced by this project.
ad_splits = []

for i in range(23):
    with open(f'data/raw/parliament/splits/split{i}.p', mode='rb') as p:
        ad_split = pickle.load(p)

    ad_splits.append(ad_split)

In [853]:
politician_ads = pd.concat(ad_splits).reset_index(drop = True)

In [868]:
# Cast the MPs' page_id to strings, going through the nullable Int64 dtype.
# NOTE(review): rows with a missing page_id presumably become the string
# '<NA>' here — confirm. This also overwrites the column in `mps` in place.
mps['page_id'] = mps['page_id'].astype('Int64').astype(str)
# NOTE(review): this result is only displayed, not assigned back. The shown
# output already has dtype object, so this looks like a check — confirm.
politician_ads['page_id'].astype(str)

0         132364993978468
1        1719764131664489
2        1719764131664489
3        1719764131664489
4        1719764131664489
               ...       
15951     127344360615074
15952     127344360615074
15953     127344360615074
15954     673183069445618
15955     673183069445618
Name: page_id, Length: 15956, dtype: object

In [881]:
# Left join on page_id: every ad row is kept, and the columns of `mps` are
# attached wherever the page_id values match.
politician_ads_merged = politician_ads.merge(mps, on = 'page_id', how = 'left')

In [915]:
politician_ads_merged.iloc[1695]['ad_snapshot_url']

'https://www.facebook.com/ads/archive/render_ad/?id=312786937708736&access_token=EAB0JThpjjzABADZB590C8OiCR1DGXReMiztOZCJBoZCVuZA7yDKk3TEEqH86zL4zjEaiEl63nuJXJbJycdUH1dFTVU2S5otBsZA1ZBEO3q9ZBl0dj1BUYsEfezZAA0YWomq7EHGsjOk1Dkm7yeEr9cE2G0hbmYBAtHgNFeIddWIYlAZDZD'

In [997]:
# Persist the combined party ads table as a pickle file
with open('data/raw/parliament/splits/party_ads_v1.p', mode='wb') as p:
    pickle.dump(party_ads, p)