In [1]:
import yaml
import requests
import csv
import re
from tqdm import tqdm
from itertools import product

# Scraper settings (fields to query, output columns, page sizes, ...).
with open('config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# The access token must be a single string. readlines() would produce a list
# of lines that still carry their trailing newline, which is then sent to the
# API as a malformed token; read the whole file and strip whitespace instead.
with open('TOKEN.txt') as f:
    token = f.read().strip()

# The scrape loop requests search_total / page_total full pages, so the total
# has to divide evenly into pages.
assert config['search_total'] % config['page_total'] == 0, \
    "search_total should be a multiple of page_total."



In [3]:
# Show the parsed configuration to check what was loaded from config.yaml.
config

{'page_total': 10,
 'search_total': 100,
 'ad_active_status': 'ALL',
 'search_terms': '.',
 'query_fields': ['ad_creation_time',
  'ad_creative_body',
  'ad_creative_link_caption',
  'ad_creative_link_description',
  'ad_creative_link_title',
  'ad_delivery_start_time',
  'ad_delivery_stop_time',
  'ad_snapshot_url',
  'demographic_distribution',
  'funding_entity',
  'impressions',
  'page_id',
  'page_name',
  'region_distribution',
  'spend',
  'currency'],
 'output_fields': ['ad_id',
  'page_id',
  'page_name',
  'ad_creative_body',
  'ad_creative_link_caption',
  'ad_creative_link_description',
  'ad_creative_link_title',
  'ad_delivery_start_time',
  'ad_delivery_stop_time',
  'funding_entity',
  'impressions_min',
  'impressions_max',
  'spend_min',
  'spend_max',
  'ad_url',
  'currency'],
 'demo_fields': ['ad_id', 'age', 'gender', 'percentage'],
 'region_fields': ['ad_id', 'region', 'percentage'],
 'demo_ages': ['18-24', '25-34', '35-44', '45-54', '55-64', '65+'],
 'demo_gende

In [4]:
# search_page_ids is optional: treat a missing or empty (null) entry as
# "no page filter". Each id is stringified because unquoted numeric ids are
# parsed by YAML as ints and str.join() only accepts strings.
page_ids = config.get('search_page_ids') or []

# Query parameters sent with every request to the ads_archive endpoint.
params = {
    'access_token': token,
    'ad_type': 'POLITICAL_AND_ISSUE_ADS',
    'ad_reached_countries': "['NL']",
    'ad_active_status': config['ad_active_status'],
    'search_terms': config.get('search_terms'),
    'search_page_ids': ",".join(str(page_id) for page_id in page_ids),
    'fields': ",".join(config['query_fields']),
    'limit': config['page_total']
}

# Full sets of regions and (age, gender) pairs. The scrape loop subtracts the
# combinations an ad reports from these sets and writes the remainder with a
# percentage of 0.
REGIONS = set(config['regions'])
DEMOS = set(product(config['demo_ages'], config['demo_genders']))



In [9]:
def _open_csv_writer(path, fieldnames):
    """Open ``path`` for writing and return ``(file_handle, DictWriter)``.

    The file is opened with ``newline=''`` as the csv module requires (without
    it, platforms that translate line endings produce blank rows) and with an
    explicit UTF-8 encoding so non-ASCII ad text does not depend on the
    platform default. The header row is written immediately. Keys that are not
    in ``fieldnames`` are silently dropped when rows are written.

    The caller is responsible for closing the returned file handle.
    """
    handle = open(path, 'w', newline='', encoding='utf-8')
    writer = csv.DictWriter(handle, fieldnames=fieldnames,
                            extrasaction='ignore')
    writer.writeheader()
    return handle, writer


# One row per ad.
f1, w1 = _open_csv_writer('data/fb_ads.csv', config['output_fields'])

# One row per (ad, age, gender) combination.
f2, w2 = _open_csv_writer('data/fb_ads_demos.csv', config['demo_fields'])

# One row per (ad, region) combination.
f3, w3 = _open_csv_writer('data/fb_ads_regions.csv', config['region_fields'])



25

In [10]:
# Progress bar sized to the configured number of ads (search_total); the
# scrape loop advances it once per ad written.
pbar = tqdm(total=config['search_total'], smoothing=0)



  0%|          | 0/100 [00:00<?, ?it/s]

In [14]:
# Number of result pages to request. Floor division keeps this in integer
# arithmetic instead of round-tripping through a float and truncating.
config['search_total'] // config['page_total']

10

In [18]:
# Fetch the first page of results. An explicit timeout is set because
# requests otherwise waits indefinitely on a stalled connection, which would
# hang the kernel.
r = requests.get('https://graph.facebook.com/v11.0/ads_archive',
                 params=params, timeout=60)

In [19]:
# Decode the JSON body of the response into Python objects.
data = r.json()

In [31]:
# A successful response has no 'error' key, so look it up defensively instead
# of indexing it (which raises KeyError on success). Code 190 is the
# OAuthException the API returns for an invalid or expired access token.
error = data.get('error')
if error is not None and error.get('code') == 190:
    print('ERROR', error)

ERROR {'message': 'Error validating access token: Session has expired on Saturday, 17-Jul-21 16:00:00 PDT. The current time is Monday, 19-Jul-21 05:55:27 PDT.', 'type': 'OAuthException', 'code': 190, 'error_subcode': 463, 'fbtrace_id': 'A6pbxxpJ89klEEWMoNxxyJF'}


In [30]:
# Inspect the decoded API response.
data

{'error': {'message': 'Error validating access token: Session has expired on Saturday, 17-Jul-21 16:00:00 PDT. The current time is Monday, 19-Jul-21 05:55:27 PDT.',
  'type': 'OAuthException',
  'code': 190,
  'error_subcode': 463,
  'fbtrace_id': 'A6pbxxpJ89klEEWMoNxxyJF'}}

In [28]:
# Inspect the decoded API response.
data

{'error': {'message': 'Error validating access token: Session has expired on Saturday, 17-Jul-21 16:00:00 PDT. The current time is Monday, 19-Jul-21 05:55:27 PDT.',
  'type': 'OAuthException',
  'code': 190,
  'error_subcode': 463,
  'fbtrace_id': 'A6pbxxpJ89klEEWMoNxxyJF'}}

In [33]:
# Download the ads page by page and write them to the three CSV files.
#
# NOTE: the CSV handles (f1, f2, f3) and the progress bar are always closed
# when this cell finishes -- after a normal run, after an API error and after
# an exception -- so buffered rows are flushed and no handle leaks. The cells
# that create them must be re-run before this cell is executed again.
API_URL = 'https://graph.facebook.com/v11.0/ads_archive'

# Page through the results on a copy so the cursor never leaks into `params`;
# re-running this cell then starts at the first page again instead of
# resuming from the last cursor seen.
page_params = dict(params)

# Integer floor division: search_total is asserted to be a multiple of
# page_total, so no float round-trip is needed.
n_pages = config['search_total'] // config['page_total']

try:
    for _ in range(n_pages):
        # Explicit timeout: requests otherwise waits forever on a stalled
        # connection.
        r = requests.get(API_URL, params=page_params, timeout=60)
        data = r.json()

        # Successful responses carry no 'error' key, so test for its presence
        # rather than indexing it. Any API error (not only the expired-token
        # code 190) ends the scrape, since an error response has no ads.
        if 'error' in data:
            print('ERROR', data['error'])
            break

        for ad in data.get('data', []):
            # The ad_id is encoded in the ad snapshot URL
            # and cannot be accessed as a normal field. (?!?!)
            ad_id = re.search(r'\d+', ad['ad_snapshot_url']).group(0)
            ad_url = 'https://www.facebook.com/ads/library/?id=' + ad_id

            # Write one row per reported (age, gender) combination. A missing
            # distribution is treated as empty -- presumably the API omits
            # the field when it has no data for an ad; TODO confirm.
            demo_set = set()
            for demo in ad.get('demographic_distribution', []):
                w2.writerow({**demo, 'ad_id': ad_id})
                demo_set.add((demo['age'], demo['gender']))

            # Impute a percentage of 0
            # for demos with insufficient data
            for age, gender in DEMOS - demo_set:
                w2.writerow({
                    'ad_id': ad_id,
                    'age': age,
                    'gender': gender,
                    'percentage': 0
                })

            # Same scheme for the regional distribution.
            region_set = set()
            for region in ad.get('region_distribution', []):
                w3.writerow({**region, 'ad_id': ad_id})
                region_set.add(region['region'])

            # Impute a percentage of 0
            # for regions with insufficient data
            for region_name in REGIONS - region_set:
                w3.writerow({
                    'ad_id': ad_id,
                    'region': region_name,
                    'percentage': 0
                })

            # Flatten the impressions/spend ranges into min/max columns.
            # .get() leaves the CSV cell empty when a bound is absent
            # (DictWriter writes None as an empty string) instead of raising
            # KeyError -- NOTE(review): open-ended top ranges appear to come
            # without an upper_bound; confirm against the API reference.
            impressions = ad.get('impressions', {})
            spend = ad.get('spend', {})
            ad.update({'ad_id': ad_id,
                       'ad_url': ad_url,
                       'impressions_min': impressions.get('lower_bound'),
                       'impressions_max': impressions.get('upper_bound'),
                       'spend_min': spend.get('lower_bound'),
                       'spend_max': spend.get('upper_bound'),
                       })

            w1.writerow(ad)
            pbar.update()

        # If there is no cursor for a following page we have scraped all
        # the ads, so exit.
        after = data.get('paging', {}).get('cursors', {}).get('after')
        if not after:
            break

        page_params['after'] = after
finally:
    for handle in (f1, f2, f3):
        handle.close()
    pbar.close()


ERROR {'message': 'Error validating access token: Session has expired on Saturday, 17-Jul-21 16:00:00 PDT. The current time is Monday, 19-Jul-21 06:01:04 PDT.', 'type': 'OAuthException', 'code': 190, 'error_subcode': 463, 'fbtrace_id': 'AQdhKx29w-LY3V7h-HAVR4E'}
