In [60]:
import os
import csv
import json
from io import StringIO
from zipfile import ZipFile
from os.path import join
from collections import defaultdict, Counter
from glob import glob
from heapq import nsmallest, nlargest

In [50]:
REPORTS = '../data/facebook/reports'
NULL_DISCLAIMER = 'These ads ran without a disclaimer'

COUNTRIES = [
    'AT',
    'BE',
    'BG',
    'CY',
    'CZ',
    'DE',
    'DK',
    'EE',
    'ES',
    'FI',
    'FR',
    'GB',
    'GR',
    'HR',
    'HU',
    'IE',
    'IT',
    'LT',
    'LU',
    'LV',
    'MT',
    'NL',
    'PL',
    'PT',
    'RO',
    'SE',
    'SI',
    'SK',
    'US'
]

CURRENCY_RATES = {
    'EUR': 1,
    'USD': 0.88,
    'CZK': 0.039,
    'DKK': 0.134,
    'GBP': 1.12,
    'HUF': 0.0031,
    'PLN': 0.235,
    'RON': 0.21,
    'SEK': 0.095
}

In [54]:
# Finding unique advertisers and the country they operate in
ADVERTISERS = {}

for country in COUNTRIES:
    folder = join(REPORTS, country)
    
    last_date = sorted(d for d, _, _, in os.walk(folder))[-1].split('/')[-1]
    zip_path = glob(join(REPORTS, country, last_date, '*_lifelong.zip'))[0]

    with ZipFile(zip_path) as zipfile:
        csv_entry = next(e for e in zipfile.infolist() if e.filename.endswith('.csv'))
        
        with zipfile.open(csv_entry.filename) as csv_f:
            csv_io = StringIO(csv_f.read().decode('utf-8-sig'))
            reader = csv.DictReader(csv_io)
            currency_field = next(f for f in reader.fieldnames if f.startswith('Amount Spent'))
            currency = currency_field.split('(')[-1].split(')')[0]
            
            for line in reader:
                disclaimer = line['Disclaimer']
                page_name = line['Page Name']
                page_id = line.get('Page ID')
                spent = int(line[currency_field].replace('≤', ''))
                ads = int(line['Number of Ads in Library'].replace('≤', ''))
                
                has_disclaimer = disclaimer != NULL_DISCLAIMER
                advertiser = disclaimer if has_disclaimer else page_name
                
                record = ADVERTISERS.get(advertiser)

                if record is None:
                    record = {
                        'disclaimer': disclaimer,
                        'page_name': page_name,
                        'page_id': page_id,
                        'advertiser': advertiser,
                        'advertiser_kind': 'disclaimer' if has_disclaimer else 'page',
                        'countries': set(),
                        'spent': Counter(),
                        'spent_euro': Counter(),
                        'ads': Counter()
                    }
                    ADVERTISERS[advertiser] = record
                    
                record['countries'].add(country)
                record['spent'][country] += spent
                record['ads'][country] += ads
                record['spent_euro'][country] += spent * CURRENCY_RATES[currency]

In [59]:
class CustomJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
        
        return json.JSONEncoder.default(self, obj)

with open('../international_facebook_advertising.json', 'w') as jsonf:
    json.dump(ADVERTISERS, jsonf, ensure_ascii=False, cls=CustomJSONEncoder)

In [82]:
def sum_spent(a):
    return sum(a['spent_euro'].values())

def sum_ads(a):
    return sum(a['ads'].values())

top = nlargest(100, (a for a in ADVERTISERS.values() if len(a['countries']) > 1), key=sum_spent)

for a in top:
    print(a['advertiser'], sum_spent(a), len(a['countries']))
    a['sum_spent'] = sum_spent(a)
    a['sum_ads'] = sum_ads(a)
    a['nb_countries'] = len(a['countries'])
#    for c in a['countries']:
#        print('  %s' % c)

with open('../top-advertisers.json', 'w') as jsonf:
    json.dump(top, jsonf, ensure_ascii=False, cls=CustomJSONEncoder, indent=2)

Facebook 10964686.29 11
Care2 4573409.52 2
International Rescue Committee 3448097.32 7
European Parliament 3169695.9619 28
Ben & Jerry's 1032531.2799999999 4
USA for UNHCR 975856.48 8
Friends of the Earth 767723.36 2
TOMS 520587.12 3
Wholesome Culture 411828.41 27
Patagonia 395254.32 2
the Liberal Democrats 347292.93 18
The Labour Party 307466.07999999996 2
Kialo 269836.01 29
SPÖ 184029 2
European Greens 164995.26 27
Seventh Generation 158535.075 2
Giving Brush LLC 155713.92 2
Uniunea Salvați România - USR 127391.70599999998 11
Centerpartiet 126970.665 2
Avaaz 120383.48 6
Audible 120183.6 2
European Commission 113961.78 27
The Years Project 88072.03 21
MasterClass 76189.05 28
Kristdemokraterna 72596.94000000002 2
Clone Evolution 55043.36 2
Momentum Mozgalom 53657.31740000001 2
Unilever 52412.0 3
Parti Socialiste (PS) 52118 2
Grüner Fisher Investments 51014 2
Campact e.V. 49700 2
Sperry 49355.6 4
Giving Brush 41406.31 21
Doha Debates 40992.4 3
EPP - European People's Party 40874.37 27
P

In [100]:
PAIRS = Counter()

for a in ADVERTISERS.values():
    countries = list(a['countries'])
    
    if len(countries) < 2:
        continue

    for i in range(len(countries)):
        A = countries[i]
        for j in range(i + 1, len(countries)):
            B = countries[j]
            
            if A > B:
                A, B = B, A
            
            PAIRS[(A, B)] += 1
            
for pair, count in PAIRS.most_common(15):
    print(pair, count)

('AT', 'NL') 5682
('AT', 'GR') 5635
('AT', 'IE') 5595
('AT', 'CZ') 5472
('AT', 'LU') 5193
('BE', 'BG') 2659
('BE', 'ES') 2553
('BE', 'IT') 2478
('BE', 'HU') 2457
('AT', 'BE') 2439
('BE', 'SE') 2433
('BE', 'PL') 2430
('BE', 'RO') 2415
('BE', 'PT') 2414
('BE', 'MT') 2371


In [101]:
parliament = ADVERTISERS['European Parliament']
parliament['spent_euro'].most_common(10)

[('DE', 706045),
 ('FR', 579401),
 ('IT', 370328),
 ('ES', 281531),
 ('BE', 162253),
 ('NL', 153432),
 ('GB', 110867.68000000002),
 ('FI', 110262),
 ('AT', 98155),
 ('GR', 83411)]