In [55]:
import json
import pandas as pd
from collections import defaultdict
from io import StringIO
from zip_archive import ZipArchive

In [56]:
# Relative path to the zipped Google political-ads transparency bundle.
DATASET = "../data/google-political-ads-transparency-bundle.zip"
# Module-level ZipArchive handle opened on that same bundle.
ARCHIVE = ZipArchive(DATASET)

In [85]:
class GoogleDataset(object):
    """Accessor for the CSV tables inside the Google political-ads bundle.

    The class constants are the member paths of the individual CSV files
    inside the zip archive. ``filters`` maps an output label to a search term
    that is matched against the ``Advertiser_Name`` column.
    """

    GEO_SPEND = "google-political-ads-transparency-bundle/google-political-ads-geo-spend.csv"
    ADVERTISER_STATS = "google-political-ads-transparency-bundle/google-political-ads-advertiser-stats.csv"
    WEEKLY_SPEND = "google-political-ads-transparency-bundle/google-political-ads-advertiser-weekly-spend.csv"
    ADS_STATS = "google-political-ads-transparency-bundle/google-political-ads-creative-stats.csv"
    CAMP_TARGETS = "google-political-ads-transparency-bundle/google-political-ads-campaign-targeting.csv"

    def __init__(self, dataset_filepath, filters):
        """Open the bundle and remember the advertiser filters.

        Args:
            dataset_filepath: path of the zip bundle, handed to ``ZipArchive``.
            filters: mapping ``{label: term}``; ``term`` is passed to
                ``Series.str.contains`` and is therefore treated as a regular
                expression (pandas default).
        """
        self._archive = ZipArchive(dataset_filepath)
        self._filters = filters

    def _load_dataframe(self, ds):
        """Return the archive member ``ds`` parsed as a CSV DataFrame.

        Reads through this instance's own archive rather than a notebook
        global, so the ``dataset_filepath`` given to ``__init__`` is honoured.
        Assumes ``self._archive.get`` returns the member as text (it is fed
        to ``StringIO``) -- TODO confirm against ``zip_archive``.
        """
        data = self._archive.get(ds)
        return pd.read_csv(StringIO(data))

    @staticmethod
    def _to_records(df):
        """Convert a DataFrame to a list of plain dicts via a JSON round-trip.

        The round-trip (instead of ``DataFrame.to_dict``) is kept on purpose:
        it turns NaN into ``None`` and numpy scalars into builtin numbers.
        """
        return json.loads(df.to_json(orient='records'))

    @staticmethod
    def _rows_matching(df, term):
        """Return the rows of ``df`` whose ``Advertiser_Name`` matches ``term``.

        ``na=False`` makes rows with a missing name count as non-matching;
        without it the mask contains NaN and boolean indexing raises.
        """
        return df[df['Advertiser_Name'].str.contains(term, na=False)]

    def advertisers(self, region):
        """List advertiser names whose ``Regions`` value contains ``region``.

        Args:
            region: pattern searched for in the ``Regions`` column
                (regular-expression semantics, pandas default).

        Returns:
            A list of ``Advertiser_Name`` values in table order.
        """
        stats_df = self._load_dataframe(self.ADVERTISER_STATS)
        # Rows without a region are treated as not matching.
        in_region = stats_df['Regions'].str.contains(region, na=False)
        return list(stats_df.loc[in_region, 'Advertiser_Name'])

    def generate_dataset(self):
        """Build a per-label summary for every configured filter.

        Returns:
            ``{label: {'advertisers': [...], 'total_spending': number,
            'weekly_spending': {week_start: number}, 'ads': [record, ...]}}``
            where ``advertisers`` are the distinct names found in the creative
            stats, ``total_spending`` sums ``Spend_EUR`` from the advertiser
            stats, and ``weekly_spending`` sums ``Spend_EUR`` per
            ``Week_Start_Date`` across all matching advertisers.
        """
        ads_df = self._load_dataframe(self.ADS_STATS)
        stats_df = self._load_dataframe(self.ADVERTISER_STATS)
        weekly_df = self._load_dataframe(self.WEEKLY_SPEND)

        dataset = {}

        for label, term in self._filters.items():
            ads = self._to_records(self._rows_matching(ads_df, term))
            advertisers = [ad['Advertiser_Name'] for ad in ads]

            stats = self._to_records(self._rows_matching(stats_df, term))
            # Missing spend values arrive as None after the JSON round-trip;
            # count them as 0 instead of failing the whole sum.
            total_spending = sum(row['Spend_EUR'] or 0 for row in stats)

            # Several advertisers can match one term, so weeks are accumulated
            # rather than overwritten.
            weekly_spending = defaultdict(int)
            weekly = self._to_records(self._rows_matching(weekly_df, term))
            for week in weekly:
                weekly_spending[week['Week_Start_Date']] += week['Spend_EUR'] or 0

            dataset[label] = {
                'advertisers': list(set(advertisers)),
                'total_spending': total_spending,
                'weekly_spending': dict(weekly_spending),
                'ads': ads
            }
            print(label, term)
        return dataset
    

In [86]:
# Filters map an output label to the search term matched against Advertiser_Name.
gd = GoogleDataset(DATASET, { 'FPÖ': 'Freih', 'SPÖ': 'SPÖ' })

In [90]:
# Advertiser names whose Regions value contains 'FI'.
gd.advertisers('FI')

['Janika Takatalo',
 'Vapaus ry',
 'Uusmaalaisten puolesta ry',
 'Arnora Oy',
 'Saku Petteri Räty',
 'Varsinais-Suomen Kokoomus ry',
 'Kansallinen Kokoomus r.p.',
 'Kankaanpään Työväenyhdistys ry',
 'Maailman Paras Pääkaupunki ry',
 'Jukka Kopra',
 'Yhteinen hyvä ry',
 'Hämeen Uusi Suunta ry',
 'Teknologia ja tulevaisuus ry',
 'Savo-Karjalan vihreät ry',
 'Rumilus Design Oy',
 'Tampereen Vihreät ry',
 'Petteri Rädyn tukiyhdistys ry',
 'Suomen Keskusta r.p.',
 'Liberaalipuolue - Vapaus valita r.p.',
 'Suomen Sosialidemokraattinen Puolue - Finlands Socialdemokratiska  Parti r.p.',
 'Perussuomalaiset r.p., Sannfinländarna r.p.',
 'Pirkanmaan Kokoomus ry',
 'Työlinjalla ry']

In [81]:
# Build the per-label summary dict; each (label, term) pair is printed as it is processed.
ds = gd.generate_dataset()

FPÖ Freih
SPÖ SPÖ


In [None]:
# Labels available in the generated dataset (the original cell was an
# unfinished `ds.` expression, which is a SyntaxError when the notebook is re-run).
ds.keys()

In [63]:
# Inspect the 'SPÖ' entry: advertisers, total_spending, weekly_spending and ads.
ds['SPÖ']

{'advertisers': ['SPÖ-Landesorganisation Burgenland',
  'Sozialdemokratische Partei Österreichs (SPÖ)'],
 'total_spending': 5100,
 'weekly_spending': {'2019-04-14': 1200,
  '2019-04-21': 2050,
  '2019-03-17': 0,
  '2019-03-31': 450,
  '2019-04-07': 650},
 'ads': [{'Ad_ID': 'CR219787426590097408',
   'Ad_URL': 'https://transparencyreport.google.com/political-ads/library/advertiser/AR327974972716744704/creative/CR219787426590097408',
   'Ad_Type': 'Video',
   'Regions': 'AT, EU',
   'Advertiser_ID': 'AR327974972716744704',
   'Advertiser_Name': 'Sozialdemokratische Partei Österreichs (SPÖ)',
   'Ad_Campaigns_List': 'CA446616675400286208',
   'Date_Range_Start': '2019-04-19',
   'Date_Range_End': '2019-04-29',
   'Num_of_Days': 11,
   'Impressions': '100k-1M',
   'Spend_USD': None,
   'Spend_Range_Min_USD': 1000,
   'Spend_Range_Max_USD': 50000.0,
   'Spend_Range_Min_EUR': 500,
   'Spend_Range_Max_EUR': 30000.0,
   'Spend_Range_Min_INR': 2500,
   'Spend_Range_Max_INR': 125000.0,
   'Spend

In [64]:
# Load the advertiser-stats CSV into a DataFrame for ad-hoc inspection.
df = gd._load_dataframe(gd.ADVERTISER_STATS)

In [74]:
# Number of ad records collected under the 'FPÖ' label.
len(ds['FPÖ']['ads'])

51

In [75]:
# Number of ad records collected under the 'SPÖ' label.
len(ds['SPÖ']['ads'])

48

In [72]:
# Sum of Num_of_Days over every ad record under the 'FPÖ' label.
sum(ad['Num_of_Days'] for ad in ds['FPÖ']['ads'])

391

In [73]:
# Sum of Num_of_Days over every ad record under the 'SPÖ' label.
sum(ad['Num_of_Days'] for ad in ds['SPÖ']['ads'])

505

In [84]:
# Load the advertiser-stats table and show its column names.
df = gd._load_dataframe(gd.ADVERTISER_STATS)
df.columns

Index(['Advertiser_ID', 'Advertiser_Name', 'Public_IDs_List', 'Regions',
       'Elections', 'Total_Creatives', 'Spend_USD', 'Spend_EUR', 'Spend_INR',
       'Spend_BGN', 'Spend_HRK', 'Spend_CZK', 'Spend_DKK', 'Spend_HUF',
       'Spend_PLN', 'Spend_RON', 'Spend_SEK', 'Spend_GBP'],
      dtype='object')

In [None]:
# Rows whose advertiser name contains the search term. String matching lives on
# the `.str` accessor and needs a pattern; `na=False` keeps rows with a missing
# name out of the mask. NOTE(review): 'SPÖ' is a placeholder term taken from the
# filters used earlier -- replace it with the name you are looking for.
df[df['Advertiser_Name'].str.contains('SPÖ', na=False)]