In [55]:
import json
import pandas as pd
from collections import defaultdict
from io import StringIO
from zip_archive import ZipArchive

In [56]:
# Relative path to the zipped Google political-ads transparency bundle.
DATASET = "../data/google-political-ads-transparency-bundle.zip"
# Archive handle opened once at module level for the bundle above.
ARCHIVE = ZipArchive(DATASET)

In [85]:
class GoogleDataset(object):
    """Read-only view over the Google political-ads transparency bundle.

    The bundle is a zip archive of CSV files. Each query method loads the
    CSV members it needs into pandas DataFrames and filters them by
    advertiser name or region.

    Parameters
    ----------
    dataset_filepath : str
        Path to the zip bundle; opened with ``ZipArchive``.
    filters : dict
        Mapping of ``label -> search term``. Each term is matched against
        the ``Advertiser_Name`` column with ``Series.str.contains`` (so it
        is interpreted as a regular expression, pandas' default).
    """

    # Archive member paths of the individual CSV files inside the bundle.
    GEO_SPEND = "google-political-ads-transparency-bundle/google-political-ads-geo-spend.csv"
    ADVERTISER_STATS = "google-political-ads-transparency-bundle/google-political-ads-advertiser-stats.csv"
    WEEKLY_SPEND = "google-political-ads-transparency-bundle/google-political-ads-advertiser-weekly-spend.csv"
    ADS_STATS = "google-political-ads-transparency-bundle/google-political-ads-creative-stats.csv"
    CAMP_TARGETS = "google-political-ads-transparency-bundle/google-political-ads-campaign-targeting.csv"

    def __init__(self, dataset_filepath, filters):
        self._archive = ZipArchive(dataset_filepath)
        self._filters = filters

    def _load_dataframe(self, ds):
        """Load the archive member ``ds`` (a CSV path) as a DataFrame."""
        # Read from this instance's archive, not a module-level global, so
        # the path handed to __init__ is the one actually used.
        data = self._archive.get(ds)
        return pd.read_csv(StringIO(data))

    @staticmethod
    def _rows_for_advertiser(df, term):
        """Return the rows of ``df`` whose ``Advertiser_Name`` contains ``term``."""
        # na=False: rows with a missing advertiser name simply do not match
        # instead of producing an invalid (NaN-containing) boolean mask.
        return df[df['Advertiser_Name'].str.contains(term, na=False)]

    @staticmethod
    def _to_records(df):
        """Convert ``df`` into a list of plain, JSON-native dicts (one per row)."""
        # Round-tripping through JSON strips numpy scalar types so the
        # result can be serialized again without custom encoders.
        return json.loads(df.to_json(orient='records'))

    def advertisers(self, region):
        """Return the names of advertisers whose ``Regions`` value contains ``region``.

        Names are returned in file order, one per matching row, so an
        advertiser appearing in several rows is listed several times.
        """
        stats_df = self._load_dataframe(self.ADVERTISER_STATS)
        in_region = stats_df['Regions'].str.contains(region, na=False)
        return list(stats_df.loc[in_region, 'Advertiser_Name'])

    def generate_dataset(self):
        """Aggregate ads and spending for every configured filter.

        Returns
        -------
        dict
            ``label -> {'advertisers', 'total_spending', 'weekly_spending',
            'ads'}`` where ``advertisers`` is the de-duplicated list of
            advertiser names found in the creative stats (first-seen
            order), ``total_spending`` is the sum of ``Spend_EUR`` over the
            matching advertiser-stats rows, ``weekly_spending`` maps
            ``Week_Start_Date`` to summed ``Spend_EUR``, and ``ads`` is the
            list of matching creative-stats rows as dicts.
        """
        ads_df = self._load_dataframe(self.ADS_STATS)
        stats_df = self._load_dataframe(self.ADVERTISER_STATS)
        weekly_df = self._load_dataframe(self.WEEKLY_SPEND)

        dataset = {}

        for label, term in self._filters.items():
            ads = self._to_records(self._rows_for_advertiser(ads_df, term))
            stats = self._to_records(self._rows_for_advertiser(stats_df, term))
            weekly = self._to_records(self._rows_for_advertiser(weekly_df, term))

            # Several advertisers can match one term, so spends for the
            # same week are accumulated rather than overwritten.
            # Missing spend values arrive as None after the JSON
            # round-trip; count them as 0 instead of crashing the sum.
            weekly_spending = defaultdict(int)
            for week in weekly:
                weekly_spending[week['Week_Start_Date']] += week['Spend_EUR'] or 0

            dataset[label] = {
                # dict.fromkeys de-duplicates while keeping a stable order.
                'advertisers': list(dict.fromkeys(ad['Advertiser_Name'] for ad in ads)),
                'total_spending': sum(row['Spend_EUR'] or 0 for row in stats),
                'weekly_spending': dict(weekly_spending),
                'ads': ads,
            }
            print(label, term)
        return dataset
    

In [86]:
# Build the dataset helper; the dict maps a party label to the search term
# matched against advertiser names.
gd = GoogleDataset(DATASET, { 'FPÖ': 'Freih', 'SPÖ': 'SPÖ' })

In [91]:
# List the advertiser names whose Regions value contains 'AT'.
gd.advertisers('AT')

['Sozialdemokratische Partei Österreichs (SPÖ)',
 'SPÖ-Landesorganisation Burgenland',
 'Sozialdemokratische Partei Österreichs (SPÖ)',
 'Freiheitliche Partei Österreich']