In [1]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
plt.rcParams["font.family"] = "Helvetica"

In [2]:
# (participant, ad) pairs for survey 1 live in two TSV files (a first batch and
# a "batch2" file); both are read and stacked into a single frame. The original
# per-file row index is kept as-is by the concat.
pid_adid_file = '../../db-processing/surveys/survey-1/pid_adid_survey1.tsv'
pid_adid_b2_file = '../../db-processing/surveys/survey-1/pid_adid_survey1_batch2.tsv'
pid_adid = pd.concat(
    [pd.read_csv(path, sep='\t') for path in (pid_adid_file, pid_adid_b2_file)]
)

# every participant id that appears in either batch
survey1_pids = set(pid_adid['pid'])
# pid -> number of non-null adid entries recorded for that participant
part_ad_counts = pid_adid.groupby('pid')['adid'].count().to_dict()

In [5]:
# number of distinct adid values across both survey-1 batches
len(set(pid_adid['adid']))

4173

In [3]:
# Ad annotations, keyed by adid (joined against pid_adid on 'adid' further down).
codes_file = '../../db-processing/ad_codes.tsv'
codes = pd.read_csv(codes_file, sep='\t')

# Participant demographics, one or more rows per pid.
dems_file = '../../db-processing/participant_dems.tsv'
dems = pd.read_csv(dems_file, sep='\t')

# Two-column CSVs turned into plain lookups: first column -> second column.
# presumably pid -> gender code and pid -> age value; verify against the files.
gender_file = '../../db-processing/surveys/survey-1/participant_genders.csv'
gender = pd.read_csv(gender_file)
gender = dict(gender.values)

ages_file = '../../db-processing/surveys/survey-1/participant_ages.csv'
ages = pd.read_csv(ages_file)
ages = dict(ages.values)

# Keep only survey-1 participants. The explicit .copy() makes `dems` an
# independent frame, so adding a column below cannot trigger
# SettingWithCopyWarning or write into a temporary slice.
dems = dems[dems['pid'].isin(survey1_pids)].copy()

# Attach the gender code for each participant in one vectorised lookup.
# Participants missing from `gender` get NaN; the column is kept as float so
# its dtype does not depend on whether any value is missing.
dems['gender'] = dems['pid'].map(gender).astype('float64')

In [88]:
# Education labels; presumably ordered from lowest to highest attainment -- TODO confirm
edu_vals = ['highschool', 'associate', 'some-college', 'college', 'gradschool']
# Position of 'college' in edu_vals; presumably the cutoff for "high education" -- verify against its uses
HIGHED = edu_vals.index('college')

In [3]:
def count_code_props(codes, norm=True):
    """Tally how often each code occurs across a collection of ads.

    Parameters
    ----------
    codes : dict
        Maps adid -> code string. An ad with several codes lists them
        separated by ';', and every one of them is counted, so the result
        describes the share of *codes*, not the share of ads.
    norm : bool, default True
        If True, return each code's count divided by the total number of code
        occurrences. If False, return the raw counts.

    Returns
    -------
    dict
        code -> proportion (plain dict) when ``norm`` is True; otherwise a
        ``defaultdict`` of code -> count that yields 0 for unseen codes.
        An empty ``codes`` gives an empty result in both modes.
    """
    counts = defaultdict(int)
    for aid in codes:
        # in case of multiple codes, count each one
        for code in codes[aid].split(';'):
            counts[code] += 1

    if not norm:
        return counts

    # Compute the denominator once rather than once per code.
    total = sum(counts.values())
    return {code: n / total for code, n in counts.items()}

In [107]:
# pid -> {code: proportion of that participant's codes}
part_code_props = {}
# pid -> {code: raw count for that participant}
part_code_counts = {}

# Join (pid, adid) pairs to their codes once, instead of re-filtering and
# re-merging the full frames for every participant. Only the two columns that
# are actually used are taken from `codes`.
coded_ads = pid_adid.merge(codes[['adid', 'codes_angelica']], how='inner', on='adid')

# pid -> {adid: code string}; participants with no coded ads are absent here
ad_codes_by_pid = {
    pid: dict(group[['adid', 'codes_angelica']].values)
    for pid, group in coded_ads.groupby('pid')
}

for pid in survey1_pids:
    # participants without any coded ad still get (empty) entries
    ad_codes = ad_codes_by_pid.get(pid, {})
    part_code_props[pid] = count_code_props(ad_codes, norm=True)
    part_code_counts[pid] = count_code_props(ad_codes, norm=False)

In [113]:
# Builds regression_data.csv: one row per survey-1 participant that has a
# demographics record.
# output format:
# id, p_<code> for every code in code_order, older, female, black, hispanic, asian, high_ed
# intended model:
# lm(prop. something ~ older (y/n) + female (y/n) + black (y/n) + hispanic (y/n) + asian (y/n) + high ed (y/n)

code_order = ['Benign', 'Financial', 'Healthcare', 'Opportunity',
             'Potentially Harmful', 'Potentially Prohibited', 'Clickbait', 'CA Lawsuit', 'Political']

allrows = []
for i, pid in enumerate(survey1_pids):
    row = [pid]
    # 1. add code proportions (0 when the participant never saw that code)
    for code in code_order:
        row.append(part_code_props[pid].get(code, 0))

    # 2. add dems
    demrow = dems[dems['pid'] == pid]
    if not demrow.shape[0]:
        # no demographics for this participant: report and leave them out
        print(i, pid)
        continue

    # Work with scalars from the first matching record. Calling int() on a
    # one-element Series is deprecated in recent pandas and raises outright
    # when a pid matches more than one row.
    person = demrow.iloc[0]

    # older (y/n); None when the participant has no entry in `ages`
    # NOTE(review): this is 1 when ages[pid] >= 1980. If `ages` holds birth
    # years, that marks the *younger* participants -- confirm the intended direction.
    age_val = ages.get(pid)
    row.append(None if age_val is None else int(age_val >= 1980))

    # female (y/n); code 3 and participants missing from `gender` are left empty
    gender_code = gender.get(pid)
    if gender_code is None or gender_code == 3:
        row.append(None)
    else:
        row.append(int(gender_code == 2))

    # black / hispanic / asian (y/n)
    for ethnicity in ('black', 'hispanic', 'asian'):
        row.append(int(person['ethnicity'] == ethnicity))

    # highed (y/n)
    row.append(int(person['education'] in ('college', 'gradschool')))

    allrows.append(row)

# NOTE(review): 'p_pportunity' looks like a typo for 'p_opportunity'. It is kept
# unchanged because it is the header written to the CSV and anything that
# consumes the file may refer to it by this name.
df = pd.DataFrame(allrows,
                  columns=['id', 'p_benign', 'p_financial', 'p_healthcare', 'p_pportunity',
             'p_harmful', 'p_prohibited', 'p_clickbait', 'p_lawsuit', 'p_political',
               'older', 'female', 'black', 'hispanic', 'asian', 'high_ed'])

df.to_csv('regression_data.csv', index=False)

3 606df53de4b0550a5f42fefe


### Regression file for: does targeting improve relevance?

In [2]:
# load targetings: each line is "<ad id>\t<JSON document>"; keys stay strings
ad_targetings = {}
with open('../../db-processing/ad-targetings.tsv', 'r') as fh:
    for line in fh:
        # tolerate blank lines (e.g. a trailing newline at end of file)
        if not line.strip():
            continue
        # split on the first tab only, so a tab inside the JSON payload
        # cannot break the two-way unpacking
        ad_id, targeting = line.split('\t', 1)
        targeting = json.loads(targeting.strip())
        ad_targetings[ad_id] = targeting

# load survey responses
survey_file = '../../db-processing/survey_responses.tsv'
survey = pd.read_csv(survey_file, sep='\t')

In [3]:
# ad id -> owner name of the custom audience used for that ad,
# taken from the element's 'ca_owner_name' field (e.g. 'ca_owner_name': 'Klarna')
ca_advertisers = {}

def get_targeting(ut, aid):
    """Condense one ad's targeting payload into a small summary dict.

    ut  : parsed targeting document; the entries under
          ut['data']['waist_targeting_data'] are inspected one by one.
    aid : ad id, stored under 'id' in the result.

    Returns a dict that always has 'id' and, depending on which entry types
    are present, 'interests' (set of interest names), 'custom' (True),
    'location' ({'loc', 'gran'}) and 'age-gender' ({'age_min', 'age_max',
    'gender'}). Entries of any other type are ignored.

    Side effect: for custom-audience entries the audience owner's name is
    recorded in the module-level ca_advertisers dict under `aid`.
    """
    summary = {'id': aid}
    for entry in ut['data']['waist_targeting_data']:
        kind = entry['__typename']

        if kind == 'WAISTUIInterestsType':
            summary['interests'] = {interest['name'] for interest in entry['interests']}
            continue

        if kind == 'WAISTUICustomAudienceType':
            ca_advertisers[aid] = entry['dfca_data']['ca_owner_name']
            summary['custom'] = True
            continue

        if kind == 'WAISTUILocationType':
            # granularity is nested inside a JSON string on the entry
            granularity = json.loads(entry['serialized_data'])['location_granularity']
            place = entry['location_name']
            summary['location'] = {'loc': place, 'gran': granularity}
            continue

        if kind == 'WAISTUIAgeGenderType':
            summary['age-gender'] = {
                field: entry[field] for field in ('age_min', 'age_max', 'gender')
            }

    return summary

In [13]:
# prepare regression file for targeting vs. relevance regression:
# one row per survey response whose ad carries location targeting, with the
# relevance rating, number of targeted interests, a custom-audience flag and
# one 0/1 flag per location granularity.
allrows = []

# Iterate over the three needed columns directly; materialising
# survey.iloc[i] three times per response is far slower and adds nothing.
for pid, adid, raw_relevance in zip(survey['pid'], survey['adid'], survey['relevance']):
    relevance = int(raw_relevance)
    # ad_targetings is keyed by the ad id as a string.
    # `tar` is kept as a top-level name: it is displayed in a later cell.
    tar = get_targeting(ad_targetings[str(adid)], adid)

    # responses for ads without location targeting are left out of the file
    if 'location' not in tar:
        continue

    interests = len(tar['interests']) if 'interests' in tar else 0
    custom = int('custom' in tar)
    # 1 when the granularity name occurs in the ad's location granularity string
    loc = [int(level in tar['location']['gran']) for level in ['city', 'region', 'country']]

    regrow = [pid, adid, relevance, interests, custom] + loc
    allrows.append(regrow)

df = pd.DataFrame(allrows,
              columns=['pid', 'adid', 'relevance', 'n_interests', 'custom', 'city', 'region', 'country'])

df.to_csv('relevance_v_targeting.csv', index=False)

In [6]:
tar

{'id': 23849374805000583,
 'age-gender': {'age_min': 6, 'age_max': 43, 'gender': 'ANY'},
 'location': {'loc': 'the United States', 'gran': 'country'}}