In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
import os
%matplotlib inline

plt.rcParams['font.family'] = 'serif'
plt.rcParams["font.serif"] = ["Nimbus Roman"]

In [38]:
codes_file = '../data/ad_codes_transformed.tsv'
codes = pd.read_csv(codes_file, sep='\t')
adid_codes = dict(zip(codes['adid'], [r.split(';') for r in codes['codes']]))

In [39]:
targeting_params = pd.read_csv('/home/piotr/targeting_params.csv', low_memory=False)
# get rid of the Nigerian and Indian ads
targeting_params = targeting_params.loc[targeting_params['location'] != "{'serialized': {'location_granularity': 'country', 'location_geo_type': 'home', 'location_code': 'NG'}, 'location_name': 'Nigeria', 'location_type': 'HOME'}"]
targeting_params = targeting_params.loc[targeting_params['location'] != "{'serialized': {'location_granularity': 'country', 'location_geo_type': 'home', 'location_code': 'IN'}, 'location_name': 'India', 'location_type': 'HOME'}"]

In [40]:
maus = {row['adid']: row['estimate_mau'] for idx, row in targeting_params.iterrows()}


In [41]:
data = []
for adid, adcodes in adid_codes.items():
    row = {code: 1 for code in adcodes}
    row['adid'] = adid
    try:
        row['estimate_mau'] = maus[adid]
    except:
        row['estimate_mau'] = float('nan')
    data.append(row)
        
data = pd.DataFrame(data)

In [42]:
combined = targeting_params.set_index('adid').join(data.set_index('adid'), how='inner', lsuffix='l')

In [43]:


combined['age_min_explicit'] = combined['age_min'].map(lambda x: 6 if pd.isna(x) else x)
combined['age_max_explicit'] = combined['age_max'].map(lambda x: 54 if pd.isna(x) else x)

# Are ads with neutral targeting skewed?

## Find all the ads that used default targeting

In [44]:
# count all defaults as fraction of labeled
labeled = ((combined['Benign']==1) | (combined['Clickbait']==1) | (combined['Financial']==1) | \
                  (combined['Opportunity']==1) | (combined['Healthcare']==1) | (combined['Sensitive']==1) | \
                  (combined['Potentially Prohibited']==1) | (combined['Potentially Harmful']==1))

labeled_count = labeled.sum()


combined['default'] = (combined['age_min_explicit']==6)\
                & (combined['age_max_explicit']==54)\
                & ((combined['WAISTUILocaleType'].isin(["{'locales': [24, 6]}","{'locales': [6]}"])) \
                       | combined['WAISTUILocaleType'].isna()) \
                & combined['WAISTUICustomAudienceType'].isna() \
                & combined['WAISTUIBCTType'].isna() \
                & combined['WAISTUIEduStatusType'].isna() \
                & combined['WAISTUIDPAType'].isna() \
                & combined['WAISTUIActionableInsightsType'].isna() \
                & combined['WAISTUIFriendsOfConnectionType'].isna()\
                & combined['WAISTUIRelationshipType'].isna() \
                & combined['WAISTUIWorkEmployerType'].isna() \
                & combined['WAISTUILocalReachType'].isna() \
                & combined['WAISTUIEduSchoolsType'].isna() \
                & combined['WAISTUIConnectionType'].isna() \
                & combined['WAISTUICollaborativeAdsStoreVisitsType'].isna() \
                & combined['WAISTUIJobTitleType'].isna() \
                & combined['WAISTUIInterestsType'].isna() \
                & combined['gender'].isna() \
                & labeled
                

combined['default'].sum()/labeled_count

0.22405643225947203

In [103]:
combined['default']

default_ads = pd.DataFrame(combined.loc[combined['default'], 
                           ['Benign', 'Clickbait', 'Financial', 'Opportunity', 'Healthcare',
                           'Sensitive', 'Potentially Prohibited', 'Potentially Harmful']])

## Get user attributes


In [46]:
users = pd.read_csv('../data/ALLDEMS.csv')
users.drop_duplicates(keep=False, inplace=True)
users.set_index('pid', inplace=True)

## Match ads to users¶


In [47]:
frequencies = pd.read_csv('../data/participant_ad_freqs.tsv', delimiter='\t')
freqs = defaultdict(set)
for idx, row in frequencies.iterrows():
    freqs[row['adid']].add(row['pid'])

In [48]:
default_ads.fillna(0, inplace=True)

## put it all together¶


In [49]:
regression_table = []
missing_users = set()
for idx, row in default_ads.iterrows():
    for user in freqs[str(idx)]:
        try:
            temp = dict(row)
            urow = users.loc[user.strip()]
            pid = urow.name
            temp.update(dict(users.loc[user.strip()]))
            temp['pid'] = pid
            regression_table.append(temp)
        except KeyError:
            missing_users.add(user.strip())
            
regression_table = pd.DataFrame(regression_table)
pids = regression_table['pid'].unique()

In [50]:
# set numerical pids for mixed effects
regression_table['pid'] = regression_table['pid'].map(lambda x: np.where(pids==x)[0][0])


In [53]:
regression_table

Unnamed: 0,Benign,Clickbait,Financial,Opportunity,Healthcare,Sensitive,Potentially Prohibited,Potentially Harmful,woman,man,black,white,hispanic,asian,older,high_ed,high_income,pid,harmful,prohibited
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0,0,1,0,0,0,0,1,0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,1,0,0,1,0,0,1,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1,0,0,0,0,0,0,2,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,1,1,0,0,1,0,3,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,1,0,0,0,0,1,4,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8990,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,1,0,0,174,0.0,0.0
8991,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,1,0,0,1,0,1,119,0.0,0.0
8992,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,1,0,0,174,0.0,0.0
8993,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,1,0,0,1,1,0,105,0.0,0.0


In [51]:
import statsmodels.formula.api as smf
variables = ['woman', 'black',  'asian', 'hispanic','older', 'high_ed']
regression_table['harmful'] = regression_table['Potentially Harmful']
regression_table['prohibited'] = regression_table['Potentially Prohibited']

targets = ['Healthcare','Opportunity','Sensitive', 'Financial', 
                          'Clickbait',  'harmful',  'prohibited',]

for target in targets:
    print(f"{target} ~ {' + '.join(variables)}")
    model_logit = smf.logit(formula=f"{target} ~ {' + '.join(variables)} + (1|pid)", 
                            data=regression_table)
    res = model_logit.fit()
    print(res.summary())
    
    
    

Healthcare ~ woman + black + asian + hispanic + older + high_ed
Optimization terminated successfully.
         Current function value: 0.322498
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:             Healthcare   No. Observations:                 8995
Model:                          Logit   Df Residuals:                     8987
Method:                           MLE   Df Model:                            7
Date:                Wed, 01 Feb 2023   Pseudo R-squ.:                0.006083
Time:                        18:06:14   Log-Likelihood:                -2900.9
converged:                       True   LL-Null:                       -2918.6
Covariance Type:            nonrobust   LLR p-value:                 8.979e-06
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -1.7601      0.112    -15.758 

### Export aggregate counts for default ads to R
Ingredients: `default_ads`, `codes`, `frequencies`

In [68]:
active_pids = open('../data/pid_active_contribs_cleaned.txt', 'r').readlines()
active_pids = [p.strip() for p in active_pids]
frequencies = frequencies[frequencies['pid'].isin(active_pids)]

In [86]:
codes_ind = codes.set_index('adid')
default_ads_ids = set(default_ads.index)

# drop None rows and convert adid to int for join
frequencies = frequencies[frequencies.adid.str.isnumeric()]    
frequencies['adid'] = frequencies.adid.astype('int64')

In [97]:
export = frequencies.merge(codes_ind, how='inner', on='adid')
# only keep ads that were default targeted
export = export[export['adid'].isin(default_ads_ids)]

# NOTE: export actually has more rows than len(default_ads_ids) because of (pid, adid) repeats

In [128]:
# export pid, adid, total_default, sensitive_default, clickbait_default, ...
tocount = ['Benign', 'Healthcare', 'Opportunity', 'Sensitive', 'Financial', 'Clickbait', 'Potentially Prohibited', 'Potentially Harmful']

export_df = []
for pid in active_pids:
    # pid, default_total
    filerow = [pid, export[export['pid'] == pid].shape[0]]
    # counts for all other default targeted ads
    for c in tocount:
        count = export[(export['pid'] == pid) & (export['codes'] == c)]['frequency'].sum()
        filerow.append(count)
        
    export_df.append(filerow)
        
header = ['pid', 'total_default',
          'neutral_default', 'healthcare_default', 'opportunity_default',
          'sensitive_default', 'financial_default', 'clickbait_default',
          'prohibited_default', 'deceptive_default']
export_df = pd.DataFrame(export_df, columns=header)
export_df.to_csv('../data/default_targeting_regression_counts.csv', index=False)

In [129]:
export_df.head()

Unnamed: 0,pid,total_default,neutral_default,healthcare_default,opportunity_default,sensitive_default,financial_default,clickbait_default,prohibited_default,deceptive_default
0,978584,56,44,7,3,0,2,1,5,0
1,950668,347,1021,129,73,0,108,15,4,5
2,5dee72917309b950e5262f3a,82,95,17,0,1,2,1,0,0
3,5f3d6975ef131710ed519c7a,1,0,0,1,0,0,0,0,0
4,131788,19,19,2,1,1,1,1,0,0
