# Analyzing OpenRTB Bids

This notebook analyzes bid request/response pairs from OpenRTB data to extract placement IDs with their corresponding bid prices from each demand source. The following fields are commonly used to identify placement IDs in OpenRTB:

1. `imp.ext.gpid` - Global Placement ID
2. `imp.tagid` - Tag ID
3. `imp.ext.data.pbadslot` - Publisher Ad Slot
4. `imp.id` - Impression ID
5. `imp.ext.dfp_id` - DFP Ad Unit Code

We'll extract these placement identifiers along with the bid prices for each demand source.

In [None]:
try:
    import pandas as pd
except ImportError:
    import sys
    !{sys.executable} -m pip install pandas

In [None]:
import json
import os
import re
from glob import glob
import urllib.parse
from src.bid_analysis import get_request_response_dataframe, extra_bids_from_response

# Build the request/response frame from the files matched by the glob
# (helper imported from src.bid_analysis).
df = get_request_response_dataframe('../data/bids/*.json')

# Impression array ('imp') of each bid request. A missing or empty request body
# yields None instead of raising, mirroring the guard used for 'body_resp' below.
df['imps'] = df['body_req'].apply(
    lambda raw: json.loads(raw).get('imp') if isinstance(raw, str) and raw else None
)

# Bids extracted from each response; a missing or empty body is parsed as an empty object.
df['bids'] = df['body_resp'].apply(
    lambda raw: extra_bids_from_response(json.loads(raw if raw and isinstance(raw, str) else '{}'))
)

# Responses with a body longer than the threshold from which no bids were extracted.
# NOTE(review): the threshold presumably skips trivially small "no content" bodies -- confirm.
MIN_RESPONSE_CHARS = 200
no_bid_df = df[
    df['bids'].isna()
    & df['body_resp'].apply(lambda raw: isinstance(raw, str) and len(raw) > MIN_RESPONSE_CHARS)
]

# Directory layout for the exported no-bid responses: one sub-directory per category.
output_root_dir = '../data/no_bids_found/'
err_output_dir = os.path.join(output_root_dir, 'errors')
sync_output_dir = os.path.join(output_root_dir, 'syncs')
nobid_output_dir = os.path.join(output_root_dir, 'no_bids')

# Create the whole tree up front; already existing directories are left untouched.
for directory in (output_root_dir, err_output_dir, sync_output_dir, nobid_output_dir):
    os.makedirs(directory, exist_ok=True)

# Write every no-bid response to '<category dir>/<requestId>.json', pretty-printed.
# The category is derived from the response's top-level 'ext' object:
#   'sync' or 'csUrl' key -> syncs, 'errors' key -> errors, anything else -> no_bids.
for _, row in no_bid_df.iterrows():
    raw_body = row['body_resp']
    body = json.loads(raw_body)
    # Serialise what was actually received (also when it is not a JSON object),
    # so the exported file never loses the payload.
    pretty_json = json.dumps(body, indent=2, ensure_ascii=False)

    # Only a JSON object can carry 'ext'; a missing, null or non-object 'ext'
    # is treated as empty, which files the response under no_bids.
    ext = body.get('ext') if isinstance(body, dict) else None
    if not isinstance(ext, dict):
        ext = {}

    if 'sync' in ext or 'csUrl' in ext:
        out_dir = sync_output_dir
    elif 'errors' in ext:
        out_dir = err_output_dir
    else:
        out_dir = nobid_output_dir

    file_path = os.path.join(out_dir, f"{row['requestId']}.json")

    with open(file_path, 'w', encoding='utf-8') as f:
        try:
            f.write(pretty_json)
        except UnicodeEncodeError:
            # With ensure_ascii=False the text may contain characters UTF-8 cannot
            # encode (e.g. lone surrogates); fall back to the raw body as received.
            f.write(raw_body)


In [None]:
def combine_imp_bid(row):
    """Pair every impression of a request with the bid that references it, if any.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an ``'imps'`` entry
            (list of impression dicts) and a ``'bids'`` entry (list of bid dicts).
            An entry that is not a list is treated as empty; list items that are
            not dicts are ignored.

    Returns:
        list[dict]: One ``{'impid': str, 'imp': dict, 'bid': dict or None}`` record
        per impression, in request order. ``'bid'`` is ``None`` when no bid refers
        to the impression. When several bids share an ``impid`` the last one wins.
    """
    imps = row['imps'] if isinstance(row['imps'], list) else []
    bids = row['bids'] if isinstance(row['bids'], list) else []

    # Index the bids twice: by the raw 'impid' and by its string form. The second
    # index lets ids match even when one side is numeric and the other a string,
    # in either direction.
    bids_by_impid = {}
    bids_by_impid_str = {}
    for bid in bids:
        if isinstance(bid, dict) and 'impid' in bid:
            bids_by_impid[bid['impid']] = bid
            bids_by_impid_str[str(bid['impid'])] = bid

    combined = []
    for imp in imps:
        if not isinstance(imp, dict):
            continue
        imp_id = imp.get('id')
        # An exact match takes precedence over the string-normalised one.
        bid = bids_by_impid.get(imp_id)
        if bid is None:
            bid = bids_by_impid_str.get(str(imp_id))
        combined.append({'impid': str(imp_id), 'imp': imp, 'bid': bid})
    return combined

# One row per (request, impression): build the imp/bid pairs for every request,
# then explode the resulting list so that each pair gets a row of its own.
df['imp_bids'] = df.apply(combine_imp_bid, axis=1)
df = df.explode('imp_bids').reset_index(drop=True)

# Unpack the pair into plain columns. Requests without impressions explode into a
# non-dict placeholder, for which every unpacked column is None.
for field in ('impid', 'imp', 'bid'):
    df[field] = df['imp_bids'].apply(
        lambda pair, key=field: pair.get(key) if isinstance(pair, dict) else None
    )

# The helper columns 'imps', 'bids' and 'imp_bids' are kept in the frame.
df

In [None]:
from src.bid_analysis import extract_placement_id, extract_demand_source_from_nurl, extract_win_notice_url

def _netloc(url):
    """Return the network location (host[:port]) of ``url``, or None if ``url`` is not a string."""
    return urllib.parse.urlparse(url).netloc if isinstance(url, str) else None


def _response_currency(raw_body):
    """Return the top-level ``cur`` field of a JSON response body, or None.

    Missing, empty or non-string bodies and JSON documents that are not objects
    yield None instead of raising, matching the tolerance applied when the bids
    were extracted from the same column.
    """
    if not isinstance(raw_body, str) or not raw_body:
        return None
    parsed = json.loads(raw_body)
    return parsed.get('cur') if isinstance(parsed, dict) else None


# Per-impression attributes (None wherever the row has no impression / no bid).
df['placement_id'] = df['imp'].apply(lambda imp: extract_placement_id(imp) if isinstance(imp, dict) else None)
df['bid_price'] = df['bid'].apply(lambda bid: bid.get('price') if isinstance(bid, dict) else None)
df['bid_currency'] = df['body_resp'].apply(_response_currency)
df['advertiser_domain'] = df['bid'].apply(
    lambda bid: ','.join(bid.get('adomain')) if isinstance(bid, dict) and bid.get('adomain') else None
)
df['demand_source'] = df['bid'].apply(lambda bid: extract_demand_source_from_nurl(bid) if isinstance(bid, dict) else None)

# Hosts of the page, the bidder endpoint and the win/loss notice URLs.
df['document_domain'] = df['documentURL'].apply(_netloc)
df['bidder_domain'] = df['url_req'].apply(_netloc)
df['nurl'] = df['bid'].apply(lambda bid: extract_win_notice_url(bid) if isinstance(bid, dict) else None)
df['nurl_domain'] = df['nurl'].apply(_netloc)
df['lurl_domain'] = df['bid'].apply(
    lambda bid: _netloc(bid.get('lurl')) if isinstance(bid, dict) and bid.get('lurl') else None
)

# Rows that actually received a bid. For a per-placement summary, group this result by
# 'placement_id' and aggregate 'bid_price' (mean / count).
df[df['bid'].notna()][['requestId', 'impid', 'bid', 'imp', 'placement_id', 'bid_price', 'bid_currency', 'advertiser_domain', 'demand_source', 'nurl_domain', 'lurl_domain', 'document_domain', 'bidder_domain']]

In [None]:
# Rows for which a win-notice URL domain could be determined.
df.loc[
    df['nurl_domain'].notna(),
    [
        'requestId', 'impid', 'bid', 'imp', 'placement_id', 'bid_price', 'bid_currency',
        'advertiser_domain', 'demand_source', 'nurl_domain', 'lurl_domain',
        'document_domain', 'bidder_domain',
    ],
]

In [None]:
# Rows whose bid, serialised to JSON, contains the quoted token "win" anywhere
# (as a key or as a string value).
df[
    df['bid'].apply(lambda bid: '"win"' in json.dumps(bid))
][
    [
        'requestId', 'impid', 'body_resp', 'bid', 'imp', 'placement_id', 'bid_price',
        'bid_currency', 'advertiser_domain', 'demand_source', 'url_req',
    ]
]

In [None]:
# Bid price statistics per (page domain, bidder domain, advertiser domain, currency),
# ordered from the highest to the lowest mean price.
(
    df
    .groupby(['document_domain', 'bidder_domain', 'advertiser_domain', 'bid_currency'])
    .agg({'bid_price': ['mean', 'max', 'min', 'count']})
    .reset_index()
    .sort_values(by=('bid_price', 'mean'), ascending=False)
)

In [None]:
import json
# Request / impression pair to inspect; the following cell reuses both names.
requestId = '23504.50'
impid = 'urldecoder-org_siderail_right_1'

# Pretty-print the bid of the first matching row. To inspect the complete response
# instead, dump json.loads() of the same row's 'body_resp' value.
print(
    json.dumps(
        df.loc[(df['requestId'] == requestId) & (df['impid'] == impid), 'bid'].iloc[0],
        indent=2,
        ensure_ascii=False,
    )
)

In [None]:
# Print the 'adm' field of the bid selected by requestId / impid (set in the previous cell).
print(
    df.loc[(df['requestId'] == requestId) & (df['impid'] == impid), 'bid']
    .iloc[0]
    .get('adm')
)