In [16]:
# 01 - Imports & path helpers
from pathlib import Path
import json, itertools
import pandas as pd

DATA_DIR = Path("..") / "data" / "raw"

# Every snapshot matching full_*.ndjson, in ascending path order.
# NOTE(review): picking the last one as "latest" assumes the timestamp embedded
# in the filename sorts lexicographically in chronological order — confirm
# against the code that writes these files.
RAW_PULLS = sorted(DATA_DIR.glob("full_*.ndjson"))
if not RAW_PULLS:
    # Fail with an actionable message instead of a bare IndexError from [-1].
    raise FileNotFoundError(
        f"no full_*.ndjson files found in {DATA_DIR.resolve()}"
    )
RAW = RAW_PULLS[-1]   # latest pull
RAW

PosixPath('../data/raw/full_2025-05-26_1136.ndjson')

In [17]:
# 02 - Peek at first 3 lines
# Read at most three records. islice stops quietly on a short file, whereas
# next(fh) inside a list comprehension would raise StopIteration.
# Decoding is explicit so the result does not depend on the platform's default
# encoding (assumes the pull was written as UTF-8 — TODO confirm).
with RAW.open(encoding="utf-8") as fh:
    sample = [json.loads(line) for line in itertools.islice(fh, 3)]
sample[0]


{'itemId': 'v1|235398389941|536188920244',
 'title': 'Apple iPhone 12, 64/128/256GB -  Unlocked - Used Good - All colors',
 'itemGroupHref': 'https://api.ebay.com/buy/browse/v1/item/get_items_by_item_group?item_group_id=235398389941',
 'leafCategoryIds': ['9355'],
 'categories': [{'categoryId': '9355',
   'categoryName': 'Cell Phones & Smartphones'},
  {'categoryId': '15032', 'categoryName': 'Cell Phones & Accessories'}],
 'image': {'imageUrl': 'https://i.ebayimg.com/images/g/t4wAAOSwg4hkCgdI/s-l225.jpg'},
 'price': {'value': '209.99', 'currency': 'USD'},
 'itemGroupType': 'SELLER_DEFINED_VARIATIONS',
 'itemHref': 'https://api.ebay.com/buy/browse/v1/item/v1%7C235398389941%7C536188920244',
 'seller': {'username': 'electronicdea1s',
  'feedbackPercentage': '98.0',
  'feedbackScore': 10796},
 'marketingPrice': {'originalPrice': {'value': '219.99', 'currency': 'USD'},
  'discountPercentage': '5',
  'discountAmount': {'value': '10.00', 'currency': 'USD'}},
 'condition': 'Used',
 'conditionI

In [18]:
# 03 - Flatten *one* record quickly to eyeball nested keys
def walk(d, prefix=""):
    """Lazily yield ``(dotted_key, value)`` pairs for every non-dict leaf of ``d``.

    Nested dicts are descended depth-first in insertion order, and the keys
    along the path are joined with ``"."``. Any value that is not a ``dict``
    (lists included) is yielded as-is without being descended into. A nested
    dict with no entries contributes nothing.

    Args:
        d: mapping to traverse.
        prefix: path already accumulated above ``d``; when falsy, keys of
            ``d`` are yielded unprefixed.

    Yields:
        ``(key_path, value)`` tuples.
    """
    # Explicit stack of (path_so_far, live items-iterator) instead of recursion.
    pending = [(prefix, iter(d.items()))]
    while pending:
        base, entries = pending[-1]
        for name, value in entries:
            path = name if not base else f"{base}.{name}"
            if isinstance(value, dict):
                # Dive into the child now; the parent's iterator stays on the
                # stack and resumes exactly where it stopped.
                pending.append((path, iter(value.items())))
                break
            yield path, value
        else:
            # Iterator exhausted without descending: this level is finished.
            pending.pop()

# Materialise the (dotted_key, value) pairs produced for the first sampled record.
flat_keys = [*walk(sample[0])]
flat_keys[:40]   # display only the first 40 pairs


[('itemId', 'v1|235398389941|536188920244'),
 ('title',
  'Apple iPhone 12, 64/128/256GB -  Unlocked - Used Good - All colors'),
 ('itemGroupHref',
  'https://api.ebay.com/buy/browse/v1/item/get_items_by_item_group?item_group_id=235398389941'),
 ('leafCategoryIds', ['9355']),
 ('categories',
  [{'categoryId': '9355', 'categoryName': 'Cell Phones & Smartphones'},
   {'categoryId': '15032', 'categoryName': 'Cell Phones & Accessories'}]),
 ('image.imageUrl',
  'https://i.ebayimg.com/images/g/t4wAAOSwg4hkCgdI/s-l225.jpg'),
 ('price.value', '209.99'),
 ('price.currency', 'USD'),
 ('itemGroupType', 'SELLER_DEFINED_VARIATIONS'),
 ('itemHref',
  'https://api.ebay.com/buy/browse/v1/item/v1%7C235398389941%7C536188920244'),
 ('seller.username', 'electronicdea1s'),
 ('seller.feedbackPercentage', '98.0'),
 ('seller.feedbackScore', 10796),
 ('marketingPrice.originalPrice.value', '219.99'),
 ('marketingPrice.originalPrice.currency', 'USD'),
 ('marketingPrice.discountPercentage', '5'),
 ('marketingPri

In [19]:
# 04 - Frequency of all top-level keys (100-row slice)
from collections import Counter
# Count how many of the first 100 lines carry each top-level key.
cnt = Counter()
# Explicit decoding so the result does not depend on the platform's default
# encoding (assumes the pull was written as UTF-8 — TODO confirm).
with RAW.open(encoding="utf-8") as fh:
    for line in itertools.islice(fh, 100):
        if not line.strip():
            continue   # tolerate blank lines, which json.loads would reject
        # .keys() matters: updating a Counter with the mapping itself would
        # add the record's values as counts instead of counting each key once.
        cnt.update(json.loads(line).keys())
cnt

Counter({'itemId': 100,
         'title': 100,
         'leafCategoryIds': 100,
         'categories': 100,
         'price': 100,
         'itemHref': 100,
         'seller': 100,
         'condition': 100,
         'conditionId': 100,
         'buyingOptions': 100,
         'itemWebUrl': 100,
         'itemLocation': 100,
         'adultOnly': 100,
         'legacyItemId': 100,
         'availableCoupons': 100,
         'itemOriginDate': 100,
         'itemCreationDate': 100,
         'topRatedBuyingExperience': 100,
         'priorityListing': 100,
         'listingMarketplaceId': 100,
         'snapshot_ts': 100,
         'image': 99,
         'thumbnailImages': 99,
         'shippingOptions': 99,
         'epid': 85,
         'itemGroupHref': 76,
         'itemGroupType': 76,
         'additionalImages': 68,
         'marketingPrice': 37,
         'pickupOptions': 1})

In [23]:
# 05 - Decide on columns to keep
# Column names to retain.
# NOTE(review): "categoryId", "soldDate", "soldPrice" and "listingFormat" are not
# among the top-level keys counted in the 100-row frequency output above —
# presumably they are derived or renamed in a later step; confirm before
# selecting columns with KEEP.
KEEP = [
    "itemId",
    "title",
    "condition",
    "categoryId",
    "soldDate",
    "soldPrice",
    "seller",
    "listingFormat",
    "image",
    "itemLocation",
    "buyingOptions",
]
KEEP

['itemId',
 'title',
 'condition',
 'categoryId',
 'soldDate',
 'soldPrice',
 'seller',
 'listingFormat',
 'image',
 'itemLocation',
 'buyingOptions']