# Custom Data Folder — Basic Info

This notebook prints basic statistics and info about the `custom_data` folder (JSON files).

In [1]:
import json
from pathlib import Path
from collections import Counter

# Folder that holds the JSON documents; relative to the notebook's working directory.
DATA_DIR = Path("custom_data")
assert DATA_DIR.is_dir(), f"Folder not found: {DATA_DIR}"

# Sorted so that every later cell walks the files in the same, stable order.
json_files = sorted(DATA_DIR.glob("*.json"))
n_files = len(json_files)
print(f"Number of JSON files: {n_files}")

# Combined on-disk size of the matched files.
total_bytes = sum(entry.stat().st_size for entry in json_files)
print(f"Total size (bytes): {total_bytes:,}")
print(f"Folder path: {DATA_DIR.absolute()}")

Number of JSON files: 95
Total size (bytes): 688,629
Folder path: /Users/nityaakalra/Desktop/nyt_topic_modeling/topic_modeling_paper/custom_data


In [2]:
# Load all JSONs and collect basic stats.
#
# Everything is accumulated in module-level containers that later cells read:
#   records         - one summary dict per successfully loaded file
#   content_lengths - length of the stripped "content" string per loaded file
#   all_keys        - union of the top-level keys across loaded files
#   sources, query_types, countries, success_flags - raw field values per file
#   parse_errors    - (file name, message) for every file that was skipped
records = []
content_lengths = []
all_keys = set()
sources = []
query_types = []
countries = []
success_flags = []
parse_errors = []

for p in json_files:
    try:
        with open(p, encoding="utf-8") as f:
            obj = json.load(f)
    except (ValueError, OSError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError
        # (a file that is not valid UTF-8), so one bad file cannot abort the load.
        parse_errors.append((p.name, str(e)))
        continue

    # Valid JSON is not necessarily an object (it may be a list, string, null, ...);
    # the statistics below only make sense for objects, so skip anything else.
    if not isinstance(obj, dict):
        parse_errors.append(
            (p.name, f"top-level JSON value is {type(obj).__name__}, expected an object")
        )
        continue

    all_keys.update(obj.keys())

    # A missing, null, or non-string "content" is counted as empty content.
    raw_content = obj.get("content")
    content = raw_content.strip() if isinstance(raw_content, str) else ""

    content_lengths.append(len(content))
    records.append({
        "file": p.name,
        "_id": obj.get("_id"),
        "content_len": len(content),
        "word_count": len(content.split()) if content else 0,
        "has_content": bool(content),
    })
    sources.append(obj.get("source", ""))
    query_types.append(obj.get("query_type", ""))
    countries.append(obj.get("country", ""))
    success_flags.append(obj.get("success", None))

print(f"Records loaded: {len(records)}")
if parse_errors:
    print(f"Parse errors: {len(parse_errors)}")
    # Only the first few are listed to keep the cell output short.
    for name, err in parse_errors[:5]:
        print(f"  - {name}: {err}")
    if len(parse_errors) > 5:
        print(f"  ... and {len(parse_errors) - 5} more")

Records loaded: 95


In [3]:
# Summary statistics for the stripped "content" field of every loaded document.
# Nothing is printed when no documents were loaded.
if content_lengths:
    n_docs = len(content_lengths)
    total_chars = sum(content_lengths)
    n_empty = content_lengths.count(0)
    n_non_empty = n_docs - n_empty
    shortest = min(content_lengths)
    longest = max(content_lengths)
    mean_chars = total_chars / n_docs

    print("Content field (per document)")
    print(f"  Total characters: {total_chars:,}")
    print(f"  Min length: {shortest:,} chars")
    print(f"  Max length: {longest:,} chars")
    print(f"  Mean length: {mean_chars:,.1f} chars")
    print(f"  Documents with empty content: {n_empty}")
    print(f"  Documents with non-empty content: {n_non_empty}")

    # Word counts come from the per-file records; empty documents contribute 0,
    # and the mean below is taken over all records, empty ones included.
    word_counts = [rec["word_count"] for rec in records]
    total_words = sum(word_counts)
    print(f"  Total words (non-empty docs): {total_words:,}")
    print(f"  Mean words per doc: {total_words / len(word_counts):,.1f}")

Content field (per document)
  Total characters: 612,160
  Min length: 0 chars
  Max length: 10,000 chars
  Mean length: 6,443.8 chars
  Documents with empty content: 16
  Documents with non-empty content: 79
  Total words (non-empty docs): 91,234
  Mean words per doc: 960.4


In [4]:
# Union of the top-level keys seen across all loaded JSON objects, in sorted order.
key_names = sorted(all_keys)
print("Keys found in JSON objects:", key_names)

Keys found in JSON objects: ['_id', 'content', 'country', 'query', 'query_type', 'source', 'success', 'themes', 'timestamp', 'tone', 'url', 'web_title']


In [5]:
# Categorical breakdowns
MAX_LABEL_CHARS = 80  # longest category label printed before it is cut off


def _category_label(value):
    """Return a single-line, length-capped label for one category value.

    None and empty/whitespace-only values are shown as "(missing)". Runs of
    whitespace (including newlines) are collapsed to one space so that a
    malformed multi-line value cannot break the one-line-per-category listing.
    """
    if value is None:
        return "(missing)"
    label = " ".join(str(value).split())
    if not label:
        return "(missing)"
    if len(label) > MAX_LABEL_CHARS:
        label = label[:MAX_LABEL_CHARS] + "..."
    return label


def print_breakdown(title, values):
    """Print `title` followed by one "label: count" line per distinct value.

    Categories are listed from most to least frequent. Counting is done on the
    raw values, so two distinct values stay separate even if their printed
    labels happen to coincide after whitespace collapsing or truncation.
    """
    # JSON arrays/objects load as list/dict, which are unhashable and would make
    # Counter raise TypeError; count those by their repr instead.
    countable = (repr(v) if isinstance(v, (list, dict)) else v for v in values)
    print(f"{title}:")
    for value, count in Counter(countable).most_common():
        print(f"  {_category_label(value)}: {count}")


breakdowns = [
    ("source", sources),
    ("query_type", query_types),
    ("country", countries),
    ("success", success_flags),
]
for position, (field, values) in enumerate(breakdowns):
    if position:
        print()  # blank line between consecutive sections
    print_breakdown(f"By {field}", values)

By source:
  bing: 28
  yandex: 20
  mojeek: 18
  google: 18
  seznam: 10
  _id
6639047e54af489acbedbb12    google
6639047e54af489acbedbb12    google
Name: SearchEngineName, dtype: object: 1

By query_type:
  anti: 28
  lgbt: 27
  gender: 11
  international_anti.local: 10
  international_anti.splc: 9
  relationships: 8
  _id
6639047e54af489acbedbb12    relationships
6639047e54af489acbedbb12    relationships
Name: BaseCondition, dtype: object: 1
  disinformation: 1

By country:
  cz: 45
  de: 39
  fr: 10
  _id
6639047e54af489acbedbb12    cz
6639047e54af489acbedbb12    cz
Name: Country, dtype: object: 1

By success:
  True: 80
  False: 15


In [6]:
# Sample record: the first record with non-empty content if there is one,
# otherwise the first record, otherwise None when nothing was loaded.
sample = next((r for r in records if r["has_content"]), records[0] if records else None)
if sample:
    print("Sample record:", sample)
else:
    print("No records.")

# Keys and value types of the first JSON file in sorted order. This is read
# straight from disk and may be a different file from the sample above.
if json_files:
    try:
        with open(json_files[0], encoding="utf-8") as f:
            first = json.load(f)
    except (ValueError, OSError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError; report
        # the problem instead of crashing on an unreadable first file.
        print(f"\nCould not read {json_files[0].name}: {e}")
    else:
        if isinstance(first, dict):
            print("\nFirst file keys and value types:")
            for k, v in first.items():
                t = type(v).__name__
                value_repr = repr(v)
                # Cap long values at 60 characters so each key stays on one line.
                preview = value_repr[:60] + "..." if len(value_repr) > 60 else value_repr
                print(f"  {k}: {t} = {preview}")
        else:
            print(
                f"\nFirst file holds a top-level {type(first).__name__}, "
                "not an object; no keys to list."
            )

Sample record: {'file': '6625f01e2c78c92c8b6a2377.json', '_id': '6625f01e2c78c92c8b6a2377', 'content_len': 9818, 'word_count': 1359, 'has_content': True}

First file keys and value types:
  _id: str = '6624dc3cd22b08910cfa4aa7'
  source: str = 'mojeek'
  url: str = 'https://jungefreiheit.de/politik/deutschland/2020/gauland-s...
  timestamp: str = '2026-02-06 13:42:05.877132'
  content: str = ''
  web_title: str = 'Gauland spricht sich gegen 8. Mai als Feiertag aus'
  query: str = 'alexander gauland spricht die wahrheit'
  query_type: str = 'anti'
  country: str = 'de'
  success: bool = False
