General Info for the NYTimes API

In [2]:
import json
from pprint import pprint
# Load the saved NYTimes API response from disk. The encoding is pinned to
# UTF-8 so the read does not depend on the platform's locale default.
with open("response.json", "r", encoding="utf-8") as f:
    response = json.load(f)

# Top-level structure of the payload, then the keys of the first document.
pprint(response.keys())
print(f"Status: {response['status']}")
print(f"Copyright: {response['copyright']}")
print(f"Response docs: {response['response']['docs'][0].keys()}")
# Last expression of the cell: number of documents in the response.
len(response["response"]["docs"])



dict_keys(['status', 'copyright', 'response'])
Status: OK
Copyright: Copyright (c) 2025 The New York Times Company. All Rights Reserved.
Response docs: dict_keys(['abstract', 'web_url', 'snippet', 'lead_paragraph', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'subsection_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri'])


10

In [6]:
# Work with the first document of the response and show its top-level keys.
one_doc = response["response"]["docs"][0]
pprint(one_doc.keys())

# Main headline first (it is nested), then the flat text fields.
print(f"Headline: {one_doc['headline']['main']}")
for label, field in (("Snippet", "snippet"), ("Abstract", "abstract"), ("URL", "web_url")):
    print(f"{label}: {one_doc[field]}")

# Bucket every top-level field by the type of its value:
#   str         -> flat_str_fields
#   list / dict -> nested_fields
#   other       -> other_fields
flat_str_fields, nested_fields, other_fields = [], [], []
for key, value in one_doc.items():
    if isinstance(value, str):
        bucket, summary = flat_str_fields, f"{len(value)} chars"
    elif isinstance(value, list):
        bucket, summary = nested_fields, f"{len(value)} items"
    elif isinstance(value, dict):
        bucket, summary = nested_fields, f"{len(value)} keys"
    else:
        bucket, summary = other_fields, f"{type(value)}"
    print(f"{key}: {summary}")
    bucket.append(key)

for bucket in (flat_str_fields, nested_fields, other_fields):
    pprint(bucket)





dict_keys(['abstract', 'web_url', 'snippet', 'lead_paragraph', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'subsection_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri'])
Headline: No, Mark Zuckerberg Did Not Fund Anti-Trump Lawsuit, Spokesman Says
Snippet: A right-wing influencer has argued that Mr. Zuckerberg was indirectly funding Protect Democracy, which is backing a lawsuit brought by labor unions against President Trump.
Abstract: A right-wing influencer has argued that Mr. Zuckerberg was indirectly funding Protect Democracy, which is backing a lawsuit brought by labor unions against President Trump.
URL: https://www.nytimes.com/2025/02/14/us/politics/zuckerberg-protect-democracy-funding.html
abstract: 172 chars
web_url: 88 chars
snippet: 172 chars
lead_paragraph: 127 chars
source: 18 chars
multimedia: 75 items
headline: 7 keys
keywords: 8 items
pub_date: 24 chars
document_type: 7 chars
news_desk: 10 char

In [10]:
# Inspect the nested `headline` dict of the article, one sub-field per line.
print(nested_fields)
headline = one_doc["headline"]
pprint(headline.keys())

# (printed label, key inside `headline`), in display order.
headline_parts = (
    ("Headline", "main"),
    ("Print", "print_headline"),
    ("Kicker", "kicker"),
    ("Content Kicker", "content_kicker"),
    ("Name", "name"),
    ("Seo", "seo"),
    ("Sub", "sub"),
)
for label, field in headline_parts:
    print(f"{label}: {headline[field]}")



['multimedia', 'headline', 'keywords', 'byline']
dict_keys(['main', 'kicker', 'content_kicker', 'print_headline', 'name', 'seo', 'sub'])
Headline: No, Mark Zuckerberg Did Not Fund Anti-Trump Lawsuit, Spokesman Says
Print: None
Kicker: None
Content Kicker: None
Name: None
Seo: None
Sub: None


In [12]:
# Keywords attached to the article: a list of dicts, each read below through
# its 'name', 'value', 'rank' and 'major' keys.
keywords = one_doc["keywords"]
# Print the count explicitly: a bare `len(keywords)` in the middle of a cell
# is evaluated and discarded, because only the last expression is displayed.
print(f"Keywords: {len(keywords)}")
for keyword in keywords:
    print(f"Keyword: {keyword['name']}")
    print(f"Value: {keyword['value']}")
    print(f"Rank: {keyword['rank']}")
    print(f"Major: {keyword['major']}")

Keyword: organizations
Value: Silicon Valley Community Foundation
Rank: 1
Major: N
Keyword: subject
Value: United States Politics and Government
Rank: 2
Major: N
Keyword: organizations
Value: Protect Democracy (Nonprofit)
Rank: 3
Major: N
Keyword: organizations
Value: Government Efficiency Department (US)
Rank: 4
Major: N
Keyword: persons
Value: Zuckerberg, Mark E
Rank: 5
Major: N
Keyword: persons
Value: Chan, Priscilla
Rank: 6
Major: N
Keyword: persons
Value: Trump, Donald J
Rank: 7
Major: N
Keyword: subject
Value: Nonprofit Organizations
Rank: 8
Major: N


In [15]:
# Inspect the nested `byline` dict of the article.
byline = one_doc["byline"]
pprint(byline)
pprint(byline.keys())
print(f"Original: {byline['original']}")

# Guard against an empty or None `person` list so the cell prints None instead
# of raising IndexError/TypeError when an article has no named author.
people = byline["person"] or []
first_name = people[0]["firstname"] if people else None
print(f"Person: {first_name}")
# `organization` is None for this article (see the pprint output), so it is
# deliberately not indexed here.



{'organization': None,
 'original': 'By Theodore Schleifer',
 'person': [{'firstname': 'Theodore',
             'lastname': 'Schleifer',
             'middlename': None,
             'organization': '',
             'qualifier': None,
             'rank': 1,
             'role': 'reported',
             'title': None}]}
dict_keys(['original', 'person', 'organization'])
Original: By Theodore Schleifer
Person: Theodore


In [18]:
# Inspect the `multimedia` list of the article: the item count, the raw
# structure, then caption / credit / url for every item.
multimedia = one_doc["multimedia"]
print(f"Multimedia: {len(multimedia)}")
pprint(multimedia)

# (printed label, key inside each multimedia item), in display order.
media_parts = (("Caption", "caption"), ("Credit", "credit"), ("Url", "url"))
for media_item in multimedia:
    for label, field in media_parts:
        print(f"{label}: {media_item[field]}")


Multimedia: 75
[{'caption': None,
  'credit': None,
  'crop_name': 'articleLarge',
  'height': 400,
  'legacy': {'xlarge': 'images/2025/02/14/multimedia/14trump-news-zuckerberg-doge-tjzg/14trump-news-zuckerberg-doge-tjzg-articleLarge.jpg',
             'xlargeheight': 400,
             'xlargewidth': 600},
  'rank': 0,
  'subType': 'xlarge',
  'subtype': 'xlarge',
  'type': 'image',
  'url': 'images/2025/02/14/multimedia/14trump-news-zuckerberg-doge-tjzg/14trump-news-zuckerberg-doge-tjzg-articleLarge.jpg',
  'width': 600},
 {'caption': None,
  'credit': None,
  'crop_name': 'popup',
  'height': 433,
  'legacy': {},
  'rank': 0,
  'subType': 'popup',
  'subtype': 'popup',
  'type': 'image',
  'url': 'images/2025/02/14/multimedia/14trump-news-zuckerberg-doge-tjzg/14trump-news-zuckerberg-doge-tjzg-popup.jpg',
  'width': 650},
 {'caption': None,
  'credit': None,
  'crop_name': 'blog480',
  'height': 320,
  'legacy': {},
  'rank': 0,
  'subType': 'blog480',
  'subtype': 'blog480',
  'type'

In [19]:
from helpers import flatten_dict

# Flatten the nested article dict into a single-level dict using the project
# helper. Judging by the printed result, nested keys are joined with "." and
# list positions show up as numeric segments (e.g. 'byline.person.0.firstname').
flattened_dict = flatten_dict(one_doc)
pprint(flattened_dict)


{'_id': 'nyt://article/234a9c27-4054-502a-a9c1-72195f5a8250',
 'abstract': 'A right-wing influencer has argued that Mr. Zuckerberg was '
             'indirectly funding Protect Democracy, which is backing a lawsuit '
             'brought by labor unions against President Trump.',
 'byline.organization': None,
 'byline.original': 'By Theodore Schleifer',
 'byline.person.0.firstname': 'Theodore',
 'byline.person.0.lastname': 'Schleifer',
 'byline.person.0.middlename': None,
 'byline.person.0.organization': '',
 'byline.person.0.qualifier': None,
 'byline.person.0.rank': 1,
 'byline.person.0.role': 'reported',
 'byline.person.0.title': None,
 'document_type': 'article',
 'headline.content_kicker': None,
 'headline.kicker': None,
 'headline.main': 'No, Mark Zuckerberg Did Not Fund Anti-Trump Lawsuit, '
                  'Spokesman Says',
 'headline.name': None,
 'headline.print_headline': None,
 'headline.seo': None,
 'headline.sub': None,
 'keywords.0.major': 'N',
 'keywords.0.name': 'o

In [21]:
# Flatten the multimedia list on its own; the helper is handed a list here, and
# the resulting keys start with the item index ('0.caption', '1.caption', ...).
# NOTE(review): item 1 has an empty `legacy` dict, yet no '1.legacy' key appears
# in the output, so empty containers seem to be dropped. Confirm in
# helpers.flatten_dict.
flattened_multimedia = flatten_dict(multimedia)
pprint(flattened_multimedia)



{'0.caption': None,
 '0.credit': None,
 '0.crop_name': 'articleLarge',
 '0.height': 400,
 '0.legacy.xlarge': 'images/2025/02/14/multimedia/14trump-news-zuckerberg-doge-tjzg/14trump-news-zuckerberg-doge-tjzg-articleLarge.jpg',
 '0.legacy.xlargeheight': 400,
 '0.legacy.xlargewidth': 600,
 '0.rank': 0,
 '0.subType': 'xlarge',
 '0.subtype': 'xlarge',
 '0.type': 'image',
 '0.url': 'images/2025/02/14/multimedia/14trump-news-zuckerberg-doge-tjzg/14trump-news-zuckerberg-doge-tjzg-articleLarge.jpg',
 '0.width': 600,
 '1.caption': None,
 '1.credit': None,
 '1.crop_name': 'popup',
 '1.height': 433,
 '1.rank': 0,
 '1.subType': 'popup',
 '1.subtype': 'popup',
 '1.type': 'image',
 '1.url': 'images/2025/02/14/multimedia/14trump-news-zuckerberg-doge-tjzg/14trump-news-zuckerberg-doge-tjzg-popup.jpg',
 '1.width': 650,
 '10.caption': None,
 '10.credit': None,
 '10.crop_name': 'blog225',
 '10.height': 150,
 '10.rank': 0,
 '10.subType': 'blog225',
 '10.subtype': 'blog225',
 '10.type': 'image',
 '10.url': '

In [22]:
# Flatten the keywords list on its own; each keyword's fields come out as
# '<index>.major', '<index>.name', '<index>.rank' and '<index>.value'.
flattened_keywords = flatten_dict(keywords)
pprint(flattened_keywords)



{'0.major': 'N',
 '0.name': 'organizations',
 '0.rank': 1,
 '0.value': 'Silicon Valley Community Foundation',
 '1.major': 'N',
 '1.name': 'subject',
 '1.rank': 2,
 '1.value': 'United States Politics and Government',
 '2.major': 'N',
 '2.name': 'organizations',
 '2.rank': 3,
 '2.value': 'Protect Democracy (Nonprofit)',
 '3.major': 'N',
 '3.name': 'organizations',
 '3.rank': 4,
 '3.value': 'Government Efficiency Department (US)',
 '4.major': 'N',
 '4.name': 'persons',
 '4.rank': 5,
 '4.value': 'Zuckerberg, Mark E',
 '5.major': 'N',
 '5.name': 'persons',
 '5.rank': 6,
 '5.value': 'Chan, Priscilla',
 '6.major': 'N',
 '6.name': 'persons',
 '6.rank': 7,
 '6.value': 'Trump, Donald J',
 '7.major': 'N',
 '7.name': 'subject',
 '7.rank': 8,
 '7.value': 'Nonprofit Organizations'}


In [36]:
# Flatten a single article so its key paths can be inspected.
# NOTE(review): this repeats the flatten_dict(one_doc) call made earlier under
# the name `flattened_dict`; later cells read `flattened_whole_doc`.
flattened_whole_doc = flatten_dict(one_doc)
pprint(flattened_whole_doc)


# Flatten the entire API response. In the printed result the article keys carry
# a 'response.docs.<index>.' prefix. The final line prints how many flattened
# keys the response produces.
flattened_whole_response = flatten_dict(response)
pprint(flattened_whole_response)
print(len(flattened_whole_response))


{'_id': 'nyt://article/234a9c27-4054-502a-a9c1-72195f5a8250',
 'abstract': 'A right-wing influencer has argued that Mr. Zuckerberg was '
             'indirectly funding Protect Democracy, which is backing a lawsuit '
             'brought by labor unions against President Trump.',
 'byline.organization': None,
 'byline.original': 'By Theodore Schleifer',
 'byline.person.0.firstname': 'Theodore',
 'byline.person.0.lastname': 'Schleifer',
 'byline.person.0.middlename': None,
 'byline.person.0.organization': '',
 'byline.person.0.qualifier': None,
 'byline.person.0.rank': 1,
 'byline.person.0.role': 'reported',
 'byline.person.0.title': None,
 'document_type': 'article',
 'headline.content_kicker': None,
 'headline.kicker': None,
 'headline.main': 'No, Mark Zuckerberg Did Not Fund Anti-Trump Lawsuit, '
                  'Spokesman Says',
 'headline.name': None,
 'headline.print_headline': None,
 'headline.seo': None,
 'headline.sub': None,
 'keywords.0.major': 'N',
 'keywords.0.name': 'o

'copyright': 'Copyright (c) 2025 The New York Times Company. All Rights '
              'Reserved.',
 'response.docs.0._id': 'nyt://article/234a9c27-4054-502a-a9c1-72195f5a8250',
 'response.docs.0.abstract': 'A right-wing influencer has argued that Mr. '
                             'Zuckerberg was indirectly funding Protect '
                             'Democracy, which is backing a lawsuit brought by '
                             'labor unions against President Trump.',
 'response.docs.0.byline.organization': None,
 'response.docs.0.byline.original': 'By Theodore Schleifer',
 'response.docs.0.byline.person.0.firstname': 'Theodore',
 'response.docs.0.byline.person.0.lastname': 'Schleifer',
 'response.docs.0.byline.person.0.middlename': None,
 'response.docs.0.byline.person.0.organization': '',
 'response.docs.0.byline.person.0.qualifier': None,
 'response.docs.0.byline.person.0.rank': 1,
 'response.docs.0.byline.person.0.role': 'reported',
 'response.docs.0.byline.person.0.title': 

In [39]:
# Build the set of flattened key paths that are not list elements.
# A key is skipped when any of its "."-separated segments is purely numeric,
# i.e. a list index such as the "0" in "keywords.0.name". Testing whole
# segments, rather than looking for digit characters anywhere in the key,
# keeps field names that merely contain a digit.
schema = set()
for key in flattened_whole_doc:
    if any(segment.isdigit() for segment in key.split(".")):
        continue
    print(key)
    schema.add(key)

print(len(schema))


abstract
web_url
snippet
lead_paragraph
source
headline.main
headline.kicker
headline.content_kicker
headline.print_headline
headline.name
headline.seo
headline.sub
pub_date
document_type
news_desk
section_name
subsection_name
byline.original
byline.organization
type_of_material
_id
word_count
uri
23


In [34]:
# Show the collected schema keys.
# NOTE(review): the saved output under this cell (execution count 34) lists
# response-level keys ('copyright', 'response.meta.*', 'status'), so it predates
# the schema built from `flattened_whole_doc` (execution count 39). Re-run the
# notebook top to bottom to refresh it.
pprint(schema)

{'copyright',
 'response.meta.hits',
 'response.meta.offset',
 'response.meta.time',
 'status'}


In [None]:
# Walk the flattened article once and do two things per entry:
#   * print the key when it is top-level (contains no ".");
#   * when the value is a string containing "Community", print the value and
#     record its key in `schema`.
# NOTE(review): the saved TypeError traceback under this cell does not match
# this code, which checks isinstance(value, str) before the `in` test.
# NOTE(review): a match on a list element adds an indexed key to `schema`;
# confirm that is intended.
for key, value in flattened_whole_doc.items():
    is_top_level = "." not in key
    if is_top_level:
        print(key)
    if isinstance(value, str) and "Community" in value:
        print(value)
        schema.add(key)

print(len(schema))

abstract
web_url
snippet
lead_paragraph
source


TypeError: argument of type 'int' is not iterable