In [11]:
import requests
import json
import math
import pandas as pd
import spacy

In [30]:
# searching for "cinema" in New York between 1890 and 1963
url = 'https://chroniclingamerica.loc.gov/search/pages/results/?state=New+York&date1=1890&date2=1963&proxtext=cinema&x=20&y=8&dateFilterType=yearRange&rows=20&searchType=basic&format=json'
response = requests.get(url)
raw = response.text
results = json.loads(raw)

In [31]:
results.keys()

dict_keys(['totalItems', 'endIndex', 'startIndex', 'itemsPerPage', 'items'])

In [32]:
print(results['items'][0])

{'sequence': 58, 'county': ['New York'], 'edition': None, 'frequency': 'Daily', 'id': '/lccn/sn83030214/1917-02-04/ed-1/seq-58/', 'subject': ['New York (N.Y.)--Newspapers.', 'New York (State)--New York County.--fast--(OCoLC)fst01234953', 'New York (State)--New York.--fast--(OCoLC)fst01204333', 'New York County (N.Y.)--Newspapers.'], 'city': ['New York'], 'date': '19170204', 'title': 'New-York tribune. [volume]', 'end_year': 1924, 'note': ['Also available in digital format on the Library of Congress website.', 'Archived issues are available in digital format as part of the Library of Congress Chronicling America online collection.', 'Available on microfilm from University Microfilms International, and Recordak.', 'Evening ed.: Evening edition of the tribune, 1866.', 'Merged with: New York herald (New York, N.Y. : 1920); to form: New York herald, New York tribune.', 'Semiweekly ed.: New-York tribune (New York, N.Y. : 1866 : Semiweekly), 1866-<1899>.', 'Triweekly eds.: New-York tri-weekly

In [33]:
print('Total items:', results['totalItems'])
print('End index:', results['endIndex'])
print('Start index:', results['startIndex'])
print('Items per page:', results['itemsPerPage'])
print('Length and type of items:', len(results['items']), type(results['items']))

Total items: 2953
End index: 20
Start index: 1
Items per page: 20
Length and type of items: 20 <class 'list'>


In [34]:
total_pages = math.ceil(results['totalItems'] / results['itemsPerPage'])
print(total_pages)

148


In [35]:
data = []

In [38]:
start_date = '1890'
end_date = '1963'
search_term = 'cinema'
state = 'New York'

In [39]:
for i in range(1, 11):  # for sake of time I'm doing only 10, you will want to put total_pages+1
    url = (f'https://chroniclingamerica.loc.gov/search/pages/results/?state={state}&date1={start_date}'
           f'&date2={end_date}&proxtext={search_term}&x=16&y=8&dateFilterType=yearRange&rows=20'
           f'&searchType=basic&format=json&page={i}')  # f-string
    response = requests.get(url)
    raw = response.text
    print(f'page {i} status code:', response.status_code)  # checking for errors
    results = json.loads(raw)
    items_ = results['items']
    for item_ in items_:
        row_data = {}
        try:
          row_data['title'] = item_['title_normal']
        except:
          row_data['title'] = "none"
        try:
          row_data['city'] = item_['city']
        except:
          row_data['city'] = "none"
        try:
          row_data['date'] = item_['date']
        except:
          row_data['date'] = "none"
        try:
          row_data['raw_text'] = item_['ocr_eng']
        except:
          row_data['raw_text'] = 'none'
    data.append(row_data)

page 1 status code: 200
page 2 status code: 200
page 3 status code: 200
page 4 status code: 200
page 5 status code: 200
page 6 status code: 200
page 7 status code: 200
page 8 status code: 200
page 9 status code: 200
page 10 status code: 200


In [43]:
df = pd.DataFrame.from_dict(data)

In [44]:
df.head()

Unnamed: 0,title,city,date,raw_text
0,opinión.,[Los Angeles],19370117,none
1,opinión.,[Los Angeles],19321030,none
2,opinión.,[Los Angeles],19420104,none
3,opinión.,[Los Angeles],19380213,none
4,opinión.,[Los Angeles],19310308,none


In [45]:
df['date'] = pd.to_datetime(df['date'])

In [46]:
df = df.sort_values(by='date')

In [47]:
df.head()

Unnamed: 0,title,city,date,raw_text
17,new-york tribune.,[New York],1909-06-27,4\nE. B. BALDWINS PLAN TO DRIFT ACROSS THE ARC...
10,sun and the new york herald.,[New York],1920-03-21,"THE SUN AND NEW YORK HERALD, SUNDAY, MARCH T!t..."
7,imperial valley press.,[El Centro],1920-07-29,4\nPut Your Faith—\nand Ice-—ln Ata\nAUTOMATIC...
11,new-york tribune.,[New York],1922-01-08,"p=H_S^\n""OUR MARY"" VTEL.\nCOMED HOME. Thou\nsa..."
13,new-york tribune.,[New York],1922-02-26,"V 1NKAR DOZKN STKATOIN I HIA TKICAL l \R(,E I ..."


In [48]:
nlp = spacy.load("en_core_web_sm")
nlp.disable_pipes('ner', 'parser')  # these are unnecessary for the task at hand

def process_text(text):
    """Remove new line characters and lemmatize text. Returns string of lemmas"""
    text = text.replace('\n', ' ')
    doc = nlp(text)
    tokens = [token for token in doc]
    no_stops = [token for token in tokens if not token.is_stop]
    no_punct = [token for token in no_stops if token.is_alpha]
    lemmas = [token.lemma_ for token in no_punct]
    lemmas_lower = [lemma.lower() for lemma in lemmas]
    lemmas_string = ' '.join(lemmas_lower)
    return lemmas_string

In [49]:
df['lemmas'] = df['raw_text'].apply(process_text)

In [50]:
df.head()

Unnamed: 0,title,city,date,raw_text,lemmas
17,new-york tribune.,[New York],1909-06-27,4\nE. B. BALDWINS PLAN TO DRIFT ACROSS THE ARC...,baldwins plan drift arctic ocean camped ice pa...
10,sun and the new york herald.,[New York],1920-03-21,"THE SUN AND NEW YORK HERALD, SUNDAY, MARCH T!t...",sun new york herald sunday march great sportin...
7,imperial valley press.,[El Centro],1920-07-29,4\nPut Your Faith—\nand Ice-—ln Ata\nAUTOMATIC...,faith ata automatic refrigerator tjte patent j...
11,new-york tribune.,[New York],1922-01-08,"p=H_S^\n""OUR MARY"" VTEL.\nCOMED HOME. Thou\nsa...",p mary vtel comed home thou sand person jam l ...
13,new-york tribune.,[New York],1922-02-26,"V 1NKAR DOZKN STKATOIN I HIA TKICAL l \R(,E I ...",v dozkn stkatoin hia tkical l s sanderson amnu...


In [51]:
df.to_csv(f'{search_term}{start_date}-{end_date}.csv', index=False)