In [1]:
# imports
import requests
import json
import math
import pandas as pd
import spacy

In [7]:
# initial search
url = 'https://chroniclingamerica.loc.gov/search/pages/results/?state=Maine&date1=1918&date2=1920&proxtext=epidemic&x=20&y=8&dateFilterType=yearRange&rows=20&searchType=basic&format=json'
response = requests.get(url)
raw = response.text
results = json.loads(raw)

In [8]:
results.keys()

dict_keys(['totalItems', 'endIndex', 'startIndex', 'itemsPerPage', 'items'])

In [9]:
print('totalItems:', results['totalItems'])
print('endIndex:', results['endIndex'])
print('startIndex:', results['startIndex'])
print('itemsPerPage:', results['itemsPerPage'])
print('Length and type of items:', len(results['items']), type(results['items']))

totalItems: 1339
endIndex: 20
startIndex: 1
itemsPerPage: 20
Length and type of items: 20 <class 'list'>


In [10]:
# find total amount of pages
total_pages = math.ceil(results['totalItems'] / results['itemsPerPage'])
print(total_pages)

67


In [11]:
# create empty list for data
data = []

In [12]:
# set search parameters
start_date = '1918'
end_date = '1920'
search_term = 'epidemic'
state = 'Maine'

In [13]:
# loop through search results and collect data
for i in range(1, total_pages + 1):  # for sake of time I'm doing only 10, you will want to put total_pages+1
    url = (f'https://chroniclingamerica.loc.gov/search/pages/results/?state={state}&date1={start_date}'
           f'&date2={end_date}&proxtext={search_term}&x=16&y=8&dateFilterType=yearRange&rows=20'
           f'&searchType=basic&format=json&page={i}')  # f-string
    response = requests.get(url)
    raw = response.text
    print(f'page {i} status code:', response.status_code)  # checking for errors
    results = json.loads(raw)
    items_ = results['items']
    for item_ in items_:
        row_data = {}
        try:
          row_data['title'] = item_['title_normal']
        except:
          row_data['city'] = "none"
        try:
          row_data['city'] = item_['city']
        except:
          row_data['city'] = "none"
        try:
          row_data['date'] = item_['date']
        except:
          row_data['date'] = "none"
        try:
          row_data['raw_text'] = item_['ocr_eng']
        except:
          row_data['raw_text'] = 'none'
    data.append(row_data)

page 1 status code: 200
page 2 status code: 200
page 3 status code: 200
page 4 status code: 200
page 5 status code: 200
page 6 status code: 200
page 7 status code: 200
page 8 status code: 200
page 9 status code: 200
page 10 status code: 200
page 11 status code: 200
page 12 status code: 200
page 13 status code: 200
page 14 status code: 200
page 15 status code: 200
page 16 status code: 200
page 17 status code: 200
page 18 status code: 200
page 19 status code: 200
page 20 status code: 200
page 21 status code: 200
page 22 status code: 200
page 23 status code: 200
page 24 status code: 200
page 25 status code: 200
page 26 status code: 200
page 27 status code: 200
page 28 status code: 200
page 29 status code: 200
page 30 status code: 200
page 31 status code: 200
page 32 status code: 200
page 33 status code: 200
page 34 status code: 200
page 35 status code: 200
page 36 status code: 200
page 37 status code: 200
page 38 status code: 200
page 39 status code: 200
page 40 status code: 200
page 41 s

In [14]:
# put data into DataFrame
df = pd.DataFrame.from_dict(data)
# convert date column from string to date-time object
df['date'] = pd.to_datetime(df['date'])
# sort by date
df = df.sort_values(by='date')

In [15]:
df.head()

Unnamed: 0,title,city,date,raw_text
63,daily kennebec journal.,[Augusta],1918-01-01,FIRST DIVISION\nabs of Major Leagues\nWill Sha...
54,daily kennebec journal.,[Augusta],1918-01-07,Daily KENNEBEC JOURNAL\nTelephone 135\n»'ds of...
42,republican journal.,[Belfast],1918-02-21,"The Republican Journal^\nBELFAST, THURSDAY, FF..."
24,daily kennebec journal.,[Augusta],1918-03-09,Classified\n\d Rates\n—v...\n' *ord for 6 time...
44,independent-reporter.,[Skowhegan],1918-04-04,lical Estate Transfers\nThe records of the fol...


In [16]:
# fuction to process text
# load nlp model
nlp = spacy.load("en_core_web_sm")
nlp.disable_pipes('ner', 'parser')  # these are unnecessary for the task at hand

def process_text(text):
    """Remove new line characters and stop words and lemmatize text. Returns string of lemmas"""
    text = text.replace('\n', ' ')
    doc = nlp(text)
    tokens = [token for token in doc]
    no_stops = [token for token in tokens if not token.is_stop]
    no_punct = [token for token in no_stops if token.is_alpha]
    lemmas = [token.lemma_ for token in no_punct]
    lemmas_lower = [lemma.lower() for lemma in lemmas]
    lemmas_string = ' '.join(lemmas_lower)
    return lemmas_string

In [17]:
# apply process_text function
df['lemmas'] = df['raw_text'].apply(process_text)

In [18]:
df.head()

Unnamed: 0,title,city,date,raw_text,lemmas
63,daily kennebec journal.,[Augusta],1918-01-01,FIRST DIVISION\nabs of Major Leagues\nWill Sha...,division ab major leagues share world series p...
54,daily kennebec journal.,[Augusta],1918-01-07,Daily KENNEBEC JOURNAL\nTelephone 135\n»'ds of...,daily kennebec journal telephone ds thank obit...
42,republican journal.,[Belfast],1918-02-21,"The Republican Journal^\nBELFAST, THURSDAY, FF...",republican belfast thursday ffb published thur...
24,daily kennebec journal.,[Augusta],1918-03-09,Classified\n\d Rates\n—v...\n' *ord for 6 time...,classified rate v ord time conaacutiv insertio...
44,independent-reporter.,[Skowhegan],1918-04-04,lical Estate Transfers\nThe records of the fol...,lical estate transfers record follow real tran...


In [20]:
# save to csv
df.to_csv(f'{search_term}{start_date}-{end_date}.csv', index=False)