## Chronicling America Assignment- World War II search term 

In [2]:
import requests
import json
import math
import pandas as pd
import spacy

In [3]:
url = 'https://chroniclingamerica.loc.gov/search/pages/results/?state=New+York&date1=1918&date2=1963&proxtext=World+War+II&x=0&y=0&dateFilterType=yearRange&rows=20&searchType=basic&format=json'
response = requests.get(url)
raw= response.text
results = json.loads(raw)

In [4]:
results.keys()

dict_keys(['totalItems', 'endIndex', 'startIndex', 'itemsPerPage', 'items'])

In [11]:
print(results['itemsPerPage'])
print(results['items'][1])

20
{'sequence': 7, 'county': ['New York'], 'edition': None, 'frequency': 'Daily', 'id': '/lccn/sn83045774/1922-07-06/ed-1/seq-7/', 'subject': ['New York (N.Y.)--Newspapers.', 'New York (State)--New York County.--fast--(OCoLC)fst01234953', 'New York (State)--New York.--fast--(OCoLC)fst01204333', 'New York County (N.Y.)--Newspapers.'], 'city': ['New York'], 'date': '19220706', 'title': 'The New York herald. [volume]', 'end_year': 1924, 'note': ['Archived issues are available in digital format from the Library of Congress Chronicling America online collection.', 'Available on microfilm from Library of Congress, Photoduplication Service, and New York Public Library.'], 'state': ['New York'], 'section_label': '', 'type': 'page', 'place_of_publication': 'New York, N.Y.', 'start_year': 1920, 'edition_label': '', 'publisher': 'Sun-Herald', 'language': ['English'], 'alt_title': ['Combined New York morning newspapers', 'Combined New York Sunday newspapers'], 'lccn': 'sn83045774', 'country': 'New

In [12]:
print('totalItems:', results['totalItems'])

totalItems: 53


In [13]:
total_pages = math.ceil(results['totalItems'] / results['itemsPerPage'])
print(total_pages)

3


In [14]:
data = []

In [15]:
start_date = '1918'
end_date = '1963'
search_item = 'World+War+II'
state = 'New+York'

In [16]:
for i in range(1, 4):  
    url = (f'https://chroniclingamerica.loc.gov/search/pages/results/?state={state}&date1={start_date}&date2={end_date}&proxtext={search_item}&x=0&y=0&dateFilterType=yearRange&rows=20&searchType=basic&format=json&page={i}')  # f-string
    response = requests.get(url)
    raw = response.text
    print(f'page {i} status code:', response.status_code)  # checking for errors
    results = json.loads(raw)
    items_ = results['items']
    for item_ in items_:
        row_data = {}
        try:
          row_data['title'] = item_['title_normal']
        except:
          row_data['city'] = "none"
        try:
          row_data['city'] = item_['city']
        except:
          row_data['city'] = "none"
        try:
          row_data['date'] = item_['date']
        except:
          row_data['date'] = "none"
        try:
          row_data['raw_text'] = item_['ocr_eng']
        except:
          row_data['raw_text'] = 'none'
    data.append(row_data)

page 1 status code: 200
page 2 status code: 200
page 3 status code: 200


In [19]:
df = pd.DataFrame.from_dict(data)

In [23]:
df

Unnamed: 0,title,city,date,raw_text
0,evening world.,[New York],19221021,FAMILY SPLIT OVER HALL MARRIAGE\nTo-Night' Wea...
1,sun.,[New York],19180624,11\nClassification of Baseball as a Non-Essent...
2,sun.,[New York],19191224,16\nRobinson Rides Breeze and\nFALL DATE LIKEL...


In [24]:
df['date']=pd.to_datetime(df['date'])

In [25]:
df

Unnamed: 0,title,city,date,raw_text
0,evening world.,[New York],1922-10-21,FAMILY SPLIT OVER HALL MARRIAGE\nTo-Night' Wea...
1,sun.,[New York],1918-06-24,11\nClassification of Baseball as a Non-Essent...
2,sun.,[New York],1919-12-24,16\nRobinson Rides Breeze and\nFALL DATE LIKEL...


In [27]:
df = df.sort_values(by='date')
df

Unnamed: 0,title,city,date,raw_text
1,sun.,[New York],1918-06-24,11\nClassification of Baseball as a Non-Essent...
2,sun.,[New York],1919-12-24,16\nRobinson Rides Breeze and\nFALL DATE LIKEL...
0,evening world.,[New York],1922-10-21,FAMILY SPLIT OVER HALL MARRIAGE\nTo-Night' Wea...


In [30]:
nlp = spacy.load('en_core_web_sm')

def process_text(text):
    text = text.replace('\n', ' ')
    doc = nlp(text)
    tokens = [token for token in doc]
    no_stops = [token for token in tokens if not token.is_stop]
    no_punct = [token for token in no_stops if token.is_alpha]
    lemmas = [token.lemma_ for token in no_punct]
    lemmas_lower = [lemma.lower() for lemma in lemmas]
    lemmas_string = ' '.join(lemmas_lower)
    return lemmas_string

In [31]:
df['lemmas'] = df['raw_text'].apply(process_text)

In [33]:
df.to_csv(f'../Homework/{search_item}{start_date}-{end_date}.csv', index = False)