In [1]:
import csv
import json
import requests

# for later, when working with local files
import glob
import os
from os.path import join

In [2]:
# Base URL of the Library of Congress "free to use" sets.
endpoint = 'https://www.loc.gov/free-to-use'

# Query-string arguments sent with the request; 'fo=json' asks for a JSON response.
parameters = dict(fo='json')

In [3]:
# Slug of the set to harvest; it is appended to the endpoint URL when the set is requested.
collection = 'farm-life'

In [12]:
# Request the set listing for the chosen collection.
# - timeout: without one, requests waits indefinitely if the server stops answering.
# - raise_for_status: surfaces an HTTP error (404, 500, ...) right here instead of
#   as a confusing JSON decoding failure in a later cell.
collection_list_response = requests.get(
    endpoint + '/' + collection,
    params=parameters,
    timeout=30,
)
collection_list_response.raise_for_status()

In [16]:
# Decode the body of the collection response from JSON into Python objects.
collection_json = collection_list_response.json()

In [18]:
# CSV that will list every item in the set (one row per item).
collection_set_list = os.path.join('data','farm_set_list.csv')
headers = ['image','link','title','alt'] #alt is description field in this collection

# Create the output folder if needed so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(collection_set_list), exist_ok=True)

with open(collection_set_list, 'w', encoding='utf-8', newline='') as f:
    # extrasaction='ignore': keys outside `headers` are skipped instead of
    # raising ValueError, so an extra field in the API response cannot abort the export.
    writer = csv.DictWriter(f, fieldnames=headers, extrasaction='ignore')
    writer.writeheader()
    for item in collection_json['content']['set']['items']:
        # work on a copy so the downloaded response data is left untouched
        row = dict(item)

        # clean up errant spaces in the title fields (a missing title becomes '')
        row['title'] = (row.get('title') or '').rstrip()
        writer.writerow(row)
    print('wrote',collection_set_list)

wrote data\farm_set_list.csv


In [19]:
# From here on, requests go to the site root: the item links saved in the
# set list are appended directly to this base URL.
endpoint = 'https://www.loc.gov'

# 'fo=json' asks for a JSON response.
parameters = dict(fo='json')

Query the API:

In [25]:
# Counters reported in the mini log at the end of this cell.
item_count = 0   # requests attempted (successful or not)
error_count = 0  # requests that failed or returned no usable item JSON
file_count = 0   # metadata files written

data_directory = 'data'
item_metadata_directory = 'ftu_farms_metadata'
item_metadata_file_start = 'item_metadata'
json_suffix = '.json'

collection_set_list = os.path.join('data','farm_set_list.csv')

# Create the output folder up front so the first write cannot fail.
item_metadata_path = os.path.join(data_directory, item_metadata_directory)
os.makedirs(item_metadata_path, exist_ok=True)

with open(collection_set_list, 'r', encoding='utf-8', newline='') as f:
    # Column names come from the file's own header row, so this cell does not
    # depend on whatever the global `headers` list currently contains.
    reader = csv.DictReader(f)
    for item in reader:
        resource_ID = item['link']
        # third path segment, e.g. '/resource/ds.00876/' -> 'ds.00876'
        short_ID = resource_ID.split('/')[2]
        item_count += 1

        # One code path for every link: requests appends `params` to a query
        # string that is already present in the URL, so links containing '?'
        # need no special handling.
        # (resource links with a '?' could redirect to item pages, but currently
        # don't work; this is where the highsmith photos with hhh in the ID fail.)
        try:
            item_metadata = requests.get(endpoint + resource_ID, params=parameters, timeout=30)
        except requests.exceptions.RequestException as request_error:
            # connection problems and timeouts are counted, not fatal
            print('request failed', endpoint + resource_ID, request_error)
            error_count += 1
            continue

        print('requested',item_metadata.url,item_metadata.status_code)
        if item_metadata.status_code != 200:
            error_count += 1
            continue

        # Parse once and keep only the 'item' record. ValueError covers a body
        # that is not JSON; KeyError/TypeError cover JSON without an 'item' entry.
        try:
            item_json = item_metadata.json()['item']
        except (ValueError, KeyError, TypeError):
            error_count += 1
            print('no json found')
            continue

        fout = os.path.join(item_metadata_path, item_metadata_file_start + '-' + short_ID + json_suffix)
        with open(fout, 'w', encoding='utf-8') as json_file:
            json.dump(item_json, json_file)
        file_count += 1
        print('wrote', fout)

print('--- mini LOG ---')
print('items requested:',item_count)
print('errors:',error_count)
print('files written:',file_count)

requested https://www.loc.gov/resource/ds.00876/?fo=json 200
wrote data\ftu_farms_metadata\item_metadata-ds.00876.json
requested https://www.loc.gov/resource/fsa.8a03169/?fo=json 200
wrote data\ftu_farms_metadata\item_metadata-fsa.8a03169.json
requested https://www.loc.gov/resource/cph.3b48137/?fo=json 200
wrote data\ftu_farms_metadata\item_metadata-cph.3b48137.json
requested https://www.loc.gov/resource/fsa.8b23028/?fo=json 200
wrote data\ftu_farms_metadata\item_metadata-fsa.8b23028.json
requested https://www.loc.gov/resource/fsac.1a34309/?fo=json 200
wrote data\ftu_farms_metadata\item_metadata-fsac.1a34309.json
requested https://www.loc.gov/resource/cph.3c13454/?fo=json 200
wrote data\ftu_farms_metadata\item_metadata-cph.3c13454.json
requested https://www.loc.gov/resource/cph.3b48715/?fo=json 200
wrote data\ftu_farms_metadata\item_metadata-cph.3b48715.json
requested https://www.loc.gov/resource/cph.3b02010/?fo=json 200
wrote data\ftu_farms_metadata\item_metadata-cph.3b02010.json
requ

## Transformation


In [24]:
# Look up the notebook's current working directory; the relative 'data' paths
# used in this notebook are resolved against it.
current_loc = os.getcwd()

print(current_loc)

c:\Users\Meghan\Desktop\networked-services-labs-main


In [26]:
# Folder (relative to the working directory) that holds the per-item JSON files.
metadata_path_parts = ('data', 'ftu_farms_metadata')
metadata_file_path = os.path.join(*metadata_path_parts)

print(metadata_file_path)

data\ftu_farms_metadata


In [28]:
# Count the downloaded item metadata files by measuring the glob result
# directly rather than tallying inside a loop.
matching_metadata_files = glob.glob('data/ftu_farms_metadata/item_metadata-*.json')
file_count = len(matching_metadata_files)

print('found',file_count)

found 50


In [29]:
def find_item_metadata_files(pattern):
    """Return the paths matching ``pattern`` as a sorted list.

    glob returns matches in an arbitrary, file-system dependent order; sorting
    makes every later step (and the row order of the final CSV) reproducible
    from one machine or run to the next.
    """
    return sorted(glob.glob(pattern))


list_of_item_metadata_files = find_item_metadata_files('data/ftu_farms_metadata/item_metadata-*.json')

In [30]:
# Display how many metadata file paths were collected.
len(list_of_item_metadata_files)

50

## Transformation Part 2: Write your CSV



In [32]:
# for purposes of demonstration, use this block to make sure there isn't already a list file,
# so the CSV-writing step starts from a clean slate:

items_data_file = os.path.join(data_directory, 'farm_collection_items_data.csv')

if os.path.isfile(items_data_file):
    os.unlink(items_data_file)
    print('removed',items_data_file)

# clear row_dict (an empty dict, matching the type it has everywhere else it is used)
row_dict = dict()

In [33]:
from datetime import date

# Today's date as YYYY-MM-DD (ISO 8601), used to stamp each exported row.
today = date.today()
date_string_for_today = today.isoformat()

print(date_string_for_today)

2022-12-12


In [38]:
# set up the containers to create the csv & counters
# file for csv to read out
collection_info_csv = os.path.join('data','farm_collection_items_data.csv')
file_count = 0      # metadata files opened
items_written = 0   # rows written to the csv
error_count = 0     # files skipped because they could not be loaded or lacked a required field

# add in a couple of extras for Omeka, including item type and date uploaded

# set up a list for the columns in your csv
headers = ['item_type', 'date_uploaded', 'source_file', 'call_number', 'title', 'creators', 'contributor_names', 'date', 'original_format', 'format', 'subjects', 'description', 'rights_advisory', 'image_url']

# Exceptions that mean "this field is not present in the record".
FIELD_MISSING_ERRORS = (KeyError, IndexError, TypeError)

# Marker for fields that have no placeholder value.
REQUIRED_FIELD = object()


def dig(record, *path):
    """Follow ``path`` (dict keys and list indexes) into ``record`` and return the value.

    Raises KeyError, IndexError or TypeError when a step of the path is missing.
    """
    for step in path:
        record = record[step]
    return record


def first_available(record, paths, default=REQUIRED_FIELD):
    """Return the value at the first path in ``paths`` that exists in ``record``.

    If none of the paths exists, return ``default``; when no default is given
    the field is treated as required and KeyError is raised.
    """
    for path in paths:
        try:
            return dig(record, *path)
        except FIELD_MISSING_ERRORS:
            continue
    if default is REQUIRED_FIELD:
        raise KeyError(' / '.join(str(path[0]) for path in paths))
    return default


def build_item_row(item_data, source_file, date_uploaded):
    """Flatten one item metadata record into a dict keyed by the csv column names.

    Optional fields fall back to a placeholder string. 'title', 'date',
    'original_format', 'format' and 'image_url' are required (some with an
    alternative location in the record); KeyError is raised when one of them
    cannot be found.
    """
    return {
        'item_type': 'Item',
        'date_uploaded': date_uploaded,
        'source_file': str(source_file),
        'call_number': first_available(item_data, [('call_number',)], 'Not found'),
        'title': first_available(item_data, [('title',)]),
        'creators': first_available(item_data, [('item', 'creators', 0, 'title')], 'Not found'),
        'contributor_names': first_available(item_data, [('contributor_names', 0)], 'Not found'),
        'date': first_available(item_data, [('date',)]),
        'original_format': first_available(item_data, [('original_format', 0), ('type', 0)]),
        'format': first_available(item_data, [('item', 'format', 0), ('online_format', 0)]),
        'subjects': first_available(item_data, [('item', 'subjects')], 'Not found'),
        'description': first_available(item_data, [('description',)], 'Not found'),
        'rights_advisory': first_available(item_data, [('rights_advisory',)], 'Undetermined'),
        # prefer the fourth entry of 'image_url'; otherwise use the first resource's image
        'image_url': first_available(item_data, [('image_url', 3), ('resources', 0, 'image')]),
    }


# Open the csv once, in write mode: re-running this cell replaces the file
# instead of appending a second copy of every row to it.
with open(collection_info_csv, 'w', encoding='utf-8', newline='') as fout:
    writer = csv.DictWriter(fout, fieldnames=headers)
    writer.writeheader()

    for file in list_of_item_metadata_files:
        file_count += 1
        print('opening',file)

        # load the item data (ValueError covers invalid JSON and undecodable bytes)
        try:
            with open(file, 'r', encoding='utf-8') as json_file:
                item_data = json.load(json_file)
        except (OSError, ValueError):
            print('error loading',file)
            error_count += 1
            continue

        # dictionary for the rows
        try:
            row_dict = build_item_row(item_data, file, date_string_for_today)
        except KeyError as missing_field:
            # a required field is absent: log it and move on rather than aborting the whole run
            print('error reading',file,'- missing required field',missing_field)
            error_count += 1
            continue

        # write to the csv
        writer.writerow(row_dict)
        items_written += 1

print('\n\n--- LOG ---')
print('wrote',collection_info_csv)
print('with',items_written,'items')
print(error_count,'errors (info not written)')

opening data/ftu_farms_metadata\item_metadata-afc1982009.afc1982009_le_077.json
opening data/ftu_farms_metadata\item_metadata-afc2012033.afc2012033_00505_ph.json
opening data/ftu_farms_metadata\item_metadata-cph.3b02010.json
opening data/ftu_farms_metadata\item_metadata-cph.3b03797.json
opening data/ftu_farms_metadata\item_metadata-cph.3b08594.json
opening data/ftu_farms_metadata\item_metadata-cph.3b12705.json
opening data/ftu_farms_metadata\item_metadata-cph.3b19178.json
opening data/ftu_farms_metadata\item_metadata-cph.3b43254.json
opening data/ftu_farms_metadata\item_metadata-cph.3b48137.json
opening data/ftu_farms_metadata\item_metadata-cph.3b48715.json
opening data/ftu_farms_metadata\item_metadata-cph.3c13454.json
opening data/ftu_farms_metadata\item_metadata-cph.3c18574.json
opening data/ftu_farms_metadata\item_metadata-cph.3g14730.json
opening data/ftu_farms_metadata\item_metadata-ds.00876.json
opening data/ftu_farms_metadata\item_metadata-fsa.8a03096.json
opening data/ftu_farms