First, the script imports the Python packages/libraries needed to run script: pandas, json, argparse, uuid, & datetime.

In [2]:
import pandas as pd
import json
import argparse
from datetime import datetime
#import prefixLists

Then, the script uses argparse to let us enter a filename in our terminal when we run the script. 

```
python makeArchivalObjects.py -f filename.csv
```

In [3]:
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--file')
args = parser.parse_args()

if args.file:
    filename = args.file
else:
    filename = input('Enter filename (including \'.csv\'): ')
filename = 'exampleSheets_archivalObjects.csv'

Then, we see two functions:`add_to_dict` and `add_with_ref`. We can worry about these later.

In [20]:
def add_to_dict(dict_name, key, value):
    try:
        value = row.get(value)
        if pd.notna(value):
            value = value.strip()
            dict_name[key] = value
    except KeyError:
        pass


def add_with_ref(dict_name, key, value, repeat):  
    try:
        value = row[value]
        if pd.notna(value):
            if repeat == 'single':
                value = value.strip()
                dict_name[key] = {'ref': value}
            else:
                new_list = []
                value = value.split('|')
                for item in value:
                    new_dict = {'ref': item}
                    new_list.append(new_dict)
                dict_name[key] = new_list
    
    except KeyError:
        pass

    
def add_notes(dict_name, value):
    try:
        value = row[value]
        if pd.notna(value):
            notes = []
            notes_values = value.split('||')
            for note_value in notes_values:
                note_parts = note_value.split(';;') 
                first_model_type = note_parts[0]
                note_type = note_parts[1]
                second_model_type = note_parts[2]
                content = note_parts[3]
                note = {}
                note['jsonmodel_type'] = first_model_type
                note['publish'] = True
                note['subnotes'] = [{'content': content, 'jsonmodel_type': second_model_type, 'publish': True}]
                note['type'] = note_type
                notes.append(note)
            dict_name['notes'] = notes
    except KeyError:
        pass


def add_extents(dict_name, value):
    try:
        value = row[value]
        if pd.notna(value):
            extent = []
            extent_values = value.split('|')
            extent_dict = {}
            for extent_value in extent_values:
                extent_value = extent_value.split(';;')
                k = extent_value[0]
                v = extent_value[1]
                extent_dict[k] = v
            extent.append(extent_dict)
            dict_name['extent'] = extent
    except KeyError:
        pass

This next bit of code opens the CSV as a `DataFrame` called `df` and loops through its rows.

As the script loops through each row, it extracts data based on CSV column names and adds the data to a dictionary called `json_file`. This dictionary will be transformed and saved as a JSON file at the end of the loop.

In [25]:
df = pd.read_csv(filename, dtype={'position': int, 'parent': int})

df['position'] = df['position'].astype(int)
for index, row in df.iterrows():
    
    # Create empty dictionary to store data.
    json_file = {}
    json_file['jsonmodel_type'] = 'archival_object'
    json_file['suppressed'] = False
    
    # For required fields, add directly to json_file.
    identifier = row['local_id']
    json_file['title'] = row['title']
    json_file['resource'] = {'ref': row['resource']}
    json_file['level'] = row['level']
    json_file['publish'] = row['publish']
    json_file['restrictions_apply'] = row['restrictions_apply']
    
    # For optional fields, try to find value and add to json_file if found.
    add_to_dict(json_file, 'repository_processing_note', 'repository_processing_note')
    add_to_dict(json_file, 'position', 'position')
    add_to_dict(json_file, 'other_level', 'other_level')
    
    # For optional fields with 'ref' key, use function to add.
    add_with_ref(json_file, 'parent', 'parent', 'single')
    add_with_ref(json_file, 'repository', 'repository', 'single')
    add_with_ref(json_file, 'linked_events', 'linked_events', 'multi')
    add_with_ref(json_file, 'subjects', 'subjects', 'multi')
    
    # Add notes.
    add_notes(json_file, 'notes')
    
    # Add extent.
    add_extents(json_file, 'extents')
    print(json_file)
    
    dt = datetime.now().strftime('%Y-%m-%d')
    ao_filename = identifier+'_'+dt+'.json'
    directory = ''
    with open(directory+ao_filename, 'w') as fp:
        json.dump(json_file, fp)
    
    

{'jsonmodel_type': 'archival_object', 'suppressed': False, 'title': 'Series 2: Digital photographs', 'resource': {'ref': '/repositories/3/resources/7'}, 'level': 'series', 'publish': True, 'restrictions_apply': True, 'repository_processing_note': "Series 2 is currently unpublished in the finding aid. While what is included in series 2 is all processed, there is an agreed to 8 year restriction period from the time of creation and access jpegs have not yet been created. It would be a challenge to provide access to this scale of content in DSpace. What is currently in series 2 represents the processing of approximately 6,000 DVDs. There are approximately 10,000 more DVDs in accessions 2018-19.ua.007 and 2019-20.ua.012 that haven't been addressed. -EE 3/2/20", 'position': '1', 'repository': {'ref': '/repositories/3'}, 'linked_events': [{'ref': '/repositories/3/events/10094'}, {'ref': ''}, {'ref': '/repositories/3/events/10097'}], 'subjects': [{'ref': '/subjects/1171'}, {'ref': '/subjects/4

This section generates a filename (`ao_filename`) based on an identifier variable and a datetime stamp, and then uses the json function `json.dump` to write and save our dictionary into a JSON file using our unique `ao_filename`.