<a href="https://colab.research.google.com/github/mark-bell-tna/ComputationalAccess/blob/main/Web_Archive_in_Discovery.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests;      #used for connecting to the API
import sys
from time import sleep
from math import log
import os
from urllib.request import urlopen
import re
import testlibrary

In [None]:
# Build a MyTestClass instance labelled "WebArchive" and call its printtext() method.
# NOTE(review): the imports cell only does `import testlibrary`, which does not bind the
# bare name `MyTestClass` -- presumably this should be `testlibrary.MyTestClass(...)` or
# the import should be `from testlibrary import MyTestClass`; confirm against testlibrary.
MTC = MyTestClass("WebArchive")
MTC.printtext()

In [None]:
PAGE_LIMIT = 200  # Records requested per page; kept modest to avoid hitting the API too hard
TOTAL_LIMIT = 500  # No further pages are requested once this many records have been retrieved

# Fields to extract from each catalogue record; every result row holds the values in this order.
# Each entry names a top-level key of a record. A nested attribute is meant to be written as a
# list of keys, e.g. ["scopeContent", "description"] for the description inside scopeContent.
# NOTE(review): confirm that disco_search accepts list entries before relying on the nested form.
field_list = ["id","coveringDates","description", "reference", "title"]



In [None]:
def _extract_field(record, field):
    """Return record[field]; a list/tuple field is followed as a path of nested keys."""
    if isinstance(field, (list, tuple)):
        value = record
        for key in field:
            value = value[key]
        return value
    return record[field]


def disco_search(field_list, page_limit=100, total_limit=1000):
    """Search Discovery for the phrase "web archive" and return selected fields per record.

    The search string is fixed; it could be parameterised to build other queries.

    Parameters
    ----------
    field_list : list
        Fields to pull out of every record. A string is used as a top-level key;
        a list/tuple of strings is followed as a path into nested dictionaries,
        e.g. ["scopeContent", "description"].
    page_limit : int
        Number of records requested per page.
    total_limit : int
        Maximum number of rows returned. Paging stops as soon as this many rows
        have been collected, and a final page that would exceed the limit is
        truncated.

    Returns
    -------
    list of list
        One row per record, holding the values of ``field_list`` in order.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    KeyError
        If a record lacks one of the requested fields.
    """
    # Sent with every request in addition to the sps.* paging values in the URL.
    myparams = {"limit": page_limit, "batchStartMark": "*"}
    headers = {"Accept": "application/json"}  # we want the API to return data in JSON format
    base_url = "https://discovery.nationalarchives.gov.uk/API/search/records?sps.searchQuery=%22web%20archive%22"
    page = 0
    out_results = []
    # A single session is shared by all page requests and closed when paging ends.
    with requests.Session() as s:
        while len(out_results) < total_limit:
            page += 1
            url = base_url + "&sps.page=" + str(page) + "&sps.resultsPageSize=" + str(page_limit)
            r = s.get(url, headers=headers, params=myparams)
            # Raise on a 4xx/5xx answer so a failed call cannot pass silently.
            r.raise_for_status()
            rjson = r.json()
            records = rjson["records"]
            if page == 1:
                print("Total count:", rjson["count"])
                if records:  # a search with no hits has no sample record to show
                    print(records[0])
            if not records:
                break
            # Never hand back more rows than the caller asked for.
            remaining = total_limit - len(out_results)
            for rj in records[:remaining]:
                out_results.append([_extract_field(rj, x) for x in field_list])
    print("Pages:", page)
    print("Total records retrieved:", len(out_results))
    return out_results





In [None]:
# Run the "web archive" search with the configured limits; each row of search_results
# holds the field_list values for one record.
search_results = disco_search(field_list, page_limit = PAGE_LIMIT, total_limit = TOTAL_LIMIT)

Total count: 4851
{'altName': '', 'places': [], 'corpBodies': [], 'taxonomies': [], 'formerReferenceDep': '', 'formerReferencePro': 'See Annual Return 2013', 'heldBy': ['Parliamentary Archives'], 'context': '', 'content': '', 'urlParameters': '', 'department': '', 'note': '', 'adminHistory': '', 'arrangement': '', 'mapDesignation': '', 'mapScale': '', 'physicalCondition': '', 'catalogueLevel': 0, 'openingDate': '', 'closureStatus': '', 'closureType': '', 'closureCode': '', 'documentType': '', 'coveringDates': '2009-2012', 'description': 'Bicameral Records of Parliament: records of Parliamentary Web Archive', 'endDate': '31/12/2012', 'numEndDate': 20121231, 'numStartDate': 20090101, 'startDate': '01/01/2009', 'id': 'N14203445', 'reference': 'PARL/WEB/1', 'score': 0.8886901, 'source': '300', 'title': 'Houses of Parliament, Bicameral Records of Parliament: records of Parliamentary Web Archive'}
Pages: 3
Total records retrieved: 600


In [None]:
# Show the second result row; its values are in field_list order.
search_results[1]

['C14215037',
 '2000 Jan 1 - 2000 Dec 31; 2013',
 'Website of re-opened formal investigation. [Please Note: These digital records are presented via the UK Government Web Archive ].',
 'MT 205/3',
 'Website of re-opened formal investigation. [Please Note: These digital records are presented via the UK...']

In [None]:
def get_ancestors(disco_id, s=None):
    """Return the ids of a record's ancestors, nearest parent first.

    Calls the Discovery ``records/context`` endpoint for ``disco_id``, builds a
    child -> parent map from the ``id`` / ``parentId`` of every entry in the
    response, and follows that map upwards starting at ``disco_id``.

    Parameters
    ----------
    disco_id : str
        Id of the record whose ancestors are wanted.
    s : requests.Session, optional
        Session used for the request. Pass one when calling in a loop so that
        connections are reused; if omitted, a temporary session is opened and
        closed for this single call.

    Returns
    -------
    list
        Ancestor ids ordered from the immediate parent upwards. Empty when
        ``disco_id`` is not in the response or has no parent.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    headers = {"Accept": "application/json"}  # we want the API to return data in JSON format
    url = "https://discovery.nationalarchives.gov.uk/API/records/context/" + disco_id
    if s is None:
        # The session is created per call rather than as a default argument value,
        # which would be built once at definition time and shared by every call.
        with requests.Session() as own_session:
            r = own_session.get(url, headers=headers)
    else:
        r = s.get(url, headers=headers)
    # Raise on a 4xx/5xx answer so a failed call cannot pass silently.
    r.raise_for_status()

    parent_of = {rj['id']: rj['parentId'] for rj in r.json()}

    ancestor_tree = []
    seen = {disco_id}  # guards against a cyclic parent chain looping forever
    node = disco_id
    while node in parent_of:
        ancestor = parent_of[node]
        if ancestor is None or ancestor in seen:
            break
        ancestor_tree.append(ancestor)
        seen.add(ancestor)
        node = ancestor

    return ancestor_tree

# Try the lookup on two record ids and print the ancestor list returned for each
# (the 'N14203445' result is printed first, then the 'C17172' result).
ancestors = get_ancestors('C17172')
print(get_ancestors('N14203445'))
print(ancestors)

In [None]:
# Build a nested dictionary ("tree") of the search results: each record is filed under
# the chain of its ancestors, with a synthetic 'UK' node as the common root.
#
# Target format:
#{"name":"flare.animate.interpolate.ArrayInterpolator","size":1983,"imports":["flare.util.Arrays","flare.animate.interpolate.Interpolator"]}
# Source data format:
#['C17172', 'From 2006', 
#"This series contains dated gathered versions (or 'snapshots') of the Security Vetting Appeals Panel website. [Please note: These records may be accessed via the UK Government Web Archive ].",
# 'DEFE 147', 'Security Vetting Appeals Panel Website']
rows = 0
data = disco_search(field_list, page_limit = 100, total_limit = 500)
s=requests.Session()  # one session shared by every context lookup in the loop
# Look the column positions up from field_list instead of hard-coding 0 and 4.
id_col = field_list.index("id")
title_col = field_list.index("title")
tree = {}
for rows, d in enumerate(data, start=1):
    if rows % 100 == 0:
        print("Rows:", rows)
    disco_id = d[id_col]
    title = d[title_col]
    # get_ancestors returns nearest parent first; reverse it so the path reads
    # root -> ... -> parent -> record.
    ancestors = ['UK'] + get_ancestors(disco_id, s)[::-1] + [disco_id]
    this_dict = tree
    for anc in ancestors:
        # Descend one level, creating the node the first time it is seen.
        this_dict = this_dict.setdefault(anc, {})