<a href="https://colab.research.google.com/github/mark-bell-tna/ComputationalAccess/blob/main/UKGWA_AtoZ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests;      #used for connecting to the API
import sys
from time import sleep
from math import log
import os
from urllib.request import urlopen
import re
from operator import itemgetter

In [2]:
# Work out which hosted environment (if any) the notebook is running in.
# Colab is recognised from the text of the IPython shell object, Binder from
# an environment variable; anything else is reported as "Unknown".
shell_description = str(get_ipython())
if 'google.colab' in shell_description:
    environment = "Colab"
else:
    environment = "Binder" if 'BINDER_SERVICE_HOST' in os.environ else "Unknown"

In [None]:
# For development purposes only when making changes in Github.
# Removes the local 'ComputationalAccess' directory so that a fresh copy can be
# cloned afterwards.
import shutil
# ignore_errors=True: a missing directory (e.g. a fresh runtime, or this cell
# being run twice) is not a problem here, so do not raise FileNotFoundError.
shutil.rmtree('ComputationalAccess', ignore_errors=True)

In [3]:
if environment == "Colab":
    !git clone https://github.com/mark-bell-tna/ComputationalAccess.git
    sys.path.insert(0, 'ComputationalAccess')
    github_data = "ComputationalAccess/Data/"
    os.listdir(github_data)
    # Connect to gdrive
    from google.colab import drive
    drive.mount('/content/gdrive')
    data_folder = "/content/gdrive/My Drive/Data/"
else:
    github_data = "Data/"
    data_folder = "Data/"


Cloning into 'ComputationalAccess'...
remote: Enumerating objects: 91, done.[K
remote: Counting objects: 100% (91/91), done.[K
remote: Compressing objects: 100% (89/89), done.[K
remote: Total 91 (delta 44), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (91/91), done.
Mounted at /content/gdrive


In [4]:
from ukgwa_index import UKGWAIndex
from text_utils import SuffixTree, text_to_parts
from web_structure import UKGWAStructure
from ukgwa_query import QueryEngine

In [5]:
# Get entries from AtoZ index

# False: load the index from the saved file in data_folder.
# True:  read the A to Z index from the UKGWA website instead.
refresh = False

print("Getting index...")
idx = UKGWAIndex()
if not refresh:
    # Read from a saved file
    idx.indexfromfile(data_folder + "atoz_index.txt")
else:
    # Read the A to Z index from the UKGWA website
    idx.indexfromweb()

print("Loaded index...")

Getting index...
Loaded index...


In [6]:
# Update the index entries with Discovery catalogue references read from the
# links file in the repository data folder.
# NOTE(review): the saved output of this cell is an AttributeError - confirm that
# discoveryfromfile is available on the UKGWAIndex version being imported.

idx.discoveryfromfile(github_data + "discovery_ukgwa_links.txt")

AttributeError: ignored

In [None]:
# Test the last command worked - the last item of the displayed list should be
# a catalogue reference (e.g. 'HO 437').
# If it says 'N' instead, try a few other entry numbers.
idx.index['UKGWA.100']

['UKGWA.100',
 'Advisory Panel on Country Information (APCI) (http://apci.homeoffice.gov.uk)',
 '*',
 'http://apci.homeoffice.gov.uk/',
 'HO 437']

In [None]:
# Only run this if you want to save the results of "indexfromweb" for next time.
# The target is atoz_index.txt in data_folder - the same file the index-loading
# cell reads when refresh is False.
idx.indextofile(data_folder + "atoz_index.txt")

In [7]:
# Index the text of the index to make it searchable and to get common ngrams

stopwords = {'on', 'for', 'and', 'of', '&', 'the'}
st = SuffixTree(0, stopwords = stopwords)

print("Loading suffixes...")
for key in idx:
    part_number = 0
    for part in text_to_parts(idx.get_field(key, 'TEXT')):
        # Parts whose fourth element is "web" are not indexed and do not
        # consume a part number.
        if part[3] != "web":
            part_number += 1
            # Each indexed part is referenced as "<entry key>.<part number>"
            st.add_tokens(part[0].split(" "), key + "." + str(part_number))

print("Loaded")


Loading suffixes...
Loaded


In [8]:
# Print examples of top N common ngrams and build topNindex, which maps each
# ngram's text to the list of index entries it was found in.

print("Getting phrases...")
s = st.get_phrases(30,2,10)
# Sort rows by their second element, largest first (printed next to each ngram below)
s.sort(key=itemgetter(1), reverse=True)

# TODO: Need to incorporate nearest match code to get references
# Do inside suffix tree class
N = 10
topNindex = {}
for row in s[0:N]:
    # The suffix tree was loaded with references of the form
    # "<entry key>.<part number>"; presumably get_ngram_reference returns that
    # string - confirm against SuffixTree. Strip the trailing ".<part number>"
    # with rsplit rather than a fixed two characters, so that part numbers of
    # 10 or more still map back to the right entry key.
    this_list = [idx.lookup(st.get_ngram_reference(row[2]).rsplit(".", 1)[0])]
    ngram_text = " ".join(row[0])
    print(ngram_text, row[1])
    print("\t", this_list[0])
    for r in row[4]:
        this_list.append(idx.lookup(st.get_ngram_reference(r).rsplit(".", 1)[0]))
    topNindex[ngram_text] = this_list

# Display the first entry recorded for one of the ngrams as a spot check
topNindex["National Health Service"][0]

Getting phrases...
Twitter Archive 340
	 ['UKGWA.13', 'ACAS (Advisory, Conciliation and Arbitration Service) – Twitter Archive', 'twitter', 'acasorguk', 'N']
YouTube Archive 194
	 ['UKGWA.14', 'ACAS (Advisory, Conciliation and Arbitration Service) – YouTube Archive', 'video', 'acasorguk', 'N']
National Health Service 170
	 ['UKGWA.1392', 'Department of Health (DH) – Innovation in the National Health Service (NHS)', '*', 'http://innovation.dh.gov.uk/', 'JA 346']
GOV.UK Blog 143
	 ['UKGWA.5', 'Academies and free schools – GOV.UK Blog', '*', 'https://academyschools.blog.gov.uk/', 'N']
Advisory Committee 58
	 ['UKGWA.32', 'Administration of Radioactive Substances Advisory Committee', '*', 'http://www.arsac.org.uk/', 'JA 55']
National Institute Health 48
	 ['UKGWA.3548', 'National Health Service (NHS) – National Institute for Health Research – Involve', '*', 'http://www.invo.org.uk/', 'JA 32']
Department Health 48
	 ['UKGWA.153', 'Alistair Burns (Department of Health) Blog', '*', 'http://al

['UKGWA.1392',
 'Department of Health (DH) – Innovation in the National Health Service (NHS)',
 '*',
 'http://innovation.dh.gov.uk/',
 'JA 346']

In [9]:
# Register the URL field of every index entry with a UKGWAStructure instance,
# keyed by the entry's identifier.
# NOTE(review): the saved output of this cell is an AttributeError - confirm
# against the current ukgwa_index / web_structure modules.
WS = UKGWAStructure()

for identifier in idx:
    WS.add_entry(idx.get_field(identifier, 'URL'), identifier)

AttributeError: ignored

In [None]:
# Create the query engine; later cells call set_select on it and iterate over
# it to get entry identifiers.
Q = QueryEngine()

In [None]:
# List the query engine's current selections, one per line
for selection in Q.selections:
    print(selection)

In [None]:
ngram_of_interest = "Advisory Committee"
domain_summary = {}

# Update the selection flag of the entries found for each ngram.
# Presumably False deselects and True selects, with the third argument being
# the override flag - confirm against QueryEngine.set_select.
for entry in topNindex['Twitter Archive']:
    Q.set_select(entry[0], False, override = True)
for entry in topNindex['YouTube Archive']:
    Q.set_select(entry[0], False, True)
for entry in topNindex[ngram_of_interest]:
    Q.set_select(entry[0], True, False)

# Count the identifiers yielded by Q per domain name, where the name is built
# from the first three elements of the domain tree, reversed and dot-joined.
for identifier in Q:
    netloc = WS.index[identifier].netloc
    domain_tree = WS.domaintotree(netloc, strip_www=True)
    domain_name = ".".join(reversed(domain_tree[:3]))
    domain_summary[domain_name] = domain_summary.get(domain_name, 0) + 1

# Most frequent domains first
sorted_domains = sorted(domain_summary.items(), key=itemgetter(1), reverse=True)
print(sorted_domains)
# Total number of identifiers counted (displayed as the cell result)
sum(domain_summary.values())

[('defra.gov.uk', 8), ('dh.gov.uk', 6), ('doh.gov.uk', 5), ('food.gov.uk', 5), ('ofcom.org.uk', 3), ('$.acasorguk', 2), ('$.number10gov', 2), ('$.tate', 2), ('$.dcms', 2), ('$.MHRAgovuk', 2), ('$.cabinetofficeuk', 2), ('$.spacegovuk', 2), ('$.sciencemuseum', 2), ('$.sdnpa', 2), ('$.ukhomeoffice', 2), ('$.maibgovuk', 2), ('$.bisgovuk', 2), ('$.NERCscience', 2), ('$.deccgovuk', 2), ('$.VisitBritainBiz', 2), ('$.britishcouncil', 2), ('$.VOAgovuk', 2), ('$.VisitEnglandBiz', 2), ('$.London2012', 2), ('$.CommunitiesUK', 2), ('$.ofqual', 2), ('$.networkrail', 2), ('$.BusinessLinkGov', 2), ('$.railregulation', 2), ('$.ahrcpress', 2), ('$.educationgovuk', 2), ('$.uksupremecourt', 2), ('$.dvsagovuk', 2), ('$.transportgovuk', 2), ('$.NewForestNPA', 2), ('$.britishmuseum', 2), ('$.LandRegGov', 2), ('independent.gov.uk', 2), ('mhra.gov.uk', 2), ('forestry.gov.uk', 2), ('quangos.ercouncil.org', 2), ('atipac.org.uk', 2), ('$.officestudents', 1), ('$.ucpinquiry', 1), ('$.SentencingCCL', 1), ('$.HMRCdi

595

In [None]:
# Print the index entry for the first identifier yielded by Q, as a quick check
for identifier in Q:
    print(idx.indexlookup(identifier))
    break  # only the first identifier is needed

['UKGWA.13', 'ACAS (Advisory, Conciliation and Arbitration Service) – Twitter Archive', 'twitter', 'acasorguk', 'N']


In [None]:
# Count the identifiers yielded by Q per catalogue series
series_summary = {}
for identifier in Q:
    # The fifth field of an entry holds the catalogue reference (e.g. 'HO 437',
    # or 'N'); the series is the text before the first space.
    catalogue_reference = idx.indexlookup(identifier)[4]
    series = catalogue_reference.split(" ")[0]
    series_summary[series] = series_summary.get(series, 0) + 1

# Most frequent series first
sorted_series = sorted(series_summary.items(), key=itemgetter(1), reverse=True)
print(sorted_series)

[('N', 550), ('JA', 10), ('KM', 5), ('MAF', 5), ('JB', 3), ('OCM', 3), ('AT', 2), ('CAB', 2), ('F', 2), ('DEFE', 2), ('DFT', 2), ('FM', 1), ('RH', 1), ('HO', 1), ('WA', 1), ('CLG', 1), ('SE', 1), ('BC', 1), ('SR', 1), ('PF', 1)]
