<a href="https://colab.research.google.com/github/mark-bell-tna/ComputationalAccess/blob/main/UKGWA_AtoZ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests;      #used for connecting to the API
import sys
from time import sleep
from math import log
import os
from urllib.request import urlopen
import re
from operator import itemgetter

In [6]:
import shutil

# Remove any checkout left over from a previous session so the clone in the
# next cell starts from a clean directory. On a fresh runtime the directory
# does not exist yet, and an unguarded rmtree would raise FileNotFoundError
# and stop "Run All" - so only delete it when it is actually there.
if os.path.isdir('ComputationalAccess'):
    shutil.rmtree('ComputationalAccess')

In [7]:
# Clone the ComputationalAccess repository into the working directory.
# Later cells add this folder to sys.path and read a file from its Data/ folder.
!git clone https://github.com/mark-bell-tna/ComputationalAccess.git

Cloning into 'ComputationalAccess'...
remote: Enumerating objects: 65, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 65 (delta 29), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (65/65), done.


In [8]:
# Put the cloned repository first on the module search path so its modules
# can be imported, then display the contents of its Data folder.
repo_dir = 'ComputationalAccess'
sys.path.insert(0, repo_dir)
os.listdir(repo_dir + "/Data")

['discovery_ukgwa_links.txt']

In [9]:
from ukgwa_index import UKGWAIndex
from text_utils import SuffixTree, text_to_parts
from web_structure import UKGWAStructure

In [10]:
# Detect Google Colab by looking for 'google.colab' in the string form of the
# active IPython shell object.
ipython_shell = get_ipython()
RunningInCOLAB = 'google.colab' in str(ipython_shell)
print("Running in colab?:", RunningInCOLAB)
print("iPython string:", ipython_shell)

Running in colab?: True
iPython string: <google.colab._shell.Shell object at 0x7f526cbe87f0>


In [11]:
# Choose where data files live: the current directory by default, or the
# user's Google Drive when running in Colab.
root_folder = "./"
if RunningInCOLAB:
    # Connect to gdrive
    from google.colab import drive
    drive.mount('/content/gdrive')
    root_folder = "/content/gdrive/My Drive/"

data_folder = root_folder + "Data/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [12]:
# Load the A to Z index entries.
# Set refresh = True to re-read the index from the UKGWA website;
# leave it False to load the previously saved copy from data_folder.

refresh = False

print("Getting index...")
idx = UKGWAIndex()
if not refresh:
    saved_index_path = data_folder + "atoz_index.txt"
    idx.indexfromfile(saved_index_path)  # Read from a saved file
else:
    idx.indexfromweb()  # Read the A to Z index from the UKGWA website

print("Loaded index...")

Getting index...
Loaded index...


In [13]:
# Update the index entries with Discovery catalogue references, read from
# the links file shipped in the cloned repository.

discovery_links_path = "ComputationalAccess/Data/discovery_ukgwa_links.txt"
idx.discoveryfromfile(discovery_links_path)

In [14]:
# Sanity check that the Discovery update worked: display a single index entry.
# The last element of the list should be a catalogue reference (e.g. 'HO 437').
# If it shows 'N' instead (presumably no reference was matched for that entry),
# try a few other entry numbers.
idx.index['UKGWA.100']

['UKGWA.100',
 'Advisory Panel on Country Information (APCI) (http://apci.homeoffice.gov.uk)',
 '*',
 'http://apci.homeoffice.gov.uk/',
 'HO 437']

In [15]:
# Save the index for next time, but only when it was freshly downloaded with
# "indexfromweb" (refresh = True). Guarding on the flag means a full
# "Run All" with refresh = False no longer rewrites the saved file.
if refresh:
    idx.indextofile(data_folder + "atoz_index.txt")

In [16]:
# Build a suffix tree over the text of every index entry so that it can be
# searched and mined for common ngrams.

# Words passed to SuffixTree as its second argument
# (presumably stop words to skip - confirm against text_utils).
stop_words = {'on', 'for', 'and', 'of', '&', 'the'}
st = SuffixTree(0, stop_words)

print("Loading suffixes...")
for entry in idx:
    part_number = 0
    for part in text_to_parts(entry[1]):
        # Parts tagged "web" are not indexed and do not consume a part number.
        if part[3] != "web":
            part_number += 1
            # Reference format: "<entry id>.<part number>"
            st.add_tokens(part[0].split(" "), entry[0] + "." + str(part_number))

print("Loaded")


Loading suffixes...
Loaded


In [22]:
# Print examples of top N common ngrams and collect, for each one, the index
# entries it occurs in.

def _index_entry(ngram_ref):
    """Return the A to Z index entry that an ngram reference points back to.

    References are registered with the suffix tree as
    "<entry id>.<part number>", so the entry id is everything before the
    final ".". Stripping a fixed two characters would only be correct while
    the part number is a single digit.
    Assumes get_ngram_reference returns that registered string - TODO confirm.
    """
    reference = st.get_ngram_reference(ngram_ref)
    return idx.indexlookup(reference.rsplit(".", 1)[0])


print("Getting phrases...")
s = st.get_phrases(30,2,10)
# Most frequent phrases first (field 1 is the value printed next to each ngram).
s.sort(key=itemgetter(1), reverse=True)

# TODO: Need to incorporate nearest match code to get references
# Do inside suffix tree class
N = 10
topNindex = {}
for row in s[0:N]:
    ngram_text = " ".join(row[0])
    # row[2] gives the first index entry for the ngram; row[4] holds further
    # references that are resolved the same way.
    this_list = [_index_entry(row[2])]
    print(ngram_text, row[1])
    print("\t", this_list[0])
    this_list.extend(_index_entry(r) for r in row[4])
    topNindex[ngram_text] = this_list

# Display the first entry recorded for one example ngram.
topNindex["National Health Service"][0]

Getting phrases...
Twitter Archive 340
	 ['UKGWA.13', 'ACAS (Advisory, Conciliation and Arbitration Service) – Twitter Archive', 'twitter', 'acasorguk', 'N']
YouTube Archive 194
	 ['UKGWA.14', 'ACAS (Advisory, Conciliation and Arbitration Service) – YouTube Archive', 'video', 'acasorguk', 'N']
National Health Service 170
	 ['UKGWA.1392', 'Department of Health (DH) – Innovation in the National Health Service (NHS)', '*', 'http://innovation.dh.gov.uk/', 'JA 346']
GOV.UK Blog 143
	 ['UKGWA.5', 'Academies and free schools – GOV.UK Blog', '*', 'https://academyschools.blog.gov.uk/', 'N']
Advisory Committee 58
	 ['UKGWA.32', 'Administration of Radioactive Substances Advisory Committee', '*', 'http://www.arsac.org.uk/', 'JA 55']
National Institute Health 48
	 ['UKGWA.3548', 'National Health Service (NHS) – National Institute for Health Research – Involve', '*', 'http://www.invo.org.uk/', 'JA 32']
Department Health 48
	 ['UKGWA.153', 'Alistair Burns (Department of Health) Blog', '*', 'http://al

['UKGWA.1392',
 'Department of Health (DH) – Innovation in the National Health Service (NHS)',
 '*',
 'http://innovation.dh.gov.uk/',
 'JA 346']

In [27]:
# Load every index entry into the web structure: the URL (field 3) is
# registered under the entry's identifier (field 0).
WS = UKGWAStructure()

for entry in idx:
    WS.loadurl(entry[3], entry[0])

In [37]:
# Count, per domain, the index entries that contain the ngram of interest.
ngram_of_interest = "Advisory Committee"
domain_summary = {}

# Entries already listed under the Twitter / YouTube archive ngrams are skipped.
ignore = [x[0] for key in ('Twitter Archive', 'YouTube Archive') for x in topNindex[key]]

for entry in topNindex[ngram_of_interest]:
    identifier = entry[0]
    if identifier not in ignore:
        url_parts = WS.index[identifier]
        dt = WS.domaintotree(url_parts.netloc, strip_www=True)
        # Join the first three elements of the domain tree in reverse order
        # to form the domain name used as the tally key.
        dom_name = ".".join(reversed(dt[:3]))
        domain_summary[dom_name] = domain_summary.get(dom_name, 0) + 1

# Most common domains first
sorted_domains = sorted(domain_summary.items(), key=itemgetter(1), reverse=True)
print(sorted_domains)

[('defra.gov.uk', 8), ('dh.gov.uk', 6), ('doh.gov.uk', 5), ('food.gov.uk', 5), ('ofcom.org.uk', 3), ('independent.gov.uk', 2), ('mhra.gov.uk', 2), ('forestry.gov.uk', 2), ('quangos.ercouncil.org', 2), ('atipac.org.uk', 2), ('arsac.org.uk', 1), ('hse.gov.uk', 1), ('honours.gov.uk', 1), ('dptac.gov.uk', 1), ('english-heritage.org.uk', 1), ('ssac.org.uk', 1), ('blog.gov.uk', 1), ('$.acme-uk.org', 1), ('homeoffice.gov.uk', 1), ('wales.gov.uk', 1), ('acnfp.gov.uk', 1), ('communities.gov.uk', 1), ('sacn.gov.uk', 1), ('pesticides.gov.uk', 1), ('lawcom.gov.uk', 1), ('deac.org.uk', 1), ('pasa.nhs.uk', 1), ('dwp.gov.uk', 1), ('seac.gov.uk', 1), ('acoba.gov.uk', 1), ('berr.gov.uk', 1), ('plr.uk.com', 1)]


In [38]:
# Count, per catalogue series, the index entries that contain the ngram of
# interest. The series is the text before the first space of the catalogue
# reference held in field 4 of each entry.
series_summary = {}

# Entries already listed under the Twitter / YouTube archive ngrams are skipped.
ignore = [x[0] for key in ('Twitter Archive', 'YouTube Archive') for x in topNindex[key]]

for entry in topNindex[ngram_of_interest]:
    identifier = entry[0]
    if identifier not in ignore:
        series = entry[4].split(" ")[0]
        series_summary[series] = series_summary.get(series, 0) + 1

# Most common series first
sorted_series = sorted(series_summary.items(), key=itemgetter(1), reverse=True)
print(sorted_series)

[('N', 14), ('JA', 10), ('KM', 5), ('MAF', 5), ('JB', 3), ('OCM', 3), ('AT', 2), ('CAB', 2), ('F', 2), ('DEFE', 2), ('DFT', 2), ('FM', 1), ('RH', 1), ('HO', 1), ('WA', 1), ('CLG', 1), ('SE', 1), ('BC', 1), ('SR', 1), ('PF', 1)]
