In [59]:
import os 
import pandas as pd
import numpy as np
from collections import Counter
from ast import literal_eval
import re

In [68]:
# Directory holding the input CSVs (and, later, the exported edge lists).
# It is read from the environment so no machine-specific path is hard-coded.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    # Without this check os.path.join(None, ...) fails with an opaque TypeError.
    raise EnvironmentError(
        "The DATA_DIR environment variable is not set; "
        "point it at the directory containing the content store CSVs."
    )

content_path = os.path.join(DATA_DIR, "preprocessed_content_store_en_june.csv.gz")
map_path = os.path.join(DATA_DIR, "base_path_cid_map.csv")

# Content store: one row per page, nested fields stored as Python-literal strings.
df = pd.read_csv(content_path, compression="gzip")
# Lookup table linking base paths (and their parts) to content ids.
mappings = pd.read_csv(map_path)

In [69]:
# List the column names of the content store frame.
df.columns

Index(['base_path', 'content_id', 'title', 'description', 'document_type',
       'orgs_id', 'orgs_title', 'sbs_details', 'pages_part_of_step_nav',
       'text', 'taxons', 'locale'],
      dtype='object')

In [70]:
def _parse_literal(value):
    """Turn a Python-literal string read from the CSV into the object it describes.

    Strings are parsed with ``ast.literal_eval``; floats (how pandas represents
    an empty cell) become ``np.nan``. Any other value is assumed to have been
    parsed already and is returned unchanged, so re-running this cell is safe.
    """
    if isinstance(value, str):
        return literal_eval(value)
    if isinstance(value, float):
        return np.nan
    return value


# Columns whose cells hold dicts/lists serialised as strings.
columns = ['orgs_id', 'orgs_title', 'sbs_details', 'pages_part_of_step_nav','taxons']
for col in columns:
    print(col)
    df[col] = df[col].map(_parse_literal)

orgs_id
orgs_title
sbs_details
pages_part_of_step_nav
taxons


In [71]:
# Preview the first rows of the content store frame.
df.head()

Unnamed: 0,base_path,content_id,title,description,document_type,orgs_id,orgs_title,sbs_details,pages_part_of_step_nav,text,taxons,locale
0,/aaib-reports/aaib-investigation-to-hawker-sea...,96eacfbe-0385-45ef-9289-8428dacad258,"AAIB investigation to Hawker Sea Fury T MK 20,...","Engine failure and landing gear collapse, RNAS...",aaib_report,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: The aircraft was performing in a publ...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en
1,/aaib-reports/aaib-investigation-to-hph-glasfl...,1d697c99-b1d0-4855-b72d-a97d83a4fc91,"AAIB investigation to HPH Glasflugel 304 eS, G...",Front Electric Sustainer (FES) battery fire du...,aaib_report,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: During a normal touchdown following a...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en
2,/aaib-reports/aaib-investigation-to-ikarus-c42...,5814334a-77d0-426e-8e78-ff2b05ea6322,"AAIB investigation to Ikarus C42 FB UK, G-IKUS\t",Aircraft crashed whilst avoiding a hedge when ...,aaib_report,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: The pilot was attempting to take off ...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en
3,/aaib-reports/aaib-investigation-to-ikarus-c42...,c8f31c76-eab0-4be6-95a5-5e6e7f32056c,"AAIB investigation to Ikarus C42 FB100, G-CEHG\t","Overturned on landing, Farm Strip, Hardwicke, ...",aaib_report,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: G-CEHG was landing on a private airst...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en
4,/aaib-reports/aaib-investigation-to-ikarus-c42...,4cc0ae15-ad87-42ce-8a89-2bdb14e19e26,"AAIB investigation to Ikarus C42 FB100, G-ZAVI","Aircraft struck sheep on landing, Lundy Island...",aaib_report,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,,,Summary: The pilot reported that this was his ...,"[{'title': 'Transport', 'content_id': 'a4038b2...",en


### Create edgelist for `content_id` --> `step` (many to one)
#### Order of steps from `sbs_details`, node/page info (`base_path`, `content_id`)

In [72]:
# Pick the third row that carries step-by-step details to use as a worked example.
has_sbs_details = df['sbs_details'].notna()
example = df.loc[has_sbs_details].iloc[2]

In [73]:
# Display the selected example row.
example

base_path                           /apply-30-hours-free-tax-free-childcare
content_id                             f517cd57-3c18-4bb9-aa8b-1b907e279bf9
title                             Get 30 hours free childcare: step by step
description               How to apply and get a code for 30 hours free ...
document_type                                              step_by_step_nav
orgs_id                   {'organisations': ['77c1621f-a392-4393-9d8c-99...
orgs_title                {'organisations': ['Department for Education a...
sbs_details               {'title': 'Get 30 hours free childcare: step b...
pages_part_of_step_nav    [{'analytics_identifier': None, 'api_path': '/...
text                                                                    NaN
taxons                                                                  NaN
locale                                                                   en
Name: 11488, dtype: object

In [74]:
# Show the list stored under the 'steps' key of the example's sbs_details.
example.sbs_details['steps']

[{'title': "Check you're eligible",
  'contents': [{'type': 'list',
    'contents': [{'text': "Check if you're eligible for 30 hours free childcare",
      'href': '/30-hours-free-childcare'},
     {'text': 'Find out what types of childcare you can use',
      'href': '/help-with-childcare-costs'}]}],
  'optional': False},
 {'title': "Check if you'll be better off",
  'contents': [{'type': 'paragraph',
    'text': 'Getting 30 hours free childcare might affect how much you get in tax credits.'},
   {'type': 'list',
    'contents': [{'text': 'Check if you’ll be better off',
      'href': '/childcare-calculator'}]}],
  'optional': False},
 {'title': 'Apply for 30 hours free childcare',
  'contents': [{'type': 'paragraph',
    'text': "There's one application for 30 hours free childcare and Tax-Free Childcare. As part of your application, you will find out if you can get both."},
   {'type': 'list',
    'contents': [{'text': 'Apply online',
      'href': '/apply-30-hours-free-childcare'}]}

In [94]:
# Build the full path of every mapping row: "<base_path>/<part>" when the row
# has a part, otherwise just the base_path. A missing part is read by pandas
# as a float NaN, hence the isinstance check.
# Iterating over the two columns by name avoids positional indexing (x[0], x[1])
# on a label-indexed row, which newer pandas versions deprecate.
mappings['full_part_path'] = [
    base_path if isinstance(part, float) else "/".join((base_path, part))
    for base_path, part in zip(mappings['base_path'], mappings['part'])
]

In [95]:
# Look at the raw base_path values in the mapping table.
mappings.base_path.values

array(['/aaib-reports/aaib-investigation-to-hawker-sea-fury-t-mk-20-g-rnhf',
       '/aaib-reports/aaib-investigation-to-hph-glasflugel-304-es-g-gsgs',
       '/aaib-reports/aaib-investigation-to-ikarus-c42-fb-uk-g-ikus', ...,
       '/yswiriant-gwladol', '/zoo-licence',
       '/zoo-licence-northern-ireland'], dtype=object)

Create a dictionary that maps the full path of each "part" (`base_path/part`), as well as each plain `base_path`, to the `content_id` it refers to.

In [100]:
# Lookup from path to content_id. Part-level paths go in first, then the plain
# base paths are added so that links to either form can be resolved.
# Built directly from the columns: transposing the frame into a single very
# wide row just to read it back as a dict is unnecessary work.
map_dict = dict(zip(mappings['full_part_path'], mappings['content_id']))
map_dict.update(zip(mappings['base_path'], mappings['content_id']))

### Create edgelist
Also map each `href` to its `content_id`, where the path is present in `map_dict`.

In [101]:
# Show all mapping rows for a single base_path to see how its parts are represented.
mappings[mappings.base_path=="/adi-standards-check"]

Unnamed: 0,base_path,content_id,part,full_part_path
8744,/adi-standards-check,2574c147-77ee-4c4d-b149-546d508cb65d,overview,/adi-standards-check/overview
8745,/adi-standards-check,2574c147-77ee-4c4d-b149-546d508cb65d,book-adi-standards-check,/adi-standards-check/book-adi-standards-check
8746,/adi-standards-check,2574c147-77ee-4c4d-b149-546d508cb65d,what-to-take-to-your-standards-check,/adi-standards-check/what-to-take-to-your-stan...
8747,/adi-standards-check,2574c147-77ee-4c4d-b149-546d508cb65d,what-happens-at-the-standards-check,/adi-standards-check/what-happens-at-the-stand...
8748,/adi-standards-check,2574c147-77ee-4c4d-b149-546d508cb65d,your-standards-check-grade,/adi-standards-check/your-standards-check-grade
8749,/adi-standards-check,2574c147-77ee-4c4d-b149-546d508cb65d,old-adi-check-test-grades,/adi-standards-check/old-adi-check-test-grades


In [102]:
# Peek at the first five (path, content_id) pairs of the lookup.
list(map_dict.items())[0:5]

[('/aaib-reports/aaib-investigation-to-hawker-sea-fury-t-mk-20-g-rnhf',
  '96eacfbe-0385-45ef-9289-8428dacad258'),
 ('/aaib-reports/aaib-investigation-to-hph-glasflugel-304-es-g-gsgs',
  '1d697c99-b1d0-4855-b72d-a97d83a4fc91'),
 ('/aaib-reports/aaib-investigation-to-ikarus-c42-fb-uk-g-ikus',
  '5814334a-77d0-426e-8e78-ff2b05ea6322'),
 ('/aaib-reports/aaib-investigation-to-ikarus-c42-fb100-g-cehg',
  'c8f31c76-eab0-4be6-95a5-5e6e7f32056c'),
 ('/aaib-reports/aaib-investigation-to-ikarus-c42-fb100-g-zavi',
  '4cc0ae15-ad87-42ce-8a89-2bdb14e19e26')]

In [79]:
# Sanity-check the pattern used below to recognise internal GOV.UK hrefs.
# NOTE(review): the dots in the pattern are unescaped, so each one matches any
# character, not only a literal "." — confirm whether that is intended.
bool(re.match("^(/|www.gov.uk)","www.gov.uk"))

True

In [104]:
# Hrefs that point at GOV.UK itself: site-relative paths or bare "www.gov.uk"
# links. The dots are escaped so only the literal host name matches.
_GOV_UK_HOST = "www.gov.uk"
_INTERNAL_HREF = re.compile(r"^(/|www\.gov\.uk)")

rowlist = []
sbs_details_col = df.columns.get_loc('sbs_details')
cid_col = df.columns.get_loc('content_id')
base_path_col = df.columns.get_loc('base_path')
missing = []     # internal hrefs with no entry in map_dict
outerlinks = []  # hrefs that do not look like GOV.UK links


def _iter_step_links(sbs_details):
    """Yield ``(step_number, step_title, href)`` for every link in a step-by-step.

    ``sbs_details`` is the parsed dict of one step-by-step page. Steps are
    numbered from 1 in the order they appear under its ``'steps'`` key.
    """
    for step_number, step in enumerate(sbs_details['steps'], start=1):
        for content in step['contents']:
            # Entries without nested 'contents' (e.g. plain paragraphs) hold no links.
            for item in content.get('contents', []):
                if 'href' in item:
                    yield step_number, step['title'], item['href']


for tup in df.itertuples(index=False):
    sbs_details = tup[sbs_details_col]
    if isinstance(sbs_details, float):
        # NaN: this page has no step-by-step details.
        continue
    for step_number, step_title, href in _iter_step_links(sbs_details):
        if not _INTERNAL_HREF.match(href):
            outerlinks.append(href)
            continue
        # map_dict is keyed by site-relative path, so drop a leading host name
        # before looking the link up; the original href is still what gets stored.
        lookup_path = href[len(_GOV_UK_HOST):] if href.startswith(_GOV_UK_HOST) else href
        if lookup_path in map_dict:
            task_cid = map_dict[lookup_path]
        else:
            task_cid = np.nan
            missing.append(href)
        rowlist.append({
            'content_id': tup[cid_col],
            'base_path': tup[base_path_col],
            'step_title': step_title,
            'step_number': step_number,
            'task_base_path': href,
            'task_cid': task_cid})

# Explicit columns keep the frame's schema stable even when no links were found.
df_steps = pd.DataFrame(rowlist, columns=['content_id', 'base_path', 'step_title',
                                          'step_number', 'task_base_path', 'task_cid'])

In [105]:
# Shape of the edge rows whose task href could not be resolved to a content_id.
df_steps[df_steps.task_cid.isna()].shape

(6, 6)

In [106]:
# Unresolved edge rows next to the full edge list, to compare their sizes.
df_steps[df_steps.task_cid.isna()].shape, df_steps.shape

((6, 6), (490, 6))

In [108]:
# Distinct internal hrefs that had no entry in map_dict.
set(missing)

{'/contact-the-dvla/y/vehicle-tax-and-sorn',
 '/dbs-check-applicant-criminal-record/how-to-apply-for-a-check',
 '/foreign-travel-advice',
 '/government/publications/uk-trade-tariff-excise-duties-reliefs-drawbacks-and-allowances/uk-trade-tariff-excise-duties-reliefs-drawbacks-and-allowances',
 '/topic/driving-motorcyle-instructors/improving-training-skills'}

### Save the step-by-step edge lists and step node list

In [None]:
# sbs_content_path = os.path.join(DATA_DIR, "preprocessed_step_by_step_content_en_june.csv.gz")

In [113]:
# Edge list: task page (href and resolved content_id) --> title of the step it belongs to.
cid_to_step_path = os.path.join(DATA_DIR, "task_cid_is_part_of_step_title_edgelist.csv")
task_to_step_edges = df_steps.loc[:, ['task_base_path', 'task_cid', 'step_title']]
task_to_step_edges.to_csv(cid_to_step_path, index=False)

In [130]:
# Node list of steps: one row per distinct step title, with a running step_id.
# NOTE(review): de-duplication is by title only, so identically titled steps
# from different step-by-step pages collapse into one node — confirm intended.
unique_steps = df_steps[['step_title', 'step_number']].drop_duplicates('step_title')
node_list = (
    unique_steps
    .reset_index(drop=True)
    .rename_axis('step_id')
    .reset_index()
)
step_nodelist_path = os.path.join(DATA_DIR, "step_title_nodelist.csv")
node_list.to_csv(step_nodelist_path, index=False)

In [131]:
# Edge list: step title --> the step-by-step page (base_path, content_id) it is part of.
# NOTE(review): only the first row per step title is kept, so a title shared by
# several step-by-step pages is linked to just one of them — confirm intended.
step_title_to_cid_path = os.path.join(
    DATA_DIR, "step_title_is_part_of_step_by_step_cid_edgelist.csv"
)
step_edge_columns = ['step_title', 'step_number', 'base_path', 'content_id']
step_to_sbs_edges = df_steps.drop_duplicates('step_title')[step_edge_columns]
step_to_sbs_edges.to_csv(step_title_to_cid_path, index=False)