In [93]:
import os
import pandas as pd
from ast import literal_eval
from bs4 import BeautifulSoup
import requests
import urllib.request
import json
from datetime import datetime
from collections import Counter

Role: https://www.gov.uk/api/content/government/ministers/attorney-general

People: https://www.gov.uk/api/content/government/people/matthew-hancock and list of all people: https://www.gov.uk/api/content/government/people

In [2]:
DATA_DIR = os.getenv("DATA_DIR")

## Extract people-related content and metadata

In [38]:
def extract_html_links(text):
    """
    Grab any GOV.UK domain-specific (people) links from page text.
    :param text: Text within a details sub-section, refer to filtered for keys.
    :return: list of links
    """
    links = []
    try:
        soup = BeautifulSoup(text, "html.parser")
        links = [link.get('href') for link in soup.findAll('a', href=True)]
    except Exception:
        print("error")
    return [l.replace("https://www.gov.uk/", "/") for l in links
            if l.startswith("/government/people")]

In [None]:
# ## trial
# "https://www.gov.uk/government/people"
# url = requests.get("https://www.gov.uk/government/people")
# htmltext = url.text
# int("https://www.gov.uk/government/people?page=2".split("=")[-1])

In [39]:
all_people = []
ind = 1
next_page = 1
while ind == next_page:
    target = "https://www.gov.uk/government/people?page={}".format(ind)
    url = requests.get(target)
    link_list = extract_html_links(url.text)
    all_people.extend(link_list[0:-1])
    ind+=1
    next_page = int(link_list[-1].split("=")[-1])
    if ind%10==0:
        print(f"at index: {ind}")

at: 10
at: 20
at: 30
at: 40
at: 50
at: 60
at: 70


In [36]:
with open(os.path.join(DATA_DIR, "people_urls.csv"), "w") as write:
    for p in all_people:
        write.write(p+"\n")

### Use people list to call content api and store relevant content

In [46]:
people_content = []
not_found = []
for i, people_url in enumerate(all_people):
    try:
        url = "https://www.gov.uk/api/content" + people_url
        content_item = json.loads(urllib.request.urlopen(url).read())
        people_content.append(content_item)
    except Exception:
        # logger.debug("Url \'{}\' not found".format(url))
        not_found.append(people_url)
    if i%500==0:
        print(f'at: {i}')
        
df = pd.DataFrame(people_content)
print('saving output...')
df.to_csv(os.path.join(DATA_DIR, "people_content_store.csv"), index=False)

at: 0
at: 500
at: 1000
at: 1500
at: 2000
at: 2500
at: 3000
at: 3500


In [48]:
len(not_found), not_found

(2,
 ['/government/people/david-whitehall', '/government/people/matthew-purves'])

### Preprocess appointment data in people content store data

In [58]:
df.iloc[0]

analytics_identifier                                                               None
base_path                                               /government/people/aarti-thakor
content_id                                         6ee62b76-59da-4dab-a977-a2a489ef713e
description                                                                        None
details                               {'image': {'url': 'https://assets.publishing.s...
document_type                                                                    person
first_published_at                                        2018-05-21T10:07:35.000+00:00
links                                 {'ordered_current_appointments': [{'content_id...
locale                                                                               en
phase                                                                              live
public_updated_at                                         2018-05-21T10:07:34.000+00:00
publishing_app                  

In [51]:
df.iloc[0].links['ordered_current_appointments']

[{'content_id': 'b2473998-5b17-46b8-a29a-1ee845590b44',
  'document_type': 'role_appointment',
  'locale': 'en',
  'public_updated_at': '2018-06-08T09:52:49Z',
  'schema_name': 'role_appointment',
  'title': 'Aarti Thakor - Director of Legal Services',
  'withdrawn': False,
  'details': {'started_on': '2018-05-21T00:00:00+01:00', 'ended_on': None},
  'links': {}}]

In [64]:
links_col = df.columns.get_loc('links')
base_path_col = df.columns.get_loc('base_path')
apps = {}
for tup in df.itertuples(index=False):
    apps[tup[base_path_col]] = {"ordered_previous_appointments": [],
                                "ordered_current_appointments": []}
    if "ordered_previous_appointments" in tup[links_col].keys():
        apps[tup[base_path_col]]['ordered_previous_appointments'] = \
                                    tup[links_col]['ordered_previous_appointments']
    if "ordered_current_appointments" in tup[links_col].keys():
        apps[tup[base_path_col]]['ordered_current_appointments'] = \
                                    tup[links_col]['ordered_current_appointments']


In [65]:
df['current_appointments'] = df['base_path'].map(lambda x: apps[x]\
                                                 ['ordered_current_appointments'])
df['previous_appointments'] = df['base_path'].map(lambda x: apps[x]\
                                                 ['ordered_previous_appointments'])

In [66]:
df

Unnamed: 0,analytics_identifier,base_path,content_id,description,details,document_type,first_published_at,links,locale,phase,...,publishing_scheduled_at,redirects,rendering_app,scheduled_publishing_delay_seconds,schema_name,title,updated_at,withdrawn_notice,current_appointments,previous_appointments
0,,/government/people/aarti-thakor,6ee62b76-59da-4dab-a977-a2a489ef713e,,{'image': {'url': 'https://assets.publishing.s...,person,2018-05-21T10:07:35.000+00:00,{'ordered_current_appointments': [{'content_id...,en,live,...,,,whitehall-frontend,,person,Aarti Thakor,2019-06-07T10:31:06.519Z,{},[{'content_id': 'b2473998-5b17-46b8-a29a-1ee84...,[]
1,,/government/people/abigail-seager,226a7fc5-1bb3-4916-b921-cbe9132706e7,,{'image': {'url': 'https://assets.publishing.s...,person,2018-04-30T09:40:24.000+00:00,{'ordered_current_appointments': [{'content_id...,en,live,...,,,whitehall-frontend,,person,Abigail Seager,2019-06-07T10:30:23.063Z,{},[{'content_id': '573c7621-6160-4df7-9029-8ffab...,[]
2,,/government/people/adam-sewell-jones,f7c098f9-fba8-4721-b3f8-edcb9890f665,,{'image': {'url': 'https://assets.publishing.s...,person,2015-05-27T17:05:42.000+00:00,{'ordered_current_appointments': [{'content_id...,en,live,...,,,whitehall-frontend,,person,Adam Sewell-Jones,2019-06-07T10:18:55.867Z,{},[{'content_id': '5b05bc67-27ee-4b41-b9df-2cfba...,[]
3,,/government/people/adam-williams,bab0f5d6-782e-4fae-bb5c-a1b60586bcc2,,{'image': {'url': 'https://assets.publishing.s...,person,2017-11-14T11:53:38.000+00:00,{'ordered_current_appointments': [{'content_id...,en,live,...,,,whitehall-frontend,,person,Adam Williams,2019-06-07T11:22:21.438Z,{},[{'content_id': '0a667a38-3828-45dd-bded-8fd70...,[]
4,,/government/people/adam-kettle-williams,6c60a966-a9a0-4923-9ac7-4cb6d8f39e30,,{'image': {'url': 'https://assets.publishing.s...,person,2015-04-29T14:05:47.000+00:00,{'ordered_previous_appointments': [{'content_i...,en,live,...,,,whitehall-frontend,,person,Adam Kettle-Williams,2019-06-07T10:16:42.846Z,{},[],[{'content_id': '2bf50ed9-14d5-43a8-a643-457cc...
5,,/government/people/adam-sambrook,853fc9e1-c0f1-11e4-8223-005056011aef,,{'image': {'url': 'https://assets.publishing.s...,person,2014-05-15T13:49:09.000+00:00,{'ordered_previous_appointments': [{'content_i...,en,live,...,,,whitehall-frontend,,person,Adam Sambrook,2019-06-07T10:41:24.446Z,{},[],[{'content_id': '391b7270-1c8d-4b28-ac18-09e19...
6,,/government/people/adam-singer,853ff18d-c0f1-11e4-8223-005056011aef,,{'image': {'url': 'https://assets.publishing.s...,person,2014-06-24T14:55:42.000+00:00,{'ordered_current_appointments': [{'content_id...,en,live,...,,,whitehall-frontend,,person,Adam Singer,2019-06-07T10:08:06.894Z,{},[{'content_id': '1f3a0e8c-121f-4c58-81f0-faa41...,[]
7,,/government/people/adele-downey,8534dd4b-c0f1-11e4-8223-005056011aef,,{'image': {'url': 'https://assets.publishing.s...,person,2013-03-19T17:16:56.000+00:00,{'ordered_previous_appointments': [{'content_i...,en,live,...,,,whitehall-frontend,,person,Adele Downey,2019-06-07T10:40:59.277Z,{},[],[{'content_id': '1e7e653b-a1b0-4b48-8b9b-e5de5...
8,,/government/people/george-zambellas,85374886-c0f1-11e4-8223-005056011aef,,{'image': {'url': 'https://assets.publishing.s...,person,2013-05-01T15:14:47.000+00:00,{'ordered_previous_appointments': [{'content_i...,en,live,...,,,whitehall-frontend,,person,Admiral Sir George Zambellas KCB DSC ADC,2019-06-07T10:41:03.401Z,{},[],[{'content_id': 'd9c1f6e7-26cf-40ed-b565-d00f2...
9,,/government/people/mark-stanhope,852c1cdd-c0f1-11e4-8223-005056011aef,,{'image': {'url': 'https://assets.publishing.s...,person,2012-02-28T13:08:49.000+00:00,{'ordered_previous_appointments': [{'content_i...,en,live,...,,,whitehall-frontend,,person,Admiral Sir Mark Stanhope GCB OBE ADC,2019-06-07T09:55:31.175Z,{},[],[{'content_id': '63fcf25e-4f8d-4f7e-a189-55d1a...


In [67]:
print('saving output with appointments...')
df.to_csv(os.path.join(DATA_DIR, "people_content_store.csv"), index=False)

saving output with appointments...


In [74]:
def extract_title(app_list, all_list):
    if len(app_list) > 0:
        all_list.extend([item['title'] for item in app_list])

In [78]:
current_app_col = df.columns.get_loc('current_appointments')
previous_app_col = df.columns.get_loc('previous_appointments')
titles = []
for tup in df.itertuples(index=False):
    extract_title(tup[current_app_col], titles)
    extract_title(tup[previous_app_col], titles)
titles = list(set(titles))

In [79]:
len(titles)

4377

In [100]:
titles[0:10]

['The Rt Hon David Cameron - First Lord of the Treasury',
 'Gill  Fraser - Deputy Head of Mission, France',
 'Lucy Wylde - General Counsel',
 'Frank  Baker - British Ambassador to Iraq',
 'Gerard Connell - Non-executive Board Member',
 'Simon  Blanchflower - Chief Executive Officer, East West Rail Company',
 'The Rt Hon Greg Clark MP - Financial Secretary to the Treasury',
 'Colin Dick - Deputy Head of Mission, British High Commission Bridgetown',
 'Dr Carole  Crofts - British Ambassador to Azerbaijan',
 'Shalini Khemka - Non-executive Board Director']

In [94]:
clean_titles = [" ".join(title.split(" - ")[1:]) for title in titles]
title_counts = Counter(clean_titles)

In [108]:
for i,title in enumerate([title for title in titles if 
                        title.split(" - ")[-1]=="Prime Minister"]):
    print(f'{i+1}. {title}')

1. The Rt Hon Theresa May MP - Prime Minister
2. Stanley Baldwin - Prime Minister
3. George Hamilton Gordon Earl of Aberdeen - Prime Minister
4. William Cavendish-Bentinck  Duke of Portland - Prime Minister
5. Henry Addington 1st Viscount Sidmouth - Prime Minister
6. Archibald Primrose, 5th Earl of Rosebery - Prime Minister
7. James Ramsay MacDonald - Prime Minister
8. George Canning - Prime Minister
9. Augustus Henry Fitzroy, 3rd Duke of Grafton - Prime Minister
10. Sir  Edward Heath - Prime Minister
11. Charles Grey, 2nd Earl Grey - Prime Minister
12. William  Lamb, 2nd Viscount Melbourne - Prime Minister
13. Sir Winston Churchill - Prime Minister
14. William  Pitt 'The Younger' - Prime Minister
15. David  Lloyd George - Prime Minister
16. Gordon Brown - Prime Minister
17. Harold Macmillan - Prime Minister
18. Sir Alec Douglas-Home - Prime Minister
19. Sir  Robert Peel 2nd Baronet - Prime Minister
20. Lord John Russell, 1st Earl Russell - Prime Minister
21. Sir Robert Walpole - Prime

In [95]:
len(clean_titles), len(set(clean_titles)), len(title_counts)

(4377, 1898, 1898)

In [99]:
for key,value in title_counts.most_common(10):
    print(f'{key}: {value}')

Non-executive board member: 64
Member: 62
Prime Minister: 54
Non-executive Director: 52
Assistant Government Whip: 38
Permanent Secretary: 33
Government Whip, Lord Commissioner of HM Treasury: 31
Chief Executive: 31
Chief Operating Officer: 30
Chair: 27


In [101]:
"Attorney General" in title_counts.keys()

True