In [23]:
import os 
import pandas as pd 
import json
import urllib.request
from ast import literal_eval
from bs4 import BeautifulSoup
from collections import Counter

In [24]:
# Merge the current and previous appointment edge lists into a single CSV.
# Both inputs are read from, and the merged file is written to, the directory
# named by the DATA_DIR environment variable.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    # Fail with a clear message rather than the opaque TypeError that
    # os.path.join raises when it is handed None.
    raise RuntimeError("The DATA_DIR environment variable is not set")
curr_app_data = os.path.join(DATA_DIR, "current_appointment_edgelist.csv")
prev_app_data = os.path.join(DATA_DIR, "previous_appointment_edgelist.csv")

# Tag every row with the file it came from so the two sources stay
# distinguishable after concatenation.
df1 = pd.read_csv(curr_app_data)
df1['app_type'] = 'current'
df2 = pd.read_csv(prev_app_data)
df2['app_type'] = 'previous'
print(df1.shape, df2.shape)
# NOTE: the row labels of the two inputs are kept as-is (not reset); the CSV
# below is written without the index, so they do not reach the output file.
df = pd.concat([df1,df2])

# Sanity-check the merge before the inputs are discarded. A failed check now
# stops the cell instead of silently skipping the save.
assert df1.shape[0] == df[df.app_type=="current"].shape[0], \
    "row count for app_type == 'current' changed during concatenation"
assert df.shape[0] == df1.shape[0] + df2.shape[0], \
    "merged frame does not contain every input row"
del df1
del df2
df.to_csv(os.path.join(DATA_DIR, "merged_appointment_edgelist.csv"), index=False)

(1848, 7) (2627, 7)


In [25]:
### Get role data from the GOV.UK ministers page HTML

In [28]:
def extract_html_links(text):
    """
    Collect the site-relative GOV.UK links found in a piece of HTML.

    Every ``<a href=...>`` in ``text`` is considered. Absolute links that point
    at ``https://www.gov.uk/`` are rewritten to their site-relative form
    (``/government/...``) *before* filtering, so they are returned alongside
    links that were already relative. Links to any other location are dropped.

    :param text: HTML markup to scan.
    :return: list of links, each starting with "/", in document order
             (empty if ``text`` is empty or could not be parsed)
    """
    if not text:
        # Nothing to parse; avoids handing None / "" to the parser.
        return []

    try:
        soup = BeautifulSoup(text, "html.parser")
        hrefs = [anchor.get('href') for anchor in soup.find_all('a', href=True)]
    except Exception as e:
        # Best effort: report the problem and return no links.
        print("error")
        print(e)
        return []

    # Normalise first, then filter. Filtering on the leading "/" before the
    # replacement would discard every absolute GOV.UK link.
    normalised = (href.replace("https://www.gov.uk/", "/") for href in hrefs)
    return [href for href in normalised if href.startswith("/")]

In [30]:
# Download the GOV.UK ministers index page and extract its site-relative links.
target = "https://www.gov.uk/government/ministers"
# The context manager closes the HTTP response as soon as the body is read.
with urllib.request.urlopen(target) as response:
    text = response.read().decode('utf-8')
llist = extract_html_links(text)

In [34]:
# Tally the "/government/..." links by their first two path segments
# (e.g. "/government/people") to see which kinds of page the index links to.
Counter(
    "/".join(href.split("/")[:3])
    for href in llist
    if href.startswith("/government")
)

Counter({'/government/organisations': 27,
         '/government/how-government-works': 2,
         '/government/get-involved': 1,
         '/government/statistics': 1,
         '/government/people': 192,
         '/government/ministers': 177})

In [45]:
# Parse the downloaded page and collect every <div> carrying the
# "ministers-by-organisation" class.
soup = BeautifulSoup(text, "html.parser")
divs = soup.find_all("div", attrs={"class": "ministers-by-organisation"})

In [104]:
# Display the raw markup of the second matched container to inspect its structure.
divs[1]

<div class="ministers-by-organisation whips">
<div class="inner-block floated-children">
<h2 id="whips">Whips</h2>
<section class="whitehall_whip_organisation" id="whitehall_whip_organisation_1">
<div class="heading">
<h3>House of Commons</h3>
</div>
<ul class="minister-list">
<li class="by-organisation_person person-excerpt clear-person" id="by-organisation_person_2277">
<div class="inner">
<div class="text">
<h4 class="current-appointee"><a href="/government/people/julian-smith"><span class="person-title">The Rt Hon</span> <strong>Julian Smith MP</strong></a></h4>
<p class="role">
<a href="/government/ministers/parliamentary-secretary-to-the-treasury-and-chief-whip">Parliamentary Secretary to the Treasury (Chief Whip)</a>
<span></span> </p>
</div>
</div>
</li>
<li class="by-organisation_person person-excerpt" id="by-organisation_person_2814">
<div class="inner">
<div class="text">
<h4 class="current-appointee"><a href="/government/people/christopher-pincher"><strong>Christopher  Pinc

In [111]:
# Build one record per person listed in the first "ministers-by-organisation"
# container: the person's link, the links of the roles listed with them, and
# the link of the organisation whose <section> they appear in.
row_list = []

for section in divs[0].find_all("section"):
    logo = section.find("a", {"class": "organisation-logo"})
    if logo is None:
        # Without the logo link there is no organisation to attribute people
        # to; report it instead of failing with AttributeError on None.
        print("error: section %r has no organisation-logo link" % section.get('id'))
        continue
    org = logo.get('href')

    # The person link is read from an <h4> and the role links from the <p>
    # that comes right after it. Pairing on tag names rather than on index
    # parity keeps people and roles aligned even when a section contains a
    # stray <h4> or <p>; an <h4> that is the last element is skipped.
    tags = section.find_all(["h4", "p"])
    for heading, following in zip(tags, tags[1:]):
        if heading.name != "h4":
            continue
        if following.name != "p":
            print("error: no role paragraph after %r" % heading.get_text(strip=True))
            continue
        people = [a.get('href') for a in heading.find_all('a', href=True)]
        if len(people) != 1:
            # A heading is expected to link to exactly one person.
            print("error: expected one person link, found %d (%s)" % (len(people), org))
            continue
        row_list.append({
            'person': people[0],
            'roles': [a.get('href') for a in following.find_all('a', href=True)],
            'organisation': org,
        })

# Passing columns= fixes the column order and still produces a frame with the
# expected columns when nothing was scraped.
df_roles = pd.DataFrame(row_list, columns=['person', 'roles', 'organisation'])

In [112]:
# Show the scraped person / roles / organisation table.
df_roles

Unnamed: 0,person,roles,organisation
0,/government/people/theresa-may,"[/government/ministers/prime-minister, /govern...",/government/organisations/cabinet-office
1,/government/people/david-lidington,[/government/ministers/minister-for-the-cabine...,/government/organisations/cabinet-office
2,/government/people/brandon-lewis,[/government/ministers/minister-without-portfo...,/government/organisations/cabinet-office
3,/government/people/dowden,[/government/ministers/parliamentary-under-sec...,/government/organisations/cabinet-office
4,/government/people/chloe-smith,[/government/ministers/parliamentary-secretary...,/government/organisations/cabinet-office
5,/government/people/kevin-foster,[/government/ministers/interim-parliamentary-s...,/government/organisations/cabinet-office
6,/government/people/philip-hammond,[/government/ministers/chancellor-of-the-exche...,/government/organisations/hm-treasury
7,/government/people/elizabeth-truss,[/government/ministers/chief-secretary-to-the-...,/government/organisations/hm-treasury
8,/government/people/jesse-norman,[/government/ministers/financial-secretary-to-...,/government/organisations/hm-treasury
9,/government/people/robert-jenrick,[/government/ministers/exchequer-secretary-to-...,/government/organisations/hm-treasury
