### Download and Save Example Data

In [1]:
import requests
import json
import zipfile
import io
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [2]:
# UK Companies House "Persons with Significant Control" (PSC) download page.
# The page is scraped for links whose href contains 'psc-snapshot'.
# NOTE: the name `url` is reassigned by later cells for other data sources.

url = "http://download.companieshouse.gov.uk/en_pscdata.html"

In [3]:
# Download every PSC snapshot zip linked from the download page, parse the
# JSON-lines file inside it, keep only ceased records that have a forename,
# trim to the columns of interest and combine everything into `df_psctotal`.
# The last file (22of22) is ignored.
# NOTE: the skipped suffix is hard-coded; update it if the number of parts changes.

PSC_COLUMNS = [
    'company_number',
    'data.name_elements.surname',
    'data.name_elements.forename',
    'data.name_elements.title',
    'data.date_of_birth.month',
    'data.date_of_birth.year',
]
PSC_SKIPPED_SUFFIX = "22of22.txt"

psc_frames = []
with requests.Session() as req:
    r = req.get(url)
    r.raise_for_status()  # fail loudly instead of scraping an error page
    soup = BeautifulSoup(r.content, 'html.parser')

    # Everything up to and including the last '/' of the listing page URL.
    # The hrefs are appended to it, so they are assumed to be relative - TODO confirm.
    base_url = url.rsplit("/", 1)[0] + "/"
    snapshots = [f"{base_url}{item['href']}" for item in soup.select(
        "a[href*='psc-snapshot']")]

    for snapshot in snapshots:
        # Reuse the session so the connection is shared between downloads.
        snapshot_response = req.get(snapshot)
        snapshot_response.raise_for_status()

        with zipfile.ZipFile(io.BytesIO(snapshot_response.content)) as zipsnapshot:
            member = zipsnapshot.namelist()[0]
            if member.endswith(PSC_SKIPPED_SUFFIX):
                continue
            # Extracts into the current working directory and returns the path.
            tempfile = zipsnapshot.extract(member)
        print(member)

        # One JSON document per line; the context manager closes the file handle.
        with open(tempfile, encoding="utf8") as snapshot_file:
            records = [json.loads(line) for line in snapshot_file]

        df_psc = pd.json_normalize(records)
        df_psc = df_psc[df_psc['data.ceased_on'].notnull()]
        df_psc = df_psc.dropna(subset=['data.name_elements.forename'])
        df_psc = df_psc[PSC_COLUMNS]
        psc_frames.append(df_psc)

# Concatenate once at the end rather than re-copying the growing frame per file.
df_psctotal = pd.concat(psc_frames, ignore_index=True) if psc_frames else pd.DataFrame()

psc-snapshot-2022-12-16_1of22.txt
psc-snapshot-2022-12-16_2of22.txt
psc-snapshot-2022-12-16_3of22.txt
psc-snapshot-2022-12-16_4of22.txt
psc-snapshot-2022-12-16_5of22.txt
psc-snapshot-2022-12-16_6of22.txt
psc-snapshot-2022-12-16_7of22.txt
psc-snapshot-2022-12-16_8of22.txt
psc-snapshot-2022-12-16_9of22.txt
psc-snapshot-2022-12-16_10of22.txt
psc-snapshot-2022-12-16_11of22.txt
psc-snapshot-2022-12-16_12of22.txt
psc-snapshot-2022-12-16_13of22.txt
psc-snapshot-2022-12-16_14of22.txt
psc-snapshot-2022-12-16_15of22.txt
psc-snapshot-2022-12-16_16of22.txt
psc-snapshot-2022-12-16_17of22.txt
psc-snapshot-2022-12-16_18of22.txt
psc-snapshot-2022-12-16_19of22.txt
psc-snapshot-2022-12-16_20of22.txt
psc-snapshot-2022-12-16_21of22.txt


In [4]:
# Write the Persons with Significant Control dataframe as csv.
# The relative path means the file lands in the current working directory.
# No `index` argument is passed, so the dataframe index is written as well.

df_psctotal.to_csv('psc_slim.csv')

In [2]:
# Maritime and Coastguard Agency (MCA) - list of approved recruitment and placement agencies.
# The publication page is scraped for links ending in '.csv'.
# NOTE: this rebinds the shared name `url` used by other cells.

url = "https://www.gov.uk/government/publications/recruitment-and-placement-agencies-approved-by-the-mca"

In [3]:
# Download every csv file linked from the publication page and save each one in
# the current working directory under the last path segment of its link.

with requests.Session() as req:
    page = req.get(url)
    page.raise_for_status()  # do not scrape an error page
    soup = BeautifulSoup(page.content, 'html.parser')

    # hrefs are requested as-is, so they are assumed to be absolute URLs - TODO confirm.
    targets = [item['href'] for item in soup.select("a[href$='.csv']")]

    for target in targets:
        download = req.get(target)
        # Without this check an HTTP error body would be saved as if it were the csv.
        download.raise_for_status()
        name = target.rsplit("/", 1)[-1]
        with open(name, 'wb') as f:
            f.write(download.content)

In [32]:
# EveryPolitician website - download page for the UK House of Commons.
# The page is scraped for links whose href contains 'ep-popolo'.
# NOTE: this rebinds the shared name `url` used by other cells.

url = "http://everypolitician.org/uk/commons/download.html"

In [37]:
# Download the UK House of Commons json file(s) linked from the download page
# and save each one in the current working directory under the last path
# segment of its link.

with requests.Session() as req:
    page = req.get(url)
    page.raise_for_status()  # do not scrape an error page
    soup = BeautifulSoup(page.content, 'html.parser')

    # hrefs are requested as-is, so they are assumed to be absolute URLs - TODO confirm.
    targets = [item['href'] for item in soup.select("a[href*='ep-popolo']")]
    print(targets)

    for target in targets:
        download = req.get(target)
        # Without this check an HTTP error body would be saved as if it were the json.
        download.raise_for_status()
        name = target.rsplit("/", 1)[-1]
        with open(name, 'wb') as f:
            f.write(download.content)

['https://cdn.rawgit.com/everypolitician/everypolitician-data/f594edb3364d459ec5ee50f15b92261085a49bc5/data/UK/Commons/ep-popolo-v1.0.json']


In [2]:
# UK Companies House "Basic Company Data" download page.
# The page is scraped for links whose href contains 'BasicCompanyData-'.
# NOTE: this rebinds the shared name `url` used by other cells.

url="http://download.companieshouse.gov.uk/en_output.html"

In [3]:
# Download every Basic Company Data zip linked from the download page, read the
# csv file inside it, keep only the columns of interest and combine everything
# into `df_comptotal`.

COMPANY_COLUMNS = ['RegAddress.PostCode', 'RegAddress.PostTown', 'CompanyName']

company_frames = []
with requests.Session() as req:
    r = req.get(url)
    r.raise_for_status()  # fail loudly instead of scraping an error page
    soup = BeautifulSoup(r.content, 'html.parser')

    # Everything up to and including the last '/' of the listing page URL.
    # The hrefs are appended to it, so they are assumed to be relative - TODO confirm.
    base_url = url.rsplit("/", 1)[0] + "/"
    snapshots = [f"{base_url}{item['href']}" for item in soup.select(
        "a[href*='BasicCompanyData-']")]

    for snapshot in snapshots:
        # Reuse the session so the connection is shared between downloads.
        snapshot_response = req.get(snapshot)
        snapshot_response.raise_for_status()

        with zipfile.ZipFile(io.BytesIO(snapshot_response.content)) as zipsnapshot:
            member = zipsnapshot.namelist()[0]
            # Extracts into the current working directory and returns the path.
            tempfile = zipsnapshot.extract(member)
        print(member)

        # Parse only the columns that are kept; the rest were discarded right
        # after loading anyway. Re-select to enforce the column order.
        df_comp = pd.read_csv(tempfile, usecols=COMPANY_COLUMNS)
        df_comp = df_comp[COMPANY_COLUMNS]
        company_frames.append(df_comp)

# Concatenate once at the end rather than re-copying the growing frame per file.
df_comptotal = pd.concat(company_frames, ignore_index=True) if company_frames else pd.DataFrame()

BasicCompanyData-2022-11-01-part1_7.csv


  exec(code_obj, self.user_global_ns, self.user_ns)


BasicCompanyData-2022-11-01-part2_7.csv
BasicCompanyData-2022-11-01-part3_7.csv
BasicCompanyData-2022-11-01-part4_7.csv
BasicCompanyData-2022-11-01-part5_7.csv


  exec(code_obj, self.user_global_ns, self.user_ns)


BasicCompanyData-2022-11-01-part6_7.csv
BasicCompanyData-2022-11-01-part7_7.csv


  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# Write the Basic Company Information dataframe as csv.
# The relative path means the file lands in the current working directory.
# No `index` argument is passed, so the dataframe index is written as well.

df_comptotal.to_csv('basic_slim.csv')

In [None]:
# Show every column when a dataframe is displayed.
pd.set_option('display.max_columns', None)

def substitute_name(name, forename, surname):
    """Return a ``(forename, surname)`` pair, falling back to the full name.

    Parameters
    ----------
    name : str
        Full name; used only when ``forename`` is missing.
    forename, surname : scalar
        Existing name elements.

    Returns
    -------
    tuple
        ``(forename, surname)`` unchanged when ``forename`` is present.
        Otherwise the first whitespace-separated token of ``name`` and the
        remaining tokens joined by single spaces. If ``name`` is not a string
        or contains no tokens, the inputs are returned unchanged.
    """
    # pd.isna matches any missing value (None, float NaN, np.float64 NaN, pd.NA);
    # an identity test against the NaN singleton misses most of them.
    if not pd.isna(forename):
        return forename, surname

    parts = name.split() if isinstance(name, str) else []
    if not parts:
        # Nothing usable to derive the name elements from.
        return forename, surname
    return parts[0], ' '.join(parts[1:])

# Disabled example: fill missing forename/surname values in df_psc from the full name.
# tempname = np.vectorize(substitute_name)(df_psc['data.name'], df_psc['data.name_elements.forename'], df_psc['data.name_elements.surname'])
# df_psc['data.name_elements.forename'] = tempname[0]
# df_psc['data.name_elements.surname'] = tempname[1]