# Companies House cultural data

Extracting data for cultural organisations from Companies House data based on SIC code.

## Setup environment

In [1]:
import json

import duckdb
import pandas as pd
import geopandas as gpd
from pipeline_utils.filesystem.paths import DATA, RAW_DATA, SITE

In [2]:
# Directory that receives every file this notebook exports.
OUT = SITE / 'data/companies-house/_data/'
# Create the directory and any missing parents; do nothing if it already exists.
OUT.mkdir(parents=True, exist_ok=True)

## Extract Companies House data

Using a shortlist of SIC codes, extract data from Companies House.

In [3]:
# Open the company DuckDB database in read-only mode.
company_db_path = RAW_DATA / 'company-data.db'
db = duckdb.connect(company_db_path, read_only=True)

Create temporary tables

In [4]:
# Load the shortlist of SIC codes from CSV into the temporary table tSicCodes.
create_sic_codes_sql = '''
    CREATE TEMP TABLE tSicCodes AS SELECT * FROM read_csv('../raw/sic_codes.csv');
'''
db.query(create_sic_codes_sql)

In [5]:
# Build the temporary table tPostcodes (postcode, lat, long) from the postcode
# reference CSV, keeping only rows whose `oslaua` value is 'E08000021'
# (presumably a single local-authority district - confirm against the reference data).
create_postcodes_sql = '''
    CREATE TEMP TABLE tPostcodes AS
        SELECT pcds AS postcode, lat, long
        FROM read_csv('../data/reference/onspd_extract.csv')
        WHERE oslaua == 'E08000021';
'''
db.query(create_postcodes_sql)

In [6]:
# Build the temporary table tCompanies:
#   * rename the name, number and registered-postcode columns,
#   * gather the four SicText columns into one list, skipping NULL entries,
#   * inner-join to tPostcodes, so only companies whose registered postcode
#     appears there are kept, and each row gains lat/long.
create_companies_sql = '''
    CREATE TEMP TABLE tCompanies AS SELECT 
        CompanyName as registered_name,
        CompanyNumber as company_number,
        "RegAddress.PostCode" as postcode,
        [x for x in [
            "SICCode.SicText_1",
            "SICCode.SicText_2",
            "SICCode.SicText_3",
            "SICCode.SicText_4"
        ] if x is not NULL] as sic_code,
        lat, long
    FROM CompanyData c
    JOIN tPostcodes p
    ON c."RegAddress.PostCode" == p.postcode;
'''
db.query(create_companies_sql)

Query the shortlist codes

In [7]:
# Materialise the shortlist table as a pandas DataFrame for use after the connection is closed.
shortlist_sic_codes = db.query('SELECT * FROM tSicCodes;').df()

Query the companies data

In [8]:
# Select each company (once) whose sic_code list contains at least one
# shortlisted code, ordered by company number, and pull the result into pandas.
culture_companies_sql = '''
    SELECT DISTINCT c.*
        FROM tCompanies c
        JOIN tSicCodes s
        ON list_contains(c.sic_code, s.sic_code)
        ORDER BY c.company_number;
'''
culture_companies = db.query(culture_companies_sql).df()

In [9]:
# Close the DuckDB connection; the cells below work only on the pandas
# DataFrames (shortlist_sic_codes, culture_companies) already extracted.
db.close()

## Save the companies data

In [10]:
# Export the matched companies ordered by company number, without the DataFrame index.
companies_by_number = culture_companies.sort_values('company_number')
companies_by_number.to_csv(OUT / 'list.csv', index=False)

In [11]:
# One point geometry per company, built from its long (x) and lat (y) columns.
company_points = gpd.points_from_xy(culture_companies.long, culture_companies.lat)
geo = gpd.GeoDataFrame(culture_companies, geometry=company_points)

# NOTE(review): set_crs is called with None, so no CRS is named for the
# exported layer - confirm this is intended for the GeoJSON output.
geo_without_crs = geo.set_crs(None)
geo_without_crs.to_file(OUT / 'company_locations.geojson')



## Process SIC codes

In [12]:
# Number of companies carrying each SIC code (one row per company/code after
# explode), most frequent first.
companies_per_code = (
    culture_companies
    .explode('sic_code')
    .groupby('sic_code')
    .company_number
    .count()
    .sort_values(ascending=False)
)
all_sic_codes = companies_per_code.reset_index().rename(columns={'company_number': 'count'})

# Export only the codes that are on the shortlist.
# NOTE(review): the DataFrame index is written as well (no index=False here,
# unlike the list.csv export) - confirm consumers expect that extra column.
on_shortlist = all_sic_codes.sic_code.isin(shortlist_sic_codes.sic_code)
all_sic_codes[on_shortlist].to_csv(OUT / 'sic_codes.csv')

## Calculate linked SIC codes

In [13]:
# Per-company "fingerprint": the company's SIC codes, sorted and joined with '|'.
# NOTE(review): `d` is not referenced again in the visible part of the notebook.
d = culture_companies.loc[:, ['company_number', 'sic_code']].assign(
    fingerprint=lambda frame: frame.sic_code.apply(lambda codes: '|'.join(sorted(codes)))
)

In [14]:
# Graph nodes: one {'sic_code': ..., 'count': ...} record per SIC code.
nodes = all_sic_codes.to_dict('records')

In [15]:
# The SIC code of every node, in node order; each is used as an edge source below.
sources = [node['sic_code'] for node in nodes]

In [16]:
# Only the two columns needed to count codes that occur together.
network = culture_companies.loc[:, ['sic_code', 'company_number']]


def count_others(v):
    """Count, for SIC code `v`, the other codes held by the same companies.

    Reads the module-level `network` frame. Among the companies whose
    `sic_code` list contains `v`, counts how many companies carry each code.

    Parameters
    ----------
    v : the SIC code to use as the edge source.

    Returns
    -------
    DataFrame with columns ['src', 'dst', 'weight'], where `src` is always
    `v`, `dst` is a different code and `weight` is the number of companies
    carrying both. Only rows with weight > 1 are kept, sorted by weight
    descending.
    """
    holds_code = network.sic_code.apply(lambda codes: v in codes)
    co_occurrence = (
        network[holds_code]
        .explode('sic_code')
        .groupby('sic_code')
        .company_number.count()
        .reset_index()
        .rename(columns={'company_number': 'weight', 'sic_code': 'dst'})
    )
    co_occurrence['src'] = v

    # Drop the self-pair (v with v) and pairs seen in only a single company.
    is_other_code = co_occurrence.src != co_occurrence.dst
    seen_more_than_once = co_occurrence.weight > 1
    selected = co_occurrence.loc[is_other_code & seen_more_than_once, ['src', 'dst', 'weight']]
    return selected.sort_values('weight', ascending=False)


# Stack the per-code edge tables into one frame with a fresh 0..n-1 index.
per_source_edges = [count_others(code) for code in sources]
directed_pairs = pd.concat(per_source_edges).reset_index(drop=True)

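Edges will contain one record per unordered pair of SIC codes that appear together in more than one company, weighted by the number of companies that carry both codes.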

In [17]:
# Reduce the directed (src, dst) rows to a list of sorted [code, code] pairs.
pairs = pd.DataFrame(
    directed_pairs[['src', 'dst']]
        # One long Series of codes: each directed_pairs row contributes its src and its dst.
        .stack()
        # Drop the 'src'/'dst' index level so both codes of a row share that row's label.
        .reset_index(level=1, drop=True)
        # Regroup by row label into a sorted two-element list, so (a, b) and (b, a)
        # produce the same value.
        .pipe(lambda e: e.groupby(e.index).apply(lambda x: sorted(list(x))))
        # duplicated() flags every occurrence after the first, so this keeps only
        # pairs that occur more than once (presumably once per direction - confirm).
        .pipe(lambda e: e[e.duplicated()])
        .reset_index(drop=True)
        .tolist(),
    columns=['src', 'dst']
)

# Look up each pair's weight by merging on the shared columns (src, dst), then
# emit the edges heaviest-first as a list of {'src', 'dst', 'weight'} records.
edges = pairs.merge(directed_pairs).sort_values('weight', ascending=False).to_dict(orient='records')

In [18]:
with open(OUT / 'graph.json', 'w') as graph:
    json.dump({ 'nodes': nodes, 'edges': edges }, fp=graph, indent=2)