# Companies House cultural data

Extracting data for cultural organisations from Companies House data based on SIC code.

## Setup environment

In [1]:
import json

import duckdb
import pandas as pd
import geopandas as gpd
from pipeline_utils.filesystem.paths import DATA, RAW_DATA, SITE

In [2]:
# Directory that receives every file written by this notebook.
OUT = SITE / 'data/companies-house/_data/'
# Create it (and any missing parents); a no-op when it already exists.
OUT.mkdir(exist_ok=True, parents=True)

## Extract Companies House data

Using a shortlist of SIC codes, extract data from Companies House.

In [3]:
# Open the company database in read-only mode.
db = duckdb.connect(RAW_DATA / 'company-data.db', read_only=True)

Create temporary tables

In [4]:
# Load the SIC code CSV into a temporary table.
# NOTE(review): the path is relative to the working directory rather than built
# from RAW_DATA — confirm the notebook is always run from the expected folder.
db.query('''
    CREATE TEMP TABLE tSicCodes AS SELECT * FROM read_csv('../raw/sic_codes.csv');
''')

In [5]:
# Build a postcode -> (lat, long) lookup, keeping only rows whose `oslaua`
# column equals one fixed code.
# NOTE(review): 'E08000021' is presumably the local authority code for
# Newcastle upon Tyne — confirm, and consider naming it as a constant.
db.query('''
    CREATE TEMP TABLE tPostcodes AS
        SELECT pcds AS postcode, lat, long
        FROM read_csv('../data/reference/onspd_extract.csv')
        WHERE oslaua == 'E08000021';
''')

In [6]:
# Join CompanyData to tPostcodes on the registered postcode (inner join, so
# companies without a matching postcode are dropped), and collapse the four
# SicText columns into a single list column `sic_code` with NULLs removed.
db.query('''
    CREATE TEMP TABLE tCompanies AS SELECT 
        CompanyName as registered_name,
        CompanyNumber as company_number,
        "RegAddress.PostCode" as postcode,
        [x for x in [
            "SICCode.SicText_1",
            "SICCode.SicText_2",
            "SICCode.SicText_3",
            "SICCode.SicText_4"
        ] if x is not NULL] as sic_code,
        lat, long
    FROM CompanyData c
    JOIN tPostcodes p
    ON c."RegAddress.PostCode" == p.postcode;
''')

Query the shortlist codes

In [7]:
# Pull the whole tSicCodes table into a pandas DataFrame.
shortlist_sic_codes = db.query('''SELECT * FROM tSicCodes;''').df()

Query the companies data

In [8]:
# Companies whose sic_code list contains at least one code from tSicCodes.
# DISTINCT collapses the repeated rows a company would otherwise get when it
# matches several codes; rows come back ordered by company number.
culture_companies = db.query('''
    SELECT DISTINCT c.*
        FROM tCompanies c
        JOIN tSicCodes s
        ON list_contains(c.sic_code, s.sic_code)
        ORDER BY c.company_number;
''').df()

In [9]:
# Everything needed is now in pandas DataFrames, so release the database connection.
db.close()

In [10]:
# Display the selected companies.
culture_companies

Unnamed: 0,registered_name,company_number,postcode,sic_code,lat,long
0,PEOPLE'S THEATRE ARTS GROUP LIMITED,00242886,NE6 5QF,"[90010 - Performing arts, 90030 - Artistic cre...",54.990445,-1.584953
1,INSTITUTE OF AMATEUR CINEMATOGRAPHERS LIMITED(...,00269085,NE3 2DT,[63990 - Other information service activities ...,55.023848,-1.620913
2,NEWCASTLE PEOPLE'S THEATRE ARTS TRUST LIMITED,00393739,NE6 5QF,[90040 - Operation of arts facilities],54.990445,-1.584953
3,MAWSON & WAREHAM (MUSIC) LIMITED,00957980,NE1 5BP,"[58190 - Other publishing activities, 59112 - ...",54.973132,-1.613142
4,TYNESIDE CINEMA,01113101,NE1 6QG,[59140 - Motion picture projection activities],54.973801,-1.611761
...,...,...,...,...,...,...
432,HARVEY DUCKMAN LTD,15879932,NE1 1JF,[90030 - Artistic creation],54.969968,-1.613654
433,BUNNY RWLK LTD,15881018,NE1 5UD,"[90010 - Performing arts, 90030 - Artistic cre...",54.971137,-1.618832
434,AVERIA AGENCY UK LTD,15888622,NE3 1YQ,"[63120 - Web portals, 90030 - Artistic creation]",55.002906,-1.608369
435,GEORDIE VISION LTD,15906324,NE6 2HL,[59112 - Video production activities],54.970149,-1.581302


## Save the companies data

In [11]:
# Write the company list ordered by company number, without the DataFrame index.
(
    culture_companies
    .sort_values(by='company_number')
    .to_csv(OUT / 'list.csv', index=False)
)

In [12]:
# Attach a point geometry to every company, built from its long (x) and lat (y).
geo = gpd.GeoDataFrame(
    culture_companies,
    geometry=gpd.points_from_xy(
        x=culture_companies['long'],
        y=culture_companies['lat'],
    ),
)

# Write the points to a GeoJSON file; set_crs is explicitly given no CRS.
geo.set_crs(None).to_file(OUT / 'company_locations.geojson')



## Process SIC codes

In [13]:
# One row per SIC code found on the selected companies, with the number of
# company rows carrying it, most frequent first.
all_sic_codes = (
    culture_companies
    .explode('sic_code')
    .groupby('sic_code')
    .company_number
    .count()
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={'company_number': 'count'})
)

# Save only the codes that also appear in the shortlist.
# NOTE(review): unlike list.csv this is written without index=False, so the
# row index becomes an extra unnamed first column — confirm that is wanted.
all_sic_codes.loc[
    all_sic_codes.sic_code.isin(shortlist_sic_codes.sic_code)
].to_csv(OUT / 'sic_codes.csv')

In [14]:
# Display the per-code counts (all codes, not only the shortlisted ones).
all_sic_codes

Unnamed: 0,sic_code,count
0,90030 - Artistic creation,123
1,90010 - Performing arts,93
2,59112 - Video production activities,72
3,90020 - Support activities to performing arts,64
4,59111 - Motion picture production activities,52
...,...,...
100,85320 - Technical and vocational secondary edu...,1
101,85410 - Post-secondary non-tertiary education,1
102,93130 - Fitness facilities,1
103,94120 - Activities of professional membership ...,1


## Calculate linked SIC codes

In [15]:
# Per-company fingerprint: the company's SIC codes, sorted and joined with '|'.
# NOTE(review): `d` does not appear to be used by the later cells — confirm
# whether this cell is still needed.
d = culture_companies[['company_number', 'sic_code']].assign(
    fingerprint=lambda frame: frame.sic_code.apply(
        lambda codes: '|'.join(sorted(codes))
    )
)

In [16]:
# Graph nodes: one dict per SIC code, carrying its `sic_code` and `count`.
nodes = all_sic_codes.to_dict(orient='records')

In [17]:
# The SIC code of every node, in node order.
sources = [n['sic_code'] for n in nodes]

In [18]:
# Only the SIC code lists and company numbers are needed to build the network.
network = culture_companies[['sic_code', 'company_number']]

def count_others(v):
    """Count how often each other SIC code appears alongside SIC code `v`.

    Reads the module-level `network` frame. Returns a DataFrame with columns
    `src` (always `v`), `dst` (the other code) and `weight` (number of rows,
    after exploding the code lists, of companies that have `v`), sorted by
    weight with the largest first. The row where `dst` equals `v` is left out.
    """
    has_code = network.sic_code.apply(lambda codes: v in codes)
    co_counts = (
        network[has_code]
        .explode('sic_code')
        .groupby('sic_code')
        .company_number.count()
        .reset_index()
        .rename(columns={'company_number': 'weight', 'sic_code': 'dst'})
    )
    co_counts['src'] = v

    is_other_code = co_counts.src != co_counts.dst
    return (
        co_counts
        .loc[is_other_code, ['src', 'dst', 'weight']]
        .sort_values('weight', ascending=False)
    )

# One frame of outgoing edges per code, stacked into a single frame with a fresh index.
directed_pairs = pd.concat([count_others(s) for s in sources], ignore_index=True)

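Edges will contain one record per undirected pair of SIC codes (`src`, `dst`), with `weight` taken from the matching row of `directed_pairs`.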

In [19]:
# A pair of co-occurring codes is expected to appear in directed_pairs once in
# each direction. Normalise every row to a sorted (src, dst) tuple and keep the
# repeat occurrences, so each pair that appears in both directions is listed once.
# Tuples are used rather than lists because duplicated() is hash-based and
# Python lists are not hashable.
sorted_pairs = pd.Series(
    [tuple(sorted(pair)) for pair in zip(directed_pairs['src'], directed_pairs['dst'])],
    index=directed_pairs.index,
    dtype=object,
)
pairs = pd.DataFrame(
    sorted_pairs[sorted_pairs.duplicated()].tolist(),
    columns=['src', 'dst'],
)

# Look up each pair's weight by merging on the shared src/dst columns, then
# order the edges from heaviest to lightest and convert to plain records.
edges = (
    pairs
    .merge(directed_pairs)
    .sort_values('weight', ascending=False)
    .to_dict(orient='records')
)

In [20]:
# Display the edge records as a table.
pd.DataFrame(edges)

Unnamed: 0,src,dst,weight
0,90010 - Performing arts,90030 - Artistic creation,15
1,85520 - Cultural education,85600 - Educational support services,13
2,85520 - Cultural education,85590 - Other education n.e.c.,12
3,90010 - Performing arts,90020 - Support activities to performing arts,12
4,59111 - Motion picture production activities,59112 - Video production activities,11
...,...,...,...
309,59113 - Television programme production activi...,74209 - Photographic activities not elsewhere ...,1
310,62011 - Ready-made interactive leisure and ent...,90030 - Artistic creation,1
311,"59120 - Motion picture, video and television p...",62011 - Ready-made interactive leisure and ent...,1
312,59112 - Video production activities,62011 - Ready-made interactive leisure and ent...,1


In [21]:
# Serialise the nodes and edges together as one indented JSON document in OUT.
with open(OUT / 'graph.json', 'w') as graph:
    json.dump(
        {'nodes': nodes, 'edges': edges},
        graph,
        indent=2,
    )