# Companies House cultural data

Extracting data for cultural organisations from Companies House data based on SIC code.

## Setup environment

In [19]:
import json

import duckdb
import pandas as pd
import geopandas as gpd
from pipeline_utils.filesystem.paths import DATA, RAW_DATA, SITE

In [20]:
OUT = SITE / 'data/companies-house/_data/'
OUT.mkdir(exist_ok=True, parents=True)

## Extract Companies House data

Using a  shortlist of SIC codes, extract data from Companies House.

In [21]:
db = duckdb.connect(RAW_DATA / 'company-data.db', read_only=True)

Create temporary tables

In [22]:
db.query('''
    CREATE TEMP TABLE tSicCodes AS SELECT * FROM read_csv('../raw/sic_codes.csv');
''')

In [23]:
db.query('''
    CREATE TEMP TABLE tPostcodes AS
        SELECT pcds AS postcode, lat, long
        FROM read_csv('../data/reference/onspd_extract.csv')
        WHERE oslaua == 'E08000021';
''')

In [24]:
db.query('''
    CREATE TEMP TABLE tCompanies AS SELECT 
        CompanyName as registered_name,
        CompanyNumber as company_number,
        "RegAddress.PostCode" as postcode,
        [x for x in [
            "SICCode.SicText_1",
            "SICCode.SicText_2",
            "SICCode.SicText_3",
            "SICCode.SicText_4"
        ] if x is not NULL] as sic_code,
        lat, long
    FROM CompanyData c
    JOIN tPostcodes p
    ON c."RegAddress.PostCode" == p.postcode;
''')

Query the shortlist codes

In [25]:
shortlist_sic_codes = db.query('''SELECT * FROM tSicCodes;''').df()

Query the companies data

In [26]:
culture_companies = db.query('''
    SELECT DISTINCT c.*
        FROM tCompanies c
        JOIN tSicCodes s
        ON list_contains(c.sic_code, s.sic_code)
        ORDER BY c.company_number;
''').df()

In [27]:
db.close()

## Save the companies data

In [28]:
culture_companies.sort_values('company_number').to_csv(OUT / 'list.csv', index=False)

In [41]:
geo = gpd.GeoDataFrame(culture_companies, geometry=gpd.points_from_xy(culture_companies.long, culture_companies.lat))

geo.set_crs(None).to_file(OUT / 'company_locations.geojson')



## Process SIC codes

In [11]:
all_sic_codes = culture_companies.explode('sic_code').groupby('sic_code').company_number.count().sort_values(ascending=False).reset_index().rename(columns={
    'company_number': 'count'
})
all_sic_codes[all_sic_codes.sic_code.isin(shortlist_sic_codes.sic_code)].to_csv(OUT / 'sic_codes.csv')

## Calculate linked SIC codes

In [12]:
d = culture_companies.loc[:, ['company_number', 'sic_code']]
d['fingerprint'] = d.sic_code.apply(lambda x: '|'.join(sorted(x)))

In [13]:
nodes = all_sic_codes.to_dict(orient='records')

In [14]:
json.dumps(nodes)

'[{"sic_code": "90030 - Artistic creation", "count": 123}, {"sic_code": "90010 - Performing arts", "count": 93}, {"sic_code": "93290 - Other amusement and recreation activities n.e.c.", "count": 76}, {"sic_code": "59112 - Video production activities", "count": 72}, {"sic_code": "90020 - Support activities to performing arts", "count": 64}, {"sic_code": "59111 - Motion picture production activities", "count": 52}, {"sic_code": "85520 - Cultural education", "count": 49}, {"sic_code": "90040 - Operation of arts facilities", "count": 29}, {"sic_code": "59200 - Sound recording and music publishing activities", "count": 19}, {"sic_code": "85600 - Educational support services", "count": 15}, {"sic_code": "85590 - Other education n.e.c.", "count": 14}, {"sic_code": "74202 - Other specialist photography", "count": 11}, {"sic_code": "74100 - specialised design activities", "count": 10}, {"sic_code": "59113 - Television programme production activities", "count": 9}, {"sic_code": "91020 - Museums 

In [15]:
pd.Series(d.groupby('fingerprint').company_number.count(), name='count').sort_values(ascending=False)

fingerprint
90030 - Artistic creation                                                                                                                                                                             70
93290 - Other amusement and recreation activities n.e.c.                                                                                                                                              60
90010 - Performing arts                                                                                                                                                                               59
90020 - Support activities to performing arts                                                                                                                                                         35
59111 - Motion picture production activities                                                                                                                                            

In [16]:
d.pivot(index="company_number", columns="sic_code", values='count')

TypeError: unhashable type: 'numpy.ndarray'