# Landscape map - stage 3

This stage merges the raw longlist with the candidate data from the reference sources to create the new list.

It round-trips the data from the existing landscape list to ensure that any manual overrides are included.

In [10]:
import duckdb
import petl as etl
from pipeline_utils.reference.geo import la_code_lookup
from pipeline_utils.reference.onspd import normalise_postcode, postcode_lookup
from pipeline_utils.filesystem.paths import RAW_DATA, DATA

Load the spelling corrections we have inferred from the matching stage.

In [11]:
# Build a lookup from each 'organisation' value to its 'match' value; it is used
# below to replace organisation names in the longlist.
corrections_table = etl.fromcsv(RAW_DATA / 'landscape-map-corrections.csv')
corrections = corrections_table.lookupone('organisation', 'match')

Load the untagged longlist from the raw directory and perform the following operations:

1. Convert numeric data to numbers
2. Correct the spellings of the organisational data
3. Augment with local authority data

In [12]:
def _apply_correction(name):
    """Return the recorded correction for ``name``, or ``name`` unchanged if there is none."""
    return corrections.get(name, name)


raw = (
    etl.fromcsv(RAW_DATA / 'landscape-longlist-raw.csv')
    # 1. Convert numeric data to numbers
    .convertnumbers()
    # 2. Correct the organisation spellings
    .convert('organisation', _apply_correction)
    # 3. Replace the local authority value with its lookup result and unpack
    #    that into separate columns, renaming the code/name fields
    .convert('Local authority', la_code_lookup)
    .unpackdict('Local authority')
    .rename({
        'LAD24CD': 'funding_geo_code',
        'LAD24NM': 'funding_geo_name',
    })
    .cache()
)

raw

organisation,Source,Number,funding_geo_code,funding_geo_name
4M Puppets,Project Grant,1,E08000021,Newcastle upon Tyne
Abdulrahman Abu - Zayd,Project Grant,1,E08000037,Gateshead
Action for Children,Project Grant,1,E06000057,Northumberland
Adam Phillips,Project Grant,3,E08000037,Gateshead
Adam Shield,Project Grant,1,E08000021,Newcastle upon Tyne


Get the list of sources in the longlist. We'll use this to update the values in the columns later on.

In [13]:
# Reduce the longlist to its distinct Source values and freeze them as a tuple
# so they can be reused as column names after the recast.
distinct_sources = raw.cut('Source').distinct()
sources = tuple(distinct_sources.values('Source'))
sources

('IPSO', 'NPO', 'Project Grant')

Recast the longlist so that each distinct value of the Source column becomes its own column, then convert any non-None values in those columns to True.

In [14]:
def _presence_flag(value):
    """Map any non-None cell to True and leave None cells as None."""
    return None if value is None else True


wide_table = (
    raw
    # One column per Source value, filled from the Number column
    .recast(variablefield="Source", valuefield="Number")
    # Only presence matters from here on, so collapse the values to True/None
    .convert(sources, _presence_flag)
)
wide_table

organisation,funding_geo_code,funding_geo_name,IPSO,NPO,Project Grant
4M Puppets,E08000021,Newcastle upon Tyne,,,True
Abdulrahman Abu - Zayd,E08000037,Gateshead,,,True
Action for Children,E06000057,Northumberland,,,True
Adam Phillips,E08000037,Gateshead,,,True
Adam Shield,E08000021,Newcastle upon Tyne,,,True


At this point we will also add in new data from the result of stage 2.

1. `location` Manually set locations 
2. `direct` Company data from Companies House
3. `fuzzy` Fuzzily matched company data

In [15]:
# Manually set locations: organisation -> (latitude, longitude)
location_table = etl.fromcsv(RAW_DATA / 'landscape-locations.csv')
location = location_table.lookupone('organisation', ['latitude', 'longitude'])

# Directly matched company data, kept as a table so it can be concatenated
# with the fuzzy matches later
direct = etl.fromcsv(RAW_DATA / 'landscape-map-company-data.csv')

Fuzzy matches are read from a CSV file into a temporary table and joined to the company data held in the database.

TODO change the direct matches to the same approach.

In [16]:
# Stage the fuzzy matches as a temporary table so they can be joined in SQL.
# NOTE(review): this CSV path is relative to the kernel's working directory,
# unlike the RAW_DATA-based paths used elsewhere in this notebook — confirm
# that it resolves to the same directory.
stage_fuzzy_sql = '''
        CREATE TEMP TABLE tFuzzy AS SELECT * FROM read_csv('../raw/landscape_fuzzy.csv');
         '''

# Join each fuzzy match to its company record on the company number, renaming
# the columns to the field names used by the landscape table and gathering the
# non-null SIC text columns into a single list.
fuzzy_company_sql = '''
             SELECT f.match as organisation,
                f.CompanyName as registered_name,
                f.CompanyNumber as company_number,
                type,
                score,
                "URI" as uri,
                "RegAddress.PostTown" as post_town,
                "RegAddress.PostCode" as postcode,
                CompanyCategory as company_category,
                CompanyStatus as company_status,
                [x for x in [
                        "SICCode.SicText_1",
                        "SICCode.SicText_2",
                        "SICCode.SicText_3",
                        "SICCode.SicText_4"
                ] if x is not NULL] as sic_code,
                IncorporationDate as incorporation_date,
                DissolutionDate as dissolution_date,
                "Accounts.AccountCategory" as accounts_category
             FROM tFuzzy f
             JOIN CompanyData c
             ON f.CompanyNumber == c.CompanyNumber;
             '''

db = duckdb.connect(RAW_DATA / 'company-data.db', read_only=True)
try:
    db.query(stage_fuzzy_sql)
    # .df() materialises the result as a DataFrame before the connection is
    # closed, so `fuzzy` does not depend on the connection afterwards.
    fuzzy = etl.fromdataframe(db.query(fuzzy_company_sql).df())
finally:
    # Always release the connection, even if one of the queries fails;
    # otherwise a failed cell leaves the database open in the kernel.
    db.close()

In [17]:
# Stack the direct matches on top of the fuzzy matches, then index the rows
# (as dicts) by organisation.
# NOTE(review): with petl's default non-strict lookup, an organisation present
# in both tables presumably keeps the first row seen (the direct match) — confirm.
all_matches = etl.cat(direct, fuzzy)
company_data = etl.dictlookupone(all_matches, 'organisation')

Create the new landscape table

In [33]:
landscape = (
    wide_table
    # Manually set coordinates; organisations without an entry get an empty
    # tuple, which unpacks to empty latitude/longitude cells.
    .addfield('location', lambda r: location.get(r.organisation, ()))
    .unpack('location', newfields=['latitude', 'longitude'])
    # Matched company record (direct or fuzzy); unmatched organisations get an
    # empty dict so every company column is left empty.
    .addfield('company_data', lambda r: company_data.get(r.organisation, {}))
    .unpackdict('company_data', keys=[
        'company_category',
        'accounts_category',
        'company_number',
        'company_status',
        'dissolution_date',
        'incorporation_date',
        'post_town',
        'postcode',
        'sic_code',
        'uri',
        'type', 'score'
    ])
    # Replace the postcode with its lookup record; postcodes that are not in
    # the lookup keep only their normalised form under 'pcds'.
    .convert('postcode', normalise_postcode)
    .convert('postcode', lambda x: postcode_lookup.get(x, { 'pcds': x }))
    .unpackdict('postcode', keys=['pcds', 'lat', 'long', 'oslaua'])
    # Fall back to the postcode coordinates only where no manual coordinate was
    # set. Identity checks (`is None`) are the idiomatic None test and cannot be
    # affected by a value's own __eq__.
    .convert(
        'latitude',
        lambda x, r: r['lat'],
        pass_row=True,
        where=lambda r: r['latitude'] is None and r['lat'] is not None,
    )
    .convert(
        'longitude',
        lambda x, r: r['long'],
        pass_row=True,
        where=lambda r: r['longitude'] is None and r['long'] is not None,
    )
    # The postcode coordinates have been merged into latitude/longitude
    .cutout('lat', 'long')
    .sort('organisation')
)
landscape

organisation,funding_geo_code,funding_geo_name,IPSO,NPO,Project Grant,latitude,longitude,company_category,accounts_category,company_number,company_status,dissolution_date,incorporation_date,post_town,sic_code,uri,type,score,pcds,oslaua
4M Puppets,E08000021,Newcastle upon Tyne,,,True,,,,,,,,,,,,,,,
Abdulrahman Abu - Zayd,E08000037,Gateshead,,,True,,,,,,,,,,,,,,,
Action for Children,E06000057,Northumberland,,,True,,,"PRI/LBG/NSC (Private, Limited by guarantee, no share capital, use of 'Limited' exemption)",GROUP,4764232.0,Active,,2003-05-14,WATFORD,['87900 - Other residential care activities n.e.c.'  '88990 - Other social work activities without accommodation n.e.c.'],http://business.data.gov.uk/id/company/04764232,,,WD18 8AG,
Adam Phillips,E08000037,Gateshead,,,True,,,,,,,,,,,,,,,
Adam Shield,E08000021,Newcastle upon Tyne,,,True,,,,,,,,,,,,,,,


Finally, write the CSV file

In [34]:
# Write the finished landscape table to the data directory
etl.tocsv(landscape, DATA / 'culture_landscape.csv')