# Landscape map - stage 3

This stage merges the raw longlist with the candidate data from the reference sources to create the new list.

It round-trips the data from the existing landscape list to ensure that any manual overrides are included.

In [1]:
from pathlib import Path

import petl as etl
from pipeline_utils.reference.geo import la_code_lookup

In [2]:
# Directory the input CSVs for this stage are read from.
RAW_DATA: Path = Path('../raw/')
# Directory the finished landscape CSV is written to.
DATA: Path = Path('../data/')

Load the spelling corrections we have inferred from the matching stage.

In [3]:
# Build a dict keyed on the 'organisation' column whose values come from the
# 'match' column; it is used below to replace organisation names.
corrections_table = etl.fromcsv(RAW_DATA / 'landscape-map-corrections.csv')
corrections = etl.lookupone(corrections_table, 'organisation', 'match')

Load the untagged longlist from the raw directory and perform the following operations:

1. Convert numeric data to numbers
2. Correct the spellings of the organisation names
3. Augment with local authority data

In [4]:
# Cleaned longlist: one row per organisation / Source pair.
raw = (
    etl.fromcsv(RAW_DATA / 'landscape-longlist-raw.csv')
    # 1. Parse numeric-looking strings into numbers.
    .convertnumbers()
    # 2. Swap in the corrected name where one exists; otherwise keep the original.
    .convert('organisation', lambda name: corrections.get(name, name))
    # 3. Replace the 'Local authority' value with the result of la_code_lookup,
    #    then spread that result into separate columns.
    #    NOTE(review): presumably la_code_lookup returns a dict with LAD24CD and
    #    LAD24NM keys (those are the columns that appear) - confirm in pipeline_utils.
    .convert('Local authority', la_code_lookup)
    .unpackdict('Local authority')
    # The table is iterated by several later cells; cache the rows.
    .cache()
)

raw

organisation,Source,Number,LAD24CD,LAD24NM
4M Puppets,Project Grant,1,E08000021,Newcastle upon Tyne
Abdulrahman Abu - Zayd,Project Grant,1,E08000037,Gateshead
Action for Children,Project Grant,1,E06000057,Northumberland
Adam Phillips,Project Grant,3,E08000037,Gateshead
Adam Shield,Project Grant,1,E08000021,Newcastle upon Tyne


Get the list of sources in the longlist. We'll use this to update the values in the columns later on.

In [5]:
# Collect each distinct value of the Source column into a tuple.
source_column = etl.cut(raw, 'Source')
sources = tuple(etl.values(etl.distinct(source_column), 'Source'))
sources

('IPSO', 'NPO', 'Project Grant')

Recast the longlist so that each distinct value in the Source column becomes its own column, then convert any non-None values in those columns to True.

In [6]:
# Pivot the longlist: one row per organisation, one column per source.
wide_table = raw.recast(
    # Name the variable values explicitly. Given only a field name, recast
    # discovers the values by sampling the first `samplesize` rows (1000 by
    # default), so a source that first appears later in the file would get no
    # column. `sources` was computed from the whole table, so it is complete.
    variablefield={"Source": list(sources)},
    valuefield="Number",
).convert(
    # Flag presence only: any recorded Number becomes True, and cells with no
    # entry for that source stay None.
    sources,
    lambda number: True if number is not None else None,
)
wide_table

organisation,LAD24CD,LAD24NM,IPSO,NPO,Project Grant
4M Puppets,E08000021,Newcastle upon Tyne,,,True
Abdulrahman Abu - Zayd,E08000037,Gateshead,,,True
Action for Children,E06000057,Northumberland,,,True
Adam Phillips,E08000037,Gateshead,,,True
Adam Shield,E08000021,Newcastle upon Tyne,,,True


At this point we will also add in new data from the result of stage 2.

1. `location`: manually set locations
2. `company_data`: company data from Companies House

In [7]:
# organisation -> (latitude, longitude), read from the locations file.
location = etl.lookupone(
    etl.fromcsv(RAW_DATA / 'landscape-locations.csv'),
    'organisation',
    ['latitude', 'longitude'],
)
# organisation -> the whole matching row as a dict, read from the company data file.
company_data = etl.dictlookupone(
    etl.fromcsv(RAW_DATA / 'landscape-map-company-data.csv'),
    'organisation',
)

Create the new landscape table

In [8]:
# Columns copied from each organisation's company record into the output.
# NOTE(review): company_number appears as e.g. "4764232.0" in the output, which
# looks like an upstream numeric coercion that loses the leading zero - confirm
# against the stage that writes landscape-map-company-data.csv.
company_fields = [
    'company_category',
    'company_number',
    'company_status',
    'dissolution_date',
    'incorporation_date',
    'post_town',
    'postcode',
    'sic_code',
    'uri',
]

# Stage 1: attach coordinates. Organisations with no entry in `location` get an
# empty tuple, which leaves latitude and longitude empty.
located = (
    wide_table
    .addfield('location', lambda row: location.get(row.organisation, ()))
    .unpack('location', newfields=['latitude', 'longitude'])
)

# Stage 2: attach company details. Organisations with no entry in `company_data`
# get an empty dict, which leaves every company column empty.
landscape = (
    located
    .addfield('company_data', lambda row: company_data.get(row.organisation, {}))
    .unpackdict('company_data', keys=company_fields)
)
landscape

organisation,LAD24CD,LAD24NM,IPSO,NPO,Project Grant,latitude,longitude,company_category,company_number,company_status,dissolution_date,incorporation_date,post_town,postcode,sic_code,uri
4M Puppets,E08000021,Newcastle upon Tyne,,,True,,,,,,,,,,,
Abdulrahman Abu - Zayd,E08000037,Gateshead,,,True,,,,,,,,,,,
Action for Children,E06000057,Northumberland,,,True,,,"PRI/LBG/NSC (Private, Limited by guarantee, no share capital, use of 'Limited' exemption)",4764232.0,Active,,2003-05-14,WATFORD,WD18 8AG,['87900 - Other residential care activities n.e.c.'  '88990 - Other social work activities without accommodation n.e.c.'],http://business.data.gov.uk/id/company/04764232
Adam Phillips,E08000037,Gateshead,,,True,,,,,,,,,,,
Adam Shield,E08000021,Newcastle upon Tyne,,,True,,,,,,,,,,,


Finally, write the CSV file

In [9]:
# Write the finished landscape table to the data directory.
etl.tocsv(landscape, DATA / 'culture_landscape.csv')