# Landscape Map - Stage 2

This stage attempts to cleanse the data, match to other sources of data and tag with categories (such as Individuals).

In [22]:
import duckdb
import pandas as pd
from thefuzz.process import extractOne, extractBests
from pipeline_utils.filesystem.paths import RAW_DATA

## Make direct matches to the company data

First, let's load the company data database.

In [23]:
# Open the company data database read-only, so nothing in this notebook can
# alter the stored company data.
company_db_path = RAW_DATA / 'company-data.db'
db = duckdb.connect(company_db_path, read_only=True)

Then we'll load our raw longlist into a temporary table.

In [24]:
# Load the distinct organisation names from the raw longlist into a temporary
# table. The CSV is located via RAW_DATA (as the pandas cells in this notebook
# do) rather than a hard-coded relative path, so every stage reads the same file.
# Single quotes in the path are doubled so it stays a valid SQL string literal.
longlist_csv = str(RAW_DATA / 'landscape-longlist-raw.csv').replace("'", "''")
db.sql(f'''CREATE TEMP TABLE tRaw AS SELECT DISTINCT organisation FROM read_csv('{longlist_csv}');''')

In [25]:
# Sanity check: how many distinct organisation names were loaded into tRaw.
db.sql('''SELECT COUNT(*) AS Count FROM tRaw''')

┌───────┐
│ Count │
│ int64 │
├───────┤
│   502 │
└───────┘

We'll create a table of direct matches, where the upper-cased organisation name exactly equals a registered company name.

In [26]:
# Build tDirect: every longlist row from tRaw, LEFT JOINed to CompanyData where
# the upper-cased organisation name equals CompanyName exactly. Longlist rows
# with no match are kept, with NULL in all of the company columns.
# The four SicText columns are collapsed into a single list column (sic_code)
# with NULL entries removed.
db.sql('''
       CREATE TEMP TABLE tDirect as SELECT r.*,
              CompanyName as registered_name,
              CompanyNumber as company_number,
              "URI" as uri,
              "RegAddress.PostTown" as post_town,
              "RegAddress.PostCode" as postcode,
              CompanyCategory as company_category,
              CompanyStatus as company_status,
              [x for x in [
                     "SICCode.SicText_1",
                     "SICCode.SicText_2",
                     "SICCode.SicText_3",
                     "SICCode.SicText_4"
              ] if x is not NULL] as sic_code,
              IncorporationDate as incorporation_date,
              DissolutionDate as dissolution_date,
              "Accounts.AccountCategory" as accounts_category
              -- , c.*
                        
       FROM tRaw r LEFT JOIN CompanyData c
       ON upper(r.organisation) == c.CompanyName;
''')

In [27]:
# Keep only the longlist rows that were joined to a registered company, and
# pull them into pandas.
matched_rows = db.sql('SELECT * from tDirect WHERE company_number IS NOT NULL')
direct_matches = matched_rows.df()

In [28]:
# Release the database connection; the direct matches are now held in pandas.
db.close()

In [29]:
# Write the direct matches out, ordered by organisation name.
company_data_csv = RAW_DATA / 'landscape-map-company-data.csv'
direct_matches_sorted = direct_matches.sort_values(by='organisation')
direct_matches_sorted.to_csv(company_data_csv, index=False)

### Extract a list of SIC Codes

In [30]:
# SIC descriptions that are deliberately left out of the exported list.
EXCLUDED_SIC_CODES = [
    'None Supplied',
    '56302 - Public houses and bars',
    '82990 - Other business support service activities n.e.c.',
    '85590 - Other education n.e.c.',
    '85600 - Educational support services',
    '87900 - Other residential care activities n.e.c.',
    '88100 - Social work activities without accommodation for the elderly and disabled',
    '88990 - Other social work activities without accommodation n.e.c.',
    '93120 - Activities of sport clubs',
    '96090 - Other service activities n.e.c.',
]

# One row per distinct SIC description found on the directly matched companies.
sic_codes = (
    direct_matches.sic_code
    .explode()
    # An empty sic_code list explodes to NaN, which would otherwise be
    # written to the CSV as a blank row.
    .dropna()
    .drop_duplicates()
    .rename('sic_code')
    .sort_values()
)

sic_codes.loc[~sic_codes.isin(EXCLUDED_SIC_CODES)].to_csv(
    RAW_DATA / 'sic_codes.csv', index=False
)

## Fix typos in longlist

Having matched the details, let's see if we can fuzzy match missing items in the longlist.

First, let's get a list of organisations that have been matched to Companies House data.

In [31]:
# Distinct organisation names that already have a direct company match; these
# are the reference spellings used for typo detection.
unique_matched = direct_matches['organisation'].unique()
matched_organisations = unique_matched.tolist()

Then load the raw longlist

In [32]:
# Load the full raw longlist (all columns) into pandas.
longlist_path = RAW_DATA / 'landscape-longlist-raw.csv'
raw = pd.read_csv(longlist_path)

In [33]:
# For each longlist name, find the closest already-matched organisation name
# scoring at least 90 (extractOne yields nothing below the cutoff).
best_match = raw.organisation.map(
    lambda name: extractOne(name, matched_organisations, score_cutoff=90)
)

# Spread each result into 'match' and 'score' columns; names with no result
# end up with missing values in both.
match_columns = best_match.apply(pd.Series, index=['match', 'score'])
scored = pd.concat([raw, match_columns], axis=1)

# A score of 100 is not a near miss, so only close-but-imperfect matches are
# kept as corrections.
near_misses = scored.query('score.notna() and score < 100')
corrections = near_misses.loc[:, ['organisation', 'match']].set_index('organisation')
corrections

Unnamed: 0_level_0,match
organisation,Unnamed: 1_level_1
Monkfish Productions CIC,Monkfish Productions CIO
Moving Parts Arts,Moving Parts Arts CIO
tiny dragon Productins,tiny dragon Productions


In [34]:
# Persist the suggested corrections, keyed by the original longlist spelling.
corrections_csv = RAW_DATA / 'landscape-map-corrections.csv'
corrections.to_csv(corrections_csv)

## Fuzzy match company data

In [35]:
# Organisation names already dealt with, either by a direct match or by a
# suggested correction; these are excluded from the fuzzy matching below.
corrected_rows = corrections.reset_index()
already_handled = pd.concat([direct_matches, corrected_rows])
drop_list = already_handled['organisation']

In [36]:
# Re-open the company data database (read-only) for the fuzzy-matching stage.
company_db_path = RAW_DATA / 'company-data.db'
db = duckdb.connect(company_db_path, read_only=True)

In [37]:
# Candidate companies for fuzzy matching: distinct name/number pairs for
# companies whose status is 'Active' and whose registered postcode starts
# with NE, DH or SR, ordered by name.
active_local_companies = db.query('''
                        SELECT DISTINCT CompanyName, CompanyNumber FROM CompanyData
                        WHERE CompanyStatus == 'Active'
                        AND "RegAddress.PostCode" SIMILAR TO '(NE|DH|SR).*'
                        ORDER BY CompanyName;
''')
companies = active_local_companies.df()

In [38]:
# Release the database connection; the candidate companies are now in pandas.
db.close()

In [39]:
# Read just the organisation column of the raw longlist. The column is selected
# by name rather than by position, so this keeps working if the CSV's column
# order ever changes.
candidates = pd.read_csv(
    RAW_DATA / 'landscape-longlist-raw.csv', usecols=['organisation']
).organisation

In [40]:
# Longlist names still unmatched: everything not already in drop_list, de-duplicated.
unhandled = candidates[~candidates.isin(drop_list)]
candidate_list = unhandled.unique().tolist()
def matcher(c, choices, score_cutoff=80):
    """Return the best fuzzy matches for ``c`` among ``choices``.

    Thin wrapper around ``thefuzz.process.extractBests`` so it can be used
    with ``Series.apply(matcher, choices=...)``.

    Args:
        c: The string to look up.
        choices: The candidate strings to compare ``c`` against.
        score_cutoff: Minimum score passed through to ``extractBests``.
            Defaults to 80, the value that was previously hard-coded.

    Returns:
        The result of ``extractBests``; the code that consumes it in this
        notebook treats each element as a ``(choice, score)`` pair.
    """
    return extractBests(c, choices, score_cutoff=score_cutoff)


In [41]:
# Collapse each run of non-word characters in the company name to a single
# space, then fuzzy match every company against the unmatched longlist names.
normalised_names = companies.CompanyName.str.replace(r'\W+', ' ', regex=True)
matches = normalised_names.apply(matcher, choices=candidate_list).rename('Matches')

In [42]:
# One row per (company, candidate match) pair; companies with no candidate
# matches explode to NaN and are dropped.
exploded = pd.concat([companies, matches], axis=1).explode('Matches').dropna()

# Split each (match, score) pair into its own column. Building the columns
# from comprehensions (rather than unpacking zip(*...)) also works when there
# are no matches at all, where the unpacking would raise ValueError.
res = exploded.assign(
    match=[name for name, _ in exploded.Matches],
    score=[score for _, score in exploded.Matches],
)

# Keep only matches scoring above 90. .assign returns a new frame, so the
# 'type' column is not written into a slice of `res`.
fuzzy_matches = res.loc[
    res.score > 90, ['match', 'CompanyName', 'CompanyNumber', 'score']
].assign(type='fuzzy')
fuzzy_matches.set_index('match').to_csv(RAW_DATA / 'landscape_fuzzy.csv')

## Identify possible individuals