# Landscape Map - Stage 2

This stage cleanses the raw longlist, matches it against other data sources, and tags entries with categories (such as Individuals).

In [7]:
import duckdb
import pandas as pd
from thefuzz.process import extractOne, extractBests

## Make direct matches to the company data

First, let's load the company data database.

In [8]:
# Open the company-data DuckDB file. read_only=True is passed so this
# connection is not opened for writing to the database file.
db = duckdb.connect('../raw/company-data.db', read_only=True)

Then we'll load our raw longlist into a temporary table.

In [9]:
# Load the distinct organisation names from the raw longlist into a temp table.
# OR REPLACE keeps the cell re-runnable: a plain CREATE TEMP TABLE raises if the
# cell is executed a second time on the same connection.
db.sql('''CREATE OR REPLACE TEMP TABLE tRaw AS SELECT DISTINCT organisation FROM read_csv('../raw/landscape-longlist-raw.csv');''')

In [10]:
# Sanity check: number of distinct organisation names loaded into tRaw.
db.sql('SELECT COUNT(*) AS Count FROM tRaw')

┌───────┐
│ Count │
│ int64 │
├───────┤
│   502 │
└───────┘

We'll create a table of direct matches by joining each longlist name, upper-cased, to the registered company name.

In [11]:
# Build tDirect: every longlist organisation LEFT JOINed to CompanyData on an
# exact match between the upper-cased longlist name and CompanyName, so rows
# without a match are kept with NULL company columns.
# sic_code collects the non-NULL SIC text columns into a single list.
# OR REPLACE keeps the cell re-runnable on a live connection (a plain
# CREATE TEMP TABLE raises once tDirect already exists).
db.sql('''
       CREATE OR REPLACE TEMP TABLE tDirect as SELECT r.*,
              CompanyName as registered_name,
              CompanyNumber as company_number,
              "URI" as uri,
              "RegAddress.PostTown" as post_town,
              "RegAddress.PostCode" as postcode,
              CompanyCategory as company_category,
              CompanyStatus as company_status,
              [x for x in [
                     "SICCode.SicText_1",
                     "SICCode.SicText_2",
                     "SICCode.SicText_3",
                     "SICCode.SicText_4"
              ] if x is not NULL] as sic_code,
              IncorporationDate as incorporation_date,
              DissolutionDate as dissolution_date,
              "Accounts.AccountCategory" as accounts_category
       FROM tRaw r LEFT JOIN CompanyData c
       ON upper(r.organisation) == c.CompanyName;
''')

In [12]:
# Pull only the rows that found a registered company (non-NULL company_number)
# out of DuckDB and into a pandas DataFrame.
direct_matches = (
    db.sql('SELECT * from tDirect WHERE company_number IS NOT NULL')
    .df()
)

In [13]:
# The direct matches are now held in the direct_matches DataFrame, so the
# database connection is no longer needed for this section.
db.close()

In [14]:
# Persist the direct matches ordered by longlist name; the DataFrame index is
# not written to the file.
(
    direct_matches
    .sort_values('organisation')
    .to_csv('../raw/landscape-map-company-data.csv', index=False)
)

## Fix typos in longlist

Having matched the details, let's see if we can fuzzy match missing items in the longlist.

First, let's get a list of organisations that have been matched to Companies House data.

In [15]:
# Unique longlist names that already have a direct company match; these are
# the reference spellings used for typo detection below.
matched_organisations = direct_matches['organisation'].unique().tolist()

Then load the raw longlist

In [16]:
# Read the longlist CSV into pandas with all of its columns; no de-duplication
# is applied here, unlike the DISTINCT load into tRaw.
raw = pd.read_csv('../raw/landscape-longlist-raw.csv')

In [17]:
# For each longlist name, ask thefuzz for its single best match among the
# already-matched names (cutoff 90). extractOne yields None when nothing clears
# the cutoff, which expands to a row of NaN in the two-column frame.
longlist_best_match = raw.organisation.map(
    lambda name: extractOne(name, matched_organisations, score_cutoff=90)
).apply(pd.Series, index=['match', 'score'])

# Keep near-misses only: a score was found, but it is below 100 (a score of
# 100 means the name is already spelled like a matched organisation).
corrections = (
    pd.concat([raw, longlist_best_match], axis=1)
    .query('score.notna() and score < 100')
    .loc[:, ['organisation', 'match']]
    .set_index('organisation')
)
corrections

Unnamed: 0_level_0,match
organisation,Unnamed: 1_level_1
Monkfish Productions CIC,Monkfish Productions CIO
Moving Parts Arts,Moving Parts Arts CIO
tiny dragon Productins,tiny dragon Productions


In [18]:
# Save the suggested corrections; the index (the longlist spelling in
# `organisation`) is written as the first column, followed by `match`.
corrections.to_csv('../raw/landscape-map-corrections.csv')

## Fuzzy match company data

In [19]:
# Names that need no further matching: those matched directly plus those
# identified as misspellings of a matched name.
drop_list = pd.concat(
    [direct_matches, corrections.reset_index()]
)['organisation']

In [20]:
# Re-open the company-data DuckDB file (the earlier connection was closed),
# again passing read_only=True.
db = duckdb.connect('../raw/company-data.db', read_only=True)

In [21]:
# Load the pool of companies to fuzzy-match against: distinct (name, number)
# pairs whose status is 'Active' and whose registered postcode matches the
# pattern (NE|DH|SR).* -- presumably the North East England postcode areas;
# TODO confirm the intended geography.
companies = db.query('''
                        SELECT DISTINCT CompanyName, CompanyNumber FROM CompanyData
                        WHERE CompanyStatus == 'Active'
                        AND "RegAddress.PostCode" SIMILAR TO '(NE|DH|SR).*'
                        ORDER BY CompanyName;
''').df()

In [22]:
# `companies` is already a pandas DataFrame, so the connection can be closed.
db.close()

In [23]:
# usecols=[0] reads only the first CSV column; the `.organisation` attribute
# access relies on that column being named `organisation`.
candidates = pd.read_csv('../raw/landscape-longlist-raw.csv', usecols=[0]).organisation

In [24]:
# Unique longlist names that are neither directly matched nor already flagged
# as a correction -- these are the names still needing a company match.
candidate_list = (
    candidates.loc[~candidates.isin(drop_list)]
    .unique()
    .tolist()
)
def matcher(c, choices, score_cutoff=80):
    """Return the fuzzy matches for ``c`` among ``choices``.

    Thin wrapper around ``thefuzz.process.extractBests`` so that it can be
    handed to ``Series.apply`` with ``choices`` supplied as a keyword argument.

    Parameters
    ----------
    c : str
        The query string to look up.
    choices : list of str
        Strings to compare ``c`` against.
    score_cutoff : int, default 80
        Passed straight through to ``extractBests``. The default reproduces
        the previously hard-coded value.

    Returns
    -------
    list
        Whatever ``extractBests`` returns; downstream code unpacks each element
        as a ``(choice, score)`` pair.
    """
    return extractBests(c, choices, score_cutoff=score_cutoff)


In [40]:
# Collapse each run of non-word characters in the registered name to a single
# space, then collect the fuzzy matches from candidate_list for every company.
# NOTE(review): this runs the matcher once per company row, so it is likely the
# slowest cell in the notebook.
matches = (
    companies['CompanyName']
    .str.replace(r'\W+', ' ', regex=True)
    .apply(matcher, choices=candidate_list)
    .rename('Matches')
)

In [48]:
# One row per (company, candidate) pair: explode each company's list of
# (candidate, score) tuples. Companies with an empty list explode to NaN and
# are removed by dropna().
res = pd.concat([companies, matches], axis=1).explode('Matches').dropna()

# Unpack the tuples into two columns. Comprehensions are used instead of
# `zip(*res.Matches)` because unpacking an empty zip raises ValueError when no
# pair survived the step above.
res['match'] = [candidate for candidate, _ in res.Matches]
res['score'] = [score for _, score in res.Matches]

# Keep only the strong matches (score above 90). .assign() returns a new frame,
# so the extra column is never written into a slice of `res`, which can emit
# SettingWithCopyWarning.
fuzzy_matches = res.loc[
    res.score > 90, ['match', 'CompanyName', 'CompanyNumber', 'score']
].assign(type='fuzzy')
fuzzy_matches.set_index('match').to_csv('../raw/landscape_fuzzy.csv')

## Identify possible individuals