In [1]:
# Execute this first; it creates multiple test data sets from a sample file.

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

Data
==

Create directories
--

In [3]:
# Create the output directories for this notebook; data/interim receives the
# CSV tables exported further down.
# parents=True also creates a missing "data" folder, and exist_ok=True makes the
# cell safe to re-run (a plain `mkdir` errors once the directory exists).
for out_dir in ("data/interim", "data/processed"):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

Load & prepare test data
--

In [4]:
# Sample records taken from the Splink example data:
# https://github.com/moj-analytical-services/splink_demos/tree/master/data
raw_parquet = 'data/raw/historical_figures_with_errors_50k.parquet'
d = pd.read_parquet(raw_parquet)

# Preview the first rows of the raw frame.
d.head()

Unnamed: 0,unique_id,cluster,full_name,first_and_surname,first_name,surname,dob,birth_place,postcode_fake,gender,occupation
0,Q2296770-1,Q2296770,"thomas clifford, 1st baron clifford of chudleigh",thomas chudleigh,thomas,chudleigh,1630-08-01,devon,tq13 8df,male,politician
1,Q2296770-2,Q2296770,thomas of chudleigh,thomas chudleigh,thomas,chudleigh,1630-08-01,devon,tq13 8df,male,politician
2,Q2296770-3,Q2296770,tom 1st baron clifford of chudleigh,tom chudleigh,tom,chudleigh,1630-08-01,devon,tq13 8df,male,politician
3,Q2296770-4,Q2296770,thomas 1st chudleigh,thomas chudleigh,thomas,chudleigh,1630-08-01,devon,tq13 8hu,,politician
4,Q2296770-5,Q2296770,"thomas clifford, 1st baron chudleigh",thomas chudleigh,thomas,chudleigh,1630-08-01,devon,tq13 8df,,politician


In [5]:
# Feature vector
SEED = 42        # fixed seed: the same records land in the same segment on every run
N_SEGMENTS = 4   # number of tables the records are spread over

rng = np.random.default_rng(SEED)

features = (
    d.rename(columns={
        'birth_place': 'city',
        'postcode_fake': 'postcode',
    })[[
        'unique_id',

        # Address
        'city', 'postcode',

        # Name columns
        'full_name', 'first_and_surname', 'first_name', 'surname',

        # Other info
        'dob', 'gender',

        # For eval
        'cluster',
    ]]
    # Randomly assign each record to one of the tables 1..N_SEGMENTS for our match.
    # The upper bound of Generator.integers is exclusive, hence N_SEGMENTS + 1.
    .assign(segment=rng.integers(1, N_SEGMENTS + 1, size=len(d)))
)

# Summary: number of records per segment
features.segment.value_counts()

segment
1    12727
3    12696
4    12671
2    12484
Name: count, dtype: int64

In [6]:
# Quick look at the first five rows of the prepared feature table.
features.head(n=5)

Unnamed: 0,unique_id,city,postcode,full_name,first_and_surname,first_name,surname,dob,gender,cluster,segment
0,Q2296770-1,devon,tq13 8df,"thomas clifford, 1st baron clifford of chudleigh",thomas chudleigh,thomas,chudleigh,1630-08-01,male,Q2296770,3
1,Q2296770-2,devon,tq13 8df,thomas of chudleigh,thomas chudleigh,thomas,chudleigh,1630-08-01,male,Q2296770,3
2,Q2296770-3,devon,tq13 8df,tom 1st baron clifford of chudleigh,tom chudleigh,tom,chudleigh,1630-08-01,male,Q2296770,1
3,Q2296770-4,devon,tq13 8hu,thomas 1st chudleigh,thomas chudleigh,thomas,chudleigh,1630-08-01,,Q2296770,3
4,Q2296770-5,devon,tq13 8df,"thomas clifford, 1st baron chudleigh",thomas chudleigh,thomas,chudleigh,1630-08-01,,Q2296770,2


Export
--

In [7]:
# Write one CSV per segment, keeping every record (duplicates included).
for segment in features['segment'].unique():
    in_segment = features['segment'] == segment
    segment_rows = features.loc[in_segment].drop(columns=['segment'])
    segment_rows.to_csv(f'data/interim/table{segment}_with_duplicates.csv', index=False)

In [8]:
# De-duplicate: retain only the first record for each (segment, cluster) pair,
# so every table holds at most one record per cluster.
dedup_keys = ['segment', 'cluster']
features = features.drop_duplicates(subset=dedup_keys, keep='first')

# Records remaining per segment
features['segment'].value_counts()

segment
1    4413
3    4408
2    4404
4    4400
Name: count, dtype: int64

In [9]:
# Write one CSV per segment from the current `features` frame (no duplicates).
for segment in features['segment'].unique():
    out_path = f'data/interim/table{segment}.csv'
    (
        features[features['segment'] == segment]
        .drop(columns=['segment'])
        .to_csv(out_path, index=False)
    )