In [1]:
import optuna as op
import pandas as pd
import numpy as np
from scipy.spatial.transform import Rotation
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

This is a slightly lower-tech process that attempts to align the datasets via Procrustes transformations. Each embedding is computed independently of the other embeddings. We then attempt to find a rotation and shift of each dataset that minimise the Procrustes distance between datasets.

Suppose we have two sets of data, $X = \{x_1, \dots, x_n\}$ and $Y = \{y_1, \dots, y_n\}$, that are comparable, such that $x_i$ and $y_i$ are related.

Define the Procrustes distance as the sum of Euclidean distances between related points: $D_p(X, Y) = \sum_{i=1}^{n} \|x_i - y_i\|_2$

We seek a rotation $R$ and a shift $C$, giving $Y' = RY + C$, that minimise $D_p(X, Y')$. Every column of $C$ is the same vector, i.e. the same shift is applied to every point.

In [2]:
# Load the per-dataset symptom embeddings; each was produced independently of the others.
# index_col=0 uses the first CSV column as the row index rather than as data.
# NOTE(review): later cells use `.values` as a (2, n_symptoms) matrix, one column
# per symptom (a 2x2 rotation is applied with R @ Y) — confirm the CSV layout.
p2 = pd.read_csv('Data/TTI_Pillar2/SymptomsUMAP_loose_clusteringOrigin P2.csv', index_col=0)
sgss = pd.read_csv('Data/TTI_SGSS/SymptomsUMAP_loose_clusteringOrigin SGSS.csv', index_col=0)
css = pd.read_csv('Data/CovidSymptomStudy/UMAPLooseWide.csv', index_col=0)
cis = pd.read_csv('Data/CommunityInfectionSurvey/SymptomsUMAP_loose_clustering.csv', index_col=0)

In [3]:
class ProcrustesAlignment:
    """Align one 2-D embedding onto another using a rotation plus a shift.

    ``X`` is the reference embedding and ``Y`` is the embedding that gets moved.
    Both are used as ``(2, n_points)`` arrays: the rotation is a 2x2 matrix
    applied as ``R @ Y`` and the shift is a ``(2, 1)`` column broadcast over all
    points. ``X_mapping_idx[k]`` and ``Y_mapping_idx[k]`` are the column
    positions (in ``X`` and ``Y`` respectively) of the k-th pair of related
    points; only these pairs contribute to the distance being minimised.

    The transformation parameters are searched with Optuna rather than solved
    in closed form.
    """

    def __init__(self, X: np.ndarray, Y: np.ndarray, X_mapping_idx: list[int], Y_mapping_idx: list[int]) -> None:
        """Store the two embeddings and the paired column indices.

        Raises:
            ValueError: if the two index lists have different lengths. Without
                this check a length-1 list would silently broadcast against the
                other and pair one point with many.
        """
        if len(X_mapping_idx) != len(Y_mapping_idx):
            raise ValueError(
                'X_mapping_idx and Y_mapping_idx must have the same length, got '
                f'{len(X_mapping_idx)} and {len(Y_mapping_idx)}.'
            )
        self.X = X
        self.Y = Y
        self.X_mapping_idx = X_mapping_idx
        self.Y_mapping_idx = Y_mapping_idx
        self.optimized = False

    def compute_procrustes_distance(self, X: np.ndarray, Y_dash: np.ndarray) -> float:
        """Return the sum of Euclidean distances between the paired points.

        ``X`` and ``Y_dash`` are transposed so that each row is one point; the
        mapping indices then select the matched rows from each.
        """
        return np.sum(np.sqrt( ((X.T[self.X_mapping_idx,:] - Y_dash.T[self.Y_mapping_idx,:])**2).sum(axis = 1) ) )

    def get_rotation_matrix(self, theta) -> np.ndarray:
        """Return the 2x2 rotation matrix for an angle ``theta`` given in degrees."""
        theta = np.radians(theta)
        c, s = np.cos(theta), np.sin(theta)
        return np.array(((c, -s), (s, c)))

    def transform_Y(self, theta: float, x_shift: float, y_shift: float) -> np.ndarray:
        """Rotate ``Y`` by ``theta`` degrees, then add ``(x_shift, y_shift)`` to every point."""
        return np.matmul(self.get_rotation_matrix(theta), self.Y) + np.array([[x_shift], [y_shift]])

    def eval_transformation(self, theta: float, x_shift: float, y_shift: float) -> float:
        """Return the Procrustes distance between ``X`` and the transformed ``Y``."""
        return self.compute_procrustes_distance(self.X, self.transform_Y(theta, x_shift, y_shift))

    def optimize(self, n_trials: int, seed=None) -> None:
        """Search for the transformation that minimises the Procrustes distance.

        Args:
            n_trials: number of Optuna trials to run.
            seed: optional integer seed for the sampler. When given, a seeded
                ``TPESampler`` is used so repeated runs give the same result;
                when ``None`` Optuna's default (unseeded) sampler is used.

        The study is stored on ``self.study`` and its best parameters on
        ``self.best_params``.
        """
        sampler = None if seed is None else op.samplers.TPESampler(seed=seed)
        self.study = op.create_study(direction='minimize', sampler=sampler)
        self.study.optimize(self.objective, n_trials=n_trials)
        self.best_params = self.study.best_params
        # Only flag success once best_params is actually available.
        self.optimized = True

    def objective(self, trial):
        """Optuna objective: sample an angle in [0, 360] degrees and shifts in [-10, 10]."""
        theta = trial.suggest_float('theta', 0, 360)
        x_shift = trial.suggest_float('x_shift', -10, 10)
        y_shift = trial.suggest_float('y_shift', -10, 10)
        return self.eval_transformation(theta, x_shift, y_shift)

    def get_optimal_rotation(self) -> np.ndarray:
        """Return ``Y`` transformed with the best parameters found by ``optimize``.

        Raises:
            RuntimeError: if ``optimize`` has not been run. Previously this case
                printed a message and returned ``None``, which callers could
                unknowingly pass on as data.
        """
        if not self.optimized:
            raise RuntimeError('Optimisation has not yet been performed.')
        return self.transform_Y(self.best_params['theta'], self.best_params['x_shift'], self.best_params['y_shift'])

We can only align symptoms that are shared across datasets, so we need to find the mapping from one dataset to the other.

In [4]:
# load the lookup into memory
symptom_name_category_lookup = pd.read_csv('Data/Lookups/SymptomNameCategoryLookup.csv')

# subset the lookup for each dataset
ctas_lookup = symptom_name_category_lookup[symptom_name_category_lookup.dataset == 'CTAS']
css_lookup = symptom_name_category_lookup[symptom_name_category_lookup.dataset == 'Zoe']
cis_lookup = symptom_name_category_lookup[symptom_name_category_lookup.dataset == 'ONS']


def _build_symptom_table(dataset, lookup):
    """Return one lookup row per column of ``dataset``, in column order.

    Row position k of the result must describe column k of ``dataset``, because
    the alignment later uses row positions of this table as column positions of
    the embedding. A left join keeps every column (unmatched ones get NaN
    lookup fields) and ``validate='many_to_one'`` raises if the lookup holds
    duplicate raw names, which would otherwise insert extra rows. The default
    inner join gives neither guarantee and would silently shift positions.
    """
    symptoms = pd.DataFrame(dataset.columns, columns=['symptom'])
    merged = pd.merge(
        left=symptoms,
        right=lookup,
        how='left',
        left_on='symptom',
        right_on='symptom_name_raw',
        validate='many_to_one',
    )
    return merged[['symptom', 'symptom_id', 'symptom_name_formatted', 'category']]


# join each dataset's raw symptom names to the lookup, this allows us to map the symptoms between datasets
# (Pillar 2 and SGSS both use the CTAS lookup)
p2_symptoms = _build_symptom_table(p2, ctas_lookup)
sgss_symptoms = _build_symptom_table(sgss, ctas_lookup)
css_symptoms = _build_symptom_table(css, css_lookup)
cis_symptoms = _build_symptom_table(cis, cis_lookup)

# work out which ids are common across all datasets
# (columns with no lookup entry have a NaN id and can never be shared, so drop them here)
symptom_ids = [
    p2_symptoms.symptom_id.dropna().values,
    sgss_symptoms.symptom_id.dropna().values,
    css_symptoms.symptom_id.dropna().values,
    cis_symptoms.symptom_id.dropna().values
]

shared_ids = symptom_ids[0]
for id_set in symptom_ids[1:]:
    shared_ids = np.intersect1d(shared_ids, id_set)

In [5]:
# convenience function for mapping symptoms in one data to the other
# need to provide a list of the symptoms that are common across all datasets
def get_mapping_indices(symptoms_from, symptoms_to, common_symptom_ids):
    """Pair up row positions of symptoms that appear in both tables.

    Both tables must expose a ``symptom_id`` column. For every row of
    ``symptoms_from`` whose id is in ``common_symptom_ids``, each row of
    ``symptoms_to`` carrying the same id yields one pair.

    Returns:
        ``(from_index, to_index)``: two equal-length lists where
        ``from_index[k]`` and ``to_index[k]`` are the row positions of the k-th
        matched pair, ordered by position in ``symptoms_from`` and then by
        position in ``symptoms_to``.
    """
    source_ids = symptoms_from.symptom_id.values
    target_ids = symptoms_to.symptom_id.values

    matched_pairs = [
        (source_pos, target_pos)
        for source_pos, source_id in enumerate(source_ids)
        if source_id in common_symptom_ids
        for target_pos, target_id in enumerate(target_ids)
        if target_id == source_id
    ]

    from_index = [source_pos for source_pos, _ in matched_pairs]
    to_index = [target_pos for _, target_pos in matched_pairs]
    return from_index, to_index

In [6]:
# Example call: row positions of the shared symptoms in the Pillar 2 table and in the CSS table.
# NOTE(review): `align` computes its own local mapping, so these module-level
# results look unused in the visible cells — confirm before relying on them.
from_idx, to_idx = get_mapping_indices(p2_symptoms, css_symptoms, common_symptom_ids=shared_ids)

In [7]:
def align(dataset, dataset_symptoms, n_trials=500):
    """Align ``dataset`` onto the Pillar 2 embedding and return it as a DataFrame.

    Uses the notebook-level ``p2``, ``p2_symptoms`` and ``shared_ids``: the
    symptoms shared across datasets are paired between ``p2_symptoms`` and
    ``dataset_symptoms``, and a rotation plus shift of ``dataset`` is searched
    for that brings those pairs together.

    Args:
        dataset: embedding to move; its ``.values`` are passed as ``Y``.
        dataset_symptoms: symptom table for ``dataset`` (needs ``symptom_id``).
        n_trials: number of optimisation trials (default 500, as before).

    Returns:
        A DataFrame with the same index and columns as ``dataset`` holding the
        transformed coordinates. Keeping the index means the aligned frames are
        labelled the same way as the reference frame they are saved next to.

    Raises:
        ValueError: if no symptoms could be paired; the objective would then be
            constant and the returned transformation arbitrary.
    """
    from_idx, to_idx = get_mapping_indices(p2_symptoms, dataset_symptoms, shared_ids)
    if not from_idx:
        raise ValueError('No shared symptoms found between the reference and the dataset; cannot align.')
    aligner = ProcrustesAlignment(X = p2.values, Y = dataset.values, X_mapping_idx=from_idx, Y_mapping_idx=to_idx)
    aligner.optimize(n_trials=n_trials)
    aligned_embedding = aligner.get_optimal_rotation()
    return pd.DataFrame(data=aligned_embedding, index=dataset.index, columns=dataset.columns)

In [8]:
# We align every dataset relative to the Pillar 2 embedding, which is left untouched.
# The choice of reference dataset is not expected to make a difference.
# NOTE(review): each call rebinds the input name to its aligned version, so
# re-running this cell aligns the already-aligned frames rather than the raw
# embeddings; the optimiser is also unseeded, so results vary between runs.
sgss = align(sgss, sgss_symptoms)
css = align(css, css_symptoms)
cis = align(cis, cis_symptoms)

[32m[I 2021-08-18 10:27:07,149][0m A new study created in memory with name: no-name-633d6ca9-1ef3-4172-87de-84f8bd5901be[0m
[32m[I 2021-08-18 10:27:07,165][0m Trial 0 finished with value: 108.21599535282306 and parameters: {'theta': 231.71169343537449, 'x_shift': 7.875632725982722, 'y_shift': -5.088079514387697}. Best is trial 0 with value: 108.21599535282306.[0m
[32m[I 2021-08-18 10:27:07,168][0m Trial 1 finished with value: 54.009588744883885 and parameters: {'theta': 13.815812818549084, 'x_shift': 6.227629488171186, 'y_shift': -6.398231297844578}. Best is trial 1 with value: 54.009588744883885.[0m
[32m[I 2021-08-18 10:27:07,171][0m Trial 2 finished with value: 142.3436896446308 and parameters: {'theta': 244.07855253461122, 'x_shift': -7.213628313952835, 'y_shift': -7.687943864519156}. Best is trial 1 with value: 54.009588744883885.[0m
[32m[I 2021-08-18 10:27:07,173][0m Trial 3 finished with value: 82.2782088485873 and parameters: {'theta': 331.2173561047863, 'x_shift':

In [14]:
# Save the reference embedding (p2 is not reassigned in the visible cells) and the three aligned embeddings.
# NOTE(review): this cell's execution count (14) skips 9-13 — confirm that a
# clean Restart & Run All reproduces these files.
# NOTE(review): the output directory must already exist; to_csv does not create it.
p2.to_csv('Data/Alignments/ProcrustesAlignments/p2_loose.csv')
sgss.to_csv('Data/Alignments/ProcrustesAlignments/sgss_loose.csv')
css.to_csv('Data/Alignments/ProcrustesAlignments/css_loose.csv')
cis.to_csv('Data/Alignments/ProcrustesAlignments/cis_loose.csv')