In [2]:
import numpy as np
import pandas as pd
from dotenv import load_dotenv

from phmlondon.snow_utils import SnowflakeConnection

# Raise pandas' display limits so long / wide phenotype tables render in full.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 100

In [3]:
# Load variables from a local .env file into the process environment before
# connecting. NOTE(review): presumably SnowflakeConnection reads its account /
# user settings from these variables — confirm against phmlondon.snow_utils.
load_dotenv()

# Open the Snowflake session (the recorded output shows this triggers a
# browser-based identity-provider login), then select the database and schema
# used by the rest of the notebook.
snowsesh = SnowflakeConnection()
snowsesh.use_database("INTELLIGENCE_DEV")
snowsesh.use_schema("AI_CENTRE_FEATURE_STORE")

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/8076439c-5b1f-4e15-91ea-a0bff0b3bf16/saml2?SAMLRequest=nZJRb9owFIX%2FSuQ9x7EDYWABFSuwIrEVFZimvTmJAxaOnfo6TemvnxNA6h7ah71Z9rn3O77nju9eSxW8CAvS6AmimKBA6MzkUh8maL9bhkMUgOM658poMUFnAehuOgZeqorNanfUT%2BK5FuAC30gDax8mqLaaGQ4SmOalAOYytp39WLMYE8YBhHUeh64lOUjPOjpXsShqmgY3PWzsIYoJIREZRV7VSr6gd4jqc0ZljTOZUbeSV%2F%2BnDxA0Iv0W4RWesLkWfpP6MoLPKOlFBOxht9uEm8ftDgWz2%2B%2FujYa6FHYr7IvMxP5pfTEA3sF2kQzocIDrUwjGzxDzt9oKDNo0heInkZmyqp1vjf0pKkQeKXOQfmCr%2BQRVJ5m%2FmSLZx%2B7At8VqAYtysPydPnJLdXPcr4ecP8zP8bOyffr9kKHg1y3euI13BVCLlW5Ddf6KxElI4pCSXRwzQlhCcJz0%2F6Bg7kOVmruu8ua884FLmVkDpnBGK6lF53JIvg76vVEWJiktwr6gSTiigoecpEVB0l5a0EHURhejy%2Fqwzoid%2FtdQxtH7Ftd1%2FOkTWs03RsnsHCyNLbn7OECKaXcj87DopEyUXKpZnlsB4INUyjT3VnDnt97ZWqBoeqH%2Bu%2FfT

# PPMI 2D / 3D

In [4]:
def get_phenotype_data(snowsesh, lower_age, upper_age):
    """
    Get phenotype data for patients in an inclusive age range from Snowflake.

    Age is computed in SQL as ``DATEDIFF(year, DATE_OF_BIRTH, CURRENT_DATE())``
    over ``INTELLIGENCE_DEV.AI_CENTRE_FEATURE_STORE.PERSON_5YEAR_PHENOTYPE``.
    NOTE(review): DATEDIFF(year, ...) presumably counts calendar-year
    boundaries rather than completed years, so a patient may be counted as a
    year older before their birthday — confirm this is the intended definition.

    Parameters
    ----------
    snowsesh
        Session object exposing ``execute_query_to_df(query)``.
    lower_age, upper_age
        Inclusive age bounds. Must be finite numbers (or strings that parse as
        numbers, e.g. ``"65"``); they are validated before being placed in the
        SQL text so that arbitrary strings cannot inject SQL.

    Returns
    -------
    Whatever ``snowsesh.execute_query_to_df`` returns for the query: one row
    per matching person with ``PERSON_ID`` and the selected phenotype columns.

    Raises
    ------
    ValueError
        If either bound is not a finite number.
    """
    # The bounds are interpolated into the SQL text, so reduce each one to a
    # plain number first; anything that does not parse as a number is rejected.
    sql_bounds = {}
    for label, value in (("lower_age", lower_age), ("upper_age", upper_age)):
        try:
            number = float(value)
        except (TypeError, ValueError) as exc:
            raise ValueError(f"{label} must be a number of years, got {value!r}") from exc
        if not np.isfinite(number):
            raise ValueError(f"{label} must be a finite number of years, got {value!r}")
        # Render whole numbers without a trailing ".0" so the generated SQL is
        # the same text the unvalidated f-string produced for integer inputs.
        sql_bounds[label] = int(number) if number.is_integer() else number

    query = f"""
    WITH age_calc AS (
        SELECT *,
               DATEDIFF(year, DATE_OF_BIRTH, CURRENT_DATE()) as age
        FROM INTELLIGENCE_DEV.AI_CENTRE_FEATURE_STORE.PERSON_5YEAR_PHENOTYPE
    )
    SELECT
        PERSON_ID,
        ASTHMA, COPD, DIABETES_TYPE2, DIABETES_TYPE1,
        HYPERTENSION, CORONARY_HEART_DISEASE, MYOCARDIAL_INFARCTION,
        TIA, STROKE, CKD_STAGE1, CKD_STAGE3, DEPRESSION,
        SEVERE_MENTAL_ILLNESS, CANCER, DEMENTIA, ATRIAL_FIBRILLATION,
        PALLIATIVE_CARE, HEART_FAILURE
    FROM age_calc
    WHERE age >= {sql_bounds["lower_age"]} AND age <= {sql_bounds["upper_age"]}
    """

    return snowsesh.execute_query_to_df(query)

In [12]:
# Inclusive age band (in years) of the cohort pulled from Snowflake.
lower_age, upper_age = 65, 84

df = get_phenotype_data(snowsesh, lower_age, upper_age)

# Every returned column other than the patient identifier is a phenotype column.
phenotype_cols = [name for name in df.columns if not name == "PERSON_ID"]
n_phenotypes = len(phenotype_cols)

In [13]:
# 2D PPMI
#
# PPMI(x, y) = max(0, log(P(x, y) / (P(x) * P(y)))), where each probability is
# a patient count divided by the cohort size. A small epsilon is added to the
# numerator and denominator so that log(0) is never evaluated.

# Indicator matrix (patients x phenotypes): 1.0 where the flag equals 1, else 0.0.
phenotype_flags = (df[phenotype_cols] == 1).to_numpy(dtype=float)

# co-occurrence counts: entry [i, j] = patients flagged for both phenotype i
# and phenotype j (the diagonal holds the per-phenotype counts). One matrix
# product replaces a pandas mask pass per (i, j) pair.
cooccurrence_matrix = phenotype_flags.T @ phenotype_flags

# to dataframe
cooccurrence_df = pd.DataFrame(cooccurrence_matrix, index=phenotype_cols, columns=phenotype_cols)

# ppmi - individual probabilities
total_patients = len(df)
marginal_probs = phenotype_flags.sum(axis=0) / total_patients
individual_probs = dict(zip(phenotype_cols, marginal_probs, strict=True))

# ppmi - matrix
epsilon = 1e-10
joint_probs = cooccurrence_matrix / total_patients
expected_probs = marginal_probs[:, None] * marginal_probs[None, :]
pmi_matrix = np.log((joint_probs + epsilon) / (expected_probs + epsilon))
# Clip negatives to zero; np.where (rather than np.maximum) also maps NaN to 0,
# matching the behaviour of the scalar max(0, pmi).
ppmi_matrix = np.where(pmi_matrix > 0, pmi_matrix, 0.0)

# PPMI df
ppmi_df = pd.DataFrame(ppmi_matrix, index=phenotype_cols, columns=phenotype_cols)

# File name follows the cohort's age bounds so a different cohort cannot
# silently overwrite (or be mislabelled as) the 65-84 results.
ppmi_df.to_csv(f"ppmi_2d_age{lower_age}-{upper_age}.csv")

# print top pairs (upper triangle only so each pair appears once); the 20
# largest values are printed in ascending order, i.e. strongest pair last
print("Top phenotype associations (by PPMI):")
upper_tri = np.triu(ppmi_matrix, k=1)
indices = np.unravel_index(np.argsort(upper_tri.ravel())[-20:], upper_tri.shape)
for idx in zip(*indices, strict=False):
    print(f"{phenotype_cols[idx[0]]} - {phenotype_cols[idx[1]]}: {ppmi_matrix[idx]:.2f}")

Top phenotype associations (by PPMI):
TIA - ATRIAL_FIBRILLATION: 0.56
TIA - DEMENTIA: 0.59
MYOCARDIAL_INFARCTION - ATRIAL_FIBRILLATION: 0.60
STROKE - HEART_FAILURE: 0.61
CKD_STAGE3 - HEART_FAILURE: 0.61
COPD - PALLIATIVE_CARE: 0.64
SEVERE_MENTAL_ILLNESS - PALLIATIVE_CARE: 0.66
SEVERE_MENTAL_ILLNESS - DEMENTIA: 0.78
STROKE - ATRIAL_FIBRILLATION: 0.88
PALLIATIVE_CARE - HEART_FAILURE: 0.90
DEPRESSION - SEVERE_MENTAL_ILLNESS: 0.97
CANCER - PALLIATIVE_CARE: 0.99
STROKE - PALLIATIVE_CARE: 1.02
CORONARY_HEART_DISEASE - HEART_FAILURE: 1.03
STROKE - DEMENTIA: 1.15
MYOCARDIAL_INFARCTION - HEART_FAILURE: 1.37
ATRIAL_FIBRILLATION - HEART_FAILURE: 1.50
TIA - STROKE: 1.63
DEMENTIA - PALLIATIVE_CARE: 2.01
CORONARY_HEART_DISEASE - MYOCARDIAL_INFARCTION: 2.03


In [14]:
# 3D PPMI
#
# PPMI(x, y, z) = max(0, log(P(x, y, z) / (P(x) * P(y) * P(z)))): the observed
# triple probability is compared with full independence of the three
# phenotypes. Entries with a repeated index (e.g. i == j) reduce to pair or
# single-phenotype counts; they are exported as-is, but only i < j < k
# combinations are ranked below.

# Indicator matrix (patients x phenotypes): 1.0 where the flag equals 1, else 0.0.
phenotype_flags = (df[phenotype_cols] == 1).to_numpy(dtype=float)
total_patients = len(df)

# 3d co-occurrence counts: entry [i, j, k] = patients flagged for all three
# phenotypes. One matrix product per first phenotype replaces a pandas mask
# pass per (i, j, k) triple.
cooccurrence_3d = np.zeros((n_phenotypes, n_phenotypes, n_phenotypes))
for i in range(n_phenotypes):
    flags_given_first = phenotype_flags * phenotype_flags[:, [i]]
    cooccurrence_3d[i] = flags_given_first.T @ phenotype_flags

# ppmi - individual probabilities
marginal_probs = phenotype_flags.sum(axis=0) / total_patients
individual_probs = dict(zip(phenotype_cols, marginal_probs, strict=True))

# ppmi - pair probabilities
# NOTE(review): these are not used by the expected-probability formula below
# (which assumes full independence); kept so the name stays available.
pair_counts = phenotype_flags.T @ phenotype_flags
pair_probs = {
    (pheno1, pheno2): pair_counts[i, j] / total_patients
    for i, pheno1 in enumerate(phenotype_cols)
    for j, pheno2 in enumerate(phenotype_cols)
}

# 3D array for ppmi
epsilon = 1e-10
joint_probs_3d = cooccurrence_3d / total_patients
# expected probability under independence: P(x) * P(y) * P(z)
expected_probs_3d = (
    marginal_probs[:, None, None] * marginal_probs[None, :, None] * marginal_probs[None, None, :]
)
pmi_3d = np.log((joint_probs_3d + epsilon) / (expected_probs_3d + epsilon))
# Clip negatives to zero; np.where also maps NaN to 0, like the scalar max(0, pmi).
ppmi_3d = np.where(pmi_3d > 0, pmi_3d, 0.0)

# top unique combinations: i < j < k visits each unordered triple exactly once,
# and only positive associations are kept
top_relationships = [
    {
        "conditions": (phenotype_cols[i], phenotype_cols[j], phenotype_cols[k]),
        "ppmi": ppmi_3d[i, j, k],
        "count": cooccurrence_3d[i, j, k],
    }
    for i, j, k in np.ndindex(*ppmi_3d.shape)
    if i < j < k and ppmi_3d[i, j, k] > 0
]
top_relationships.sort(key=lambda rel: rel["ppmi"], reverse=True)

# PPMI df: one row per ordered (i, j, k) triple, in row-major index order
relationships_data = [
    {
        "condition1": phenotype_cols[i],
        "condition2": phenotype_cols[j],
        "condition3": phenotype_cols[k],
        "ppmi": ppmi_3d[i, j, k],
        "count": cooccurrence_3d[i, j, k],
    }
    for i, j, k in np.ndindex(*ppmi_3d.shape)
]
relationships_df = pd.DataFrame(relationships_data)
# File name follows the cohort's age bounds so a different cohort cannot
# silently overwrite (or be mislabelled as) the 65-84 results.
relationships_df.to_csv(f"ppmi_3d_age{lower_age}-{upper_age}.csv", index=False)

# top relationships
print("Top 3D phenotype associations (by PPMI):")
for rel in top_relationships[:20]:
    print(f"{rel['conditions']}: PPMI={rel['ppmi']:.2f}, Count={rel['count']:.0f}")

Top 3D phenotype associations (by PPMI):
('STROKE', 'DEMENTIA', 'PALLIATIVE_CARE'): PPMI=3.66, Count=37
('CORONARY_HEART_DISEASE', 'MYOCARDIAL_INFARCTION', 'HEART_FAILURE'): PPMI=3.39, Count=383
('SEVERE_MENTAL_ILLNESS', 'DEMENTIA', 'PALLIATIVE_CARE'): PPMI=3.01, Count=32
('TIA', 'STROKE', 'DEMENTIA'): PPMI=2.95, Count=17
('TIA', 'DEMENTIA', 'PALLIATIVE_CARE'): PPMI=2.92, Count=24
('DIABETES_TYPE1', 'DEMENTIA', 'PALLIATIVE_CARE'): PPMI=2.79, Count=6
('CORONARY_HEART_DISEASE', 'MYOCARDIAL_INFARCTION', 'STROKE'): PPMI=2.79, Count=51
('CORONARY_HEART_DISEASE', 'MYOCARDIAL_INFARCTION', 'ATRIAL_FIBRILLATION'): PPMI=2.67, Count=253
('CORONARY_HEART_DISEASE', 'MYOCARDIAL_INFARCTION', 'TIA'): PPMI=2.59, Count=57
('TIA', 'STROKE', 'ATRIAL_FIBRILLATION'): PPMI=2.57, Count=27
('MYOCARDIAL_INFARCTION', 'ATRIAL_FIBRILLATION', 'HEART_FAILURE'): PPMI=2.55, Count=202
('DEMENTIA', 'ATRIAL_FIBRILLATION', 'PALLIATIVE_CARE'): PPMI=2.55, Count=68
('CORONARY_HEART_DISEASE', 'MYOCARDIAL_INFARCTION', 'PALLIAT