In [1]:
from dotenv import load_dotenv
from phmlondon.snow_utils import SnowflakeConnection
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import streamlit as st
import pandas as pd
import altair as alt

# Raise pandas' display limits so wide/long frames are shown in full in cell output.
for option_name, option_value in (
    ("display.max_rows", 200),
    ("display.max_columns", 100),
):
    pd.set_option(option_name, option_value)

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably SnowflakeConnection reads its connection settings
# from these variables — confirm against phmlondon.snow_utils.
load_dotenv()

# Create the Snowflake session used by every query in this notebook, then
# select the database and schema that unqualified object names resolve against.
snowsesh = SnowflakeConnection()
snowsesh.use_database("INTELLIGENCE_DEV")
snowsesh.use_schema("AI_CENTRE_FEATURE_STORE")

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/8076439c-5b1f-4e15-91ea-a0bff0b3bf16/saml2?SAMLRequest=nZJBc9owEIX%2Fikc925YNplgDZEjcpMzQQrGTTrjJ9ho02JIjyRj66ysbmEkPyaE3jfR2v6d9O7k7VaV1BKmY4FPkORhZwDORM76boufk0R4jS2nKc1oKDlN0BoXuZhNFq7Im80bv%2BQbeGlDaMo24It3DFDWSE0EVU4TTChTRGYnnP5bEdzChSoHUBoeuJblihrXXuiau27at0w4cIXeujzF2cegaVSf5gt4h6s8ZtRRaZKK8lZzMnz5AeC4edgijMIT1tfCe8csIPqOkF5Ei35Nkba9XcYKs%2Be13D4KrpgIZgzyyDJ43y4sBZRzE34KRNx45zcFWwszQoX8aCY7ioi1KeoBMVHWjTWvHnNwCcrcUO2YGtoimqD6w%2FIUunuORjLO35eYp2m5f4fcxDO9fV6tzPqdPvxIql6d9lLJIZch6ucXrd%2FEulGpgwbtQtbnCfmBj38Zh4vtkMCTB2MGBt0VWZEJlnOq%2B8ua89%2BFULJNCiUILXjIOvcsx%2FjoaDsLMDlKvsIfgBXboAbUpTosCp4O08EZuF52PLutDeiNy9l9DmbjvW1zX8adJaBGtRcmys%2FUoZEX1xwF6jtffsNwueimBirJynucSlDJBlqVoHyRQbbZeywaQO7tQ%2F9372V8%3D

# Positive pointwise mutual information (PPMI): 2D (pairs) / 3D (triples)

In [3]:
def get_phenotype_data(snowsesh, lower_age, upper_age):
    """
    Get phenotype data for patients in an inclusive age range from Snowflake.

    Args:
        snowsesh: Session object exposing ``execute_query_to_df(query)``.
        lower_age: Minimum age in years (inclusive). Must be convertible to
            ``int``; fractional values are truncated by ``int()``.
        upper_age: Maximum age in years (inclusive). Same rules as ``lower_age``.

    Returns:
        Whatever ``snowsesh.execute_query_to_df`` returns for the query: the
        PERSON_ID column plus the phenotype columns listed in the SELECT.

    Raises:
        ValueError, TypeError: If either bound cannot be converted to ``int``.
    """
    # The bounds are interpolated into the SQL text, so force them to plain
    # integers first: arbitrary strings can then never reach the query.
    lower_age = int(lower_age)
    upper_age = int(upper_age)

    # NOTE(review): DATEDIFF(year, ...) counts calendar-year boundaries crossed,
    # not completed years, so `age` can be one higher than the patient's actual
    # age when their birthday falls later in the current year — confirm this is
    # acceptable for cohort selection.
    query = f"""
    WITH age_calc AS (
        SELECT *,
               DATEDIFF(year, DATE_OF_BIRTH, CURRENT_DATE()) as age
        FROM INTELLIGENCE_DEV.AI_CENTRE_FEATURE_STORE.PERSON_5YEAR_PHENOTYPE
    )
    SELECT
        PERSON_ID,
        ASTHMA, COPD, DIABETES_TYPE2, DIABETES_TYPE1,
        HYPERTENSION, CORONARY_HEART_DISEASE, MYOCARDIAL_INFARCTION,
        TIA, STROKE, CKD_STAGE1, CKD_STAGE3, DEPRESSION,
        SEVERE_MENTAL_ILLNESS, CANCER, DEMENTIA, ATRIAL_FIBRILLATION,
        PALLIATIVE_CARE, HEART_FAILURE
    FROM age_calc
    WHERE age >= {lower_age} AND age <= {upper_age}
    """

    return snowsesh.execute_query_to_df(query)

In [4]:
# Inclusive age bounds (in years) handed to the phenotype query.
lower_age, upper_age = 18, 79

df = get_phenotype_data(snowsesh, lower_age, upper_age)

# Every returned column other than the patient identifier is a phenotype column.
phenotype_cols = list(filter(lambda name: name != 'PERSON_ID', df.columns))
n_phenotypes = len(phenotype_cols)

Error executing query: (1304): 01ba494a-0000-79a3-0000-08090e4de3aa: 000904 (42000): SQL compilation error: error line 8 at position 22
invalid identifier 'DIABETES'


SnowparkSQLException: (1304): 01ba494a-0000-79a3-0000-08090e4de3aa: 000904 (42000): SQL compilation error: error line 8 at position 22
invalid identifier 'DIABETES'

In [None]:
# 2D PPMI
#
# Positive pointwise mutual information for every pair of phenotype columns:
#   PPMI(x, y) = max(0, ln(P(x, y) / (P(x) * P(y))))
# Uses `df` and `phenotype_cols` from the data-loading cell.

total_patients = len(df)
if total_patients == 0:
    # Every probability below divides by the cohort size.
    raise ValueError("Cannot compute PPMI: the phenotype dataframe has no rows.")

# patients x phenotypes indicator matrix; a phenotype counts only when its flag
# equals 1 (missing values are treated as "not present", as a skipped NA would be)
has_phenotype = (df[phenotype_cols] == 1).fillna(False).to_numpy(dtype=float)

# co-occurrence counts: entry [i, j] = number of patients with both i and j.
# One matrix product replaces a full-column scan per pair.
cooccurrence_matrix = has_phenotype.T @ has_phenotype

# to dataframe
cooccurrence_df = pd.DataFrame(
    cooccurrence_matrix,
    index=phenotype_cols,
    columns=phenotype_cols
)

# ppmi - individual probabilities (the diagonal holds per-phenotype patient counts)
prevalence = np.diag(cooccurrence_matrix) / total_patients
individual_probs = dict(zip(phenotype_cols, prevalence))

# ppmi - matrix (epsilon smoothing to avoid log(0))
epsilon = 1e-10
p_xy = cooccurrence_matrix / total_patients      # joint probabilities
p_x_p_y = np.outer(prevalence, prevalence)       # expected under independence
pmi = np.log((p_xy + epsilon) / (p_x_p_y + epsilon))
ppmi_matrix = np.maximum(pmi, 0.0)

# PPMI df
ppmi_df = pd.DataFrame(
    ppmi_matrix,
    index=phenotype_cols,
    columns=phenotype_cols
)

ppmi_df.to_csv('ppmi_2d.csv')

# print top pairs
print("Top phenotype associations (by PPMI):")
upper_tri = np.triu(ppmi_matrix, k=1)
# Rank only genuine pairs (i < j). Ranking the flattened, zero-filled triangle
# could select diagonal / lower-triangle cells and print their scores instead.
pair_rows, pair_cols = np.triu_indices(ppmi_matrix.shape[0], k=1)
pair_scores = ppmi_matrix[pair_rows, pair_cols]
order = np.argsort(-pair_scores, kind="stable")[:20]   # strongest first
order = order[pair_scores[order] > 0]                  # positive associations only
indices = (pair_rows[order], pair_cols[order])
for idx in zip(*indices):
    print(f"{phenotype_cols[idx[0]]} - {phenotype_cols[idx[1]]}: {ppmi_matrix[idx]:.2f}")

In [None]:
# 3D PPMI
#
# Positive pointwise mutual information for every ordered triple of phenotypes:
#   PPMI(x, y, z) = max(0, ln(P(x, y, z) / (P(x) * P(y) * P(z))))
# Uses `df`, `phenotype_cols` and `n_phenotypes` from the data-loading cell.

total_patients = len(df)
if total_patients == 0:
    # Every probability below divides by the cohort size.
    raise ValueError("Cannot compute PPMI: the phenotype dataframe has no rows.")

# patients x phenotypes indicator matrix; a phenotype counts only when its flag
# equals 1 (missing values are treated as "not present", as a skipped NA would be)
has_phenotype = (df[phenotype_cols] == 1).fillna(False).to_numpy(dtype=float)

# 3d co-occurrence counts: entry [i, j, k] = patients with all three conditions.
# For each k, zero out patients without phenotype k and take pairwise counts of
# what remains; this replaces one full-column scan per (i, j, k) triple.
cooccurrence_3d = np.zeros((n_phenotypes, n_phenotypes, n_phenotypes))
for k in range(n_phenotypes):
    with_k = has_phenotype * has_phenotype[:, [k]]
    cooccurrence_3d[:, :, k] = with_k.T @ has_phenotype

# ppmi - individual probabilities
prevalence = has_phenotype.sum(axis=0) / total_patients
individual_probs = dict(zip(phenotype_cols, prevalence))

# ppmi - pair probabilities (not used below; retained because they were marked
# as needed for expected probabilities)
pair_counts = has_phenotype.T @ has_phenotype
pair_probs = {
    (pheno1, pheno2): pair_counts[i, j] / total_patients
    for i, pheno1 in enumerate(phenotype_cols)
    for j, pheno2 in enumerate(phenotype_cols)
}

# 3D array for ppmi
epsilon = 1e-10  # smoothing to avoid log(0)
p_xyz = cooccurrence_3d / total_patients
# expected probability under independence, broadcast to [i, j, k] = p_i * p_j * p_k
expected_prob = (
    prevalence[:, None, None] * prevalence[None, :, None] * prevalence[None, None, :]
)
ppmi_3d = np.maximum(np.log((p_xyz + epsilon) / (expected_prob + epsilon)), 0.0)

# top unique combinations: i < j < k visits each unordered triple of distinct
# conditions once; only positive associations are kept
top_relationships = [
    {
        'conditions': (phenotype_cols[i], phenotype_cols[j], phenotype_cols[k]),
        'ppmi': ppmi_3d[i, j, k],
        'count': cooccurrence_3d[i, j, k]
    }
    for i in range(n_phenotypes)
    for j in range(i + 1, n_phenotypes)
    for k in range(j + 1, n_phenotypes)
    if ppmi_3d[i, j, k] > 0
]
top_relationships.sort(key=lambda x: x['ppmi'], reverse=True)

# PPMI df: one row per ordered (i, j, k) cell of the full tensor
relationships_data = [
    {
        'condition1': phenotype_cols[i],
        'condition2': phenotype_cols[j],
        'condition3': phenotype_cols[k],
        'ppmi': ppmi_3d[i, j, k],
        'count': cooccurrence_3d[i, j, k]
    }
    for i in range(n_phenotypes)
    for j in range(n_phenotypes)
    for k in range(n_phenotypes)
]
relationships_df = pd.DataFrame(relationships_data)
relationships_df.to_csv('ppmi_3d.csv', index=False)

# top relationships
print("Top 3D phenotype associations (by PPMI):")
for rel in top_relationships[:20]:
    print(f"{rel['conditions']}: PPMI={rel['ppmi']:.2f}, Count={rel['count']:.0f}")

