Spark initialization and getting access to dataset

In [None]:
import pyspark
import dxpy
import dxdata
!which java

In [None]:
# spark initialization
# getOrCreate() returns the already-running context when there is one, so this
# cell can be re-executed without the "multiple SparkContexts" error that a
# second pyspark.SparkContext() call raises.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

# find dispensed database name and dataset id
# (first database object in the project root whose name matches 'app*')
dispensed_database = dxpy.find_one_data_object(
    classname='database', 
    name='app*', 
    folder='/', 
    name_mode='glob', 
    describe=True)
dispensed_database_name = dispensed_database['describe']['name']

# first object of type 'Dataset' in the project root matching 'app*.dataset'
dispensed_dataset = dxpy.find_one_data_object(
    typename='Dataset', 
    name='app*.dataset', 
    folder='/', 
    name_mode='glob')
dispensed_dataset_id = dispensed_dataset['id']

# load the dataset and keep a handle on its 'participant' entity, which the
# helper functions and retrieval cells below query
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset['participant']

Some helper functions to get field IDs

In [None]:
def _natural_sort_key(name):
    """Sort key that orders embedded integers numerically.

    'p4803_i2' sorts before 'p4803_i10'. This replaces
    distutils.version.LooseVersion, which is deprecated (PEP 632) and no
    longer ships with Python 3.12+.
    """
    import re
    # re.split with a capturing group alternates text / digits, so every odd
    # position is a digit run; keys therefore always compare str-with-str and
    # int-with-int.
    parts = re.split(r'(\d+)', name)
    return [int(tok) if pos % 2 else tok for pos, tok in enumerate(parts)]


def fields_for_id(field_id):
    """Return the participant fields for a field ID, naturally sorted by name.

    Matches names of the form p<field_id>, optionally followed by an
    instance suffix (_i<N>) and/or an array suffix (_a<N>).
    """
    field_id = str(field_id)
    fields = participant.find_fields(name_regex=r'^p{}(_i\d+)?(_a\d+)?$'.format(field_id))
    return sorted(fields, key=lambda f: _natural_sort_key(f.name))

def field_names_for_id(field_id):
    """Return the names of the fields found by fields_for_id."""
    return [f.name for f in fields_for_id(field_id)]

def fields_by_title_keyword(keyword):
    """Return fields whose title contains `keyword` (case-insensitive), naturally sorted by name."""
    needle = keyword.lower()  # lower-case once instead of once per field
    fields = list(participant.find_fields(lambda f: needle in f.title.lower()))
    return sorted(fields, key=lambda f: _natural_sort_key(f.name))

def field_names_by_title_keyword(keyword):
    """Return the names of the fields found by fields_by_title_keyword."""
    return [f.name for f in fields_by_title_keyword(keyword)]

def field_titles_by_title_keyword(keyword):
    """Return the titles of the fields found by fields_by_title_keyword."""
    return [f.title for f in fields_by_title_keyword(keyword)]

Retrieve the selected field IDs from the dataset

In [None]:
match_variables = [
                    'sex_f31_0_0', 'age_when_attended_assessment_centre_f21003_0_0',
                    'uk_biobank_assessment_centre_f54_2_0', 'uk_biobank_assessment_centre_f54_3_0',
                    'loud_music_exposure_frequency_f4836_0_0',
                    'noisy_workplace_f4825_0_0', 'cochlear_implant_f4792_0_0',
                    'hearing_aid_user_f3393_0_0', 'hearing_difficultyproblems_with_background_noise_f2257_0_0',
                    'handedness_chiralitylaterality_f1707_0_0', 'alcohol_intake_frequency_f1558_0_0',
                    'average_total_household_income_before_tax_f738_0_0',
                    'speechreceptionthreshold_srt_estimate_left_f20019_0_0',
                    'speechreceptionthreshold_srt_estimate_right_f20021_0_0'
                    ]

tinnitus_field_ids = ['p4803_i0', 'p4803_i2', 'p4814_i0', 'p28625'] # added extra tinnitus

mr_variables = ['p26501_i2', 'p26501_i3'] # to see if they exist

demographic_field_ids = [
                        'eid',
                        'p31', 'p21003_i0',
                        'p54_i2', 'p54_i3',
                        'p4836_i0',
                        'p4825_i0', 'p4792_i0',
                        'p3393_i0', 'p2257_i0',
                        'p1707_i0', 'p1558_i0',
                        'p738_i0',
                        'p20019_i0', 'p20021_i0'
                        ] + \
                        tinnitus_field_ids + \
                        mr_variables

df = participant.retrieve_fields(
        names=demographic_field_ids,
        engine=dxdata.connect()
    )
df.toPandas().to_csv('ukb_demographics.csv', index=False)
!dx upload ukb_demographics.csv --dest / 

Assuming you have the ukb_demographics.csv, let's prepare columns to be ready for matching

In [None]:
import re
import pandas as pd

## read file
df = pd.read_csv("ukb_demographics.csv")

## map column names
match_variables = [
                    'sex_f31_0_0', 'age_when_attended_assessment_centre_f21003_0_0',
                    'uk_biobank_assessment_centre_f54_2_0', 'uk_biobank_assessment_centre_f54_3_0',
                    'loud_music_exposure_frequency_f4836_0_0',
                    'noisy_workplace_f4825_0_0', 'cochlear_implant_f4792_0_0',
                    'hearing_aid_user_f3393_0_0', 'hearing_difficultyproblems_with_background_noise_f2257_0_0',
                    'handedness_chiralitylaterality_f1707_0_0', 'alcohol_intake_frequency_f1558_0_0',
                    'average_total_household_income_before_tax_f738_0_0',
                    'speechreceptionthreshold_srt_estimate_left_f20019_0_0',
                    'speechreceptionthreshold_srt_estimate_right_f20021_0_0'
                    ]
tinnitus_field_names = ['tin_status_1', 'tin_status_2', 'tin_severity', 'tin_duration']
# the first four match variables get short hand-written names; the rest keep
# their descriptive name with the trailing "_f<id>_<i>_<a>" part stripped
new_col_names = ["subject_id", "sex", "age", "center_v2", "center_v3"] + \
            [re.split(r'_f\d+', s)[0] for s in match_variables[4:]] + \
            tinnitus_field_names + \
            ["Mean_intensity_v2", "Mean_intensity_v3"]

# The rename is purely positional. zip() would silently truncate on a length
# mismatch and mislabel columns, so fail loudly instead.
if len(new_col_names) != len(df.columns):
    raise ValueError(
        f"expected {len(new_col_names)} columns in ukb_demographics.csv, "
        f"found {len(df.columns)}"
    )

mapping = dict(zip(
                list(df.columns),
                new_col_names
                ))
df = df.rename(columns=mapping)

## fixing tinnitus status
cols = ["tin_status_1", "tin_status_2"]
mapping = {0: 0, 11: 1}
# DataFrame.map only accepts a callable, so a dict must be applied per column
# through Series.map. Codes other than 0 and 11 become NaN.
df[cols] = df[cols].apply(lambda s: s.map(mapping))
# keep subjects with a status on at least one visit ...
df = df.dropna(subset=cols, how="all")
# ... and, where both visits are present, only those whose status agrees.
# .copy() so the assignments below act on an independent frame, not a slice.
df = df[
    (df["tin_status_1"] == df["tin_status_2"]) |
    (df["tin_status_1"].isna()) |
    (df["tin_status_2"].isna())
].copy()

## HL fix
col = 'hearing_difficultyproblems_with_background_noise'
# keep only the 0/1 answers; any other code becomes NaN and the row is dropped
df[col] = df[col].map({0: 0, 1: 1})
df = df.dropna(subset=[col])

## helper function to clean the df
def replace_and_fill_mode(df, column, to_replace=None):
    """
    Replace specific values with NaN and fill missing values with the mode.

    The column is modified on `df` itself; nothing is returned.

    Parameters:
        df: pandas DataFrame
        column: column name as string
        to_replace: list of values to replace with NaN (default ['Prefer not to answer', 'Do not know'])

    If the column has no non-missing value left (so there is no mode), the
    missing values are kept instead of raising.
    """
    if to_replace is None:
        to_replace = ['Prefer not to answer', 'Do not know']

    # NOTE(review): the defaults are answer labels. If the column holds raw
    # numeric codes, nothing is replaced here — confirm the export coding.

    # Assign the result back instead of calling replace/fillna with
    # inplace=True on df[column]: that chained form acts on a temporary and is
    # not guaranteed to update df (it does not under pandas Copy-on-Write).
    cleaned = df[column].replace(to_replace, pd.NA)
    modes = cleaned.mode()
    if not modes.empty:
        cleaned = cleaned.fillna(modes.iloc[0])
    df[column] = cleaned

# apply the helper function on these columns
replace_fill_cols = {
    'handedness_chiralitylaterality': ['Prefer not to answer'],
    'average_total_household_income_before_tax': ['Prefer not to answer', 'Do not know'],
    'loud_music_exposure_frequency': ['Prefer not to answer', 'Do not know'],
    'noisy_workplace': ['Prefer not to answer', 'Do not know'],
    'cochlear_implant': ['Prefer not to answer'],
    'hearing_aid_user': ['Prefer not to answer'],
    'hearing_difficultyproblems_with_background_noise': ['Prefer not to answer', 'Do not know'],
    'alcohol_intake_frequency': ['Prefer not to answer']
}

for col, values in replace_fill_cols.items():
    replace_and_fill_mode(df, col, to_replace=values)

# drop rows with missing critical data
df.dropna(subset=[
    'speechreceptionthreshold_srt_estimate_left',
    'speechreceptionthreshold_srt_estimate_right'
    ], how='any', inplace=True)

## get only subjects that have MRI at least in one visit
df.dropna(subset=["Mean_intensity_v2", "Mean_intensity_v3"], how='all', inplace=True)

## save
df.to_csv("ukb_demographics_ready_for_matching.csv", index=False)
!dx upload ukb_demographics_ready_for_matching.csv --dest / 

Matching is done in R (outside this notebook); the cell below plots the age distribution of the matched sample

In [None]:
## plotting distribution
%matplotlib qt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df_matched = pd.read_csv("../data/ukb_demographics_matched.csv")

# Take an explicit copy: relabelling columns on a bare column selection of
# df_matched triggers pandas' SettingWithCopyWarning.
df_plot = df_matched[["age", "sex", "tin_status"]].copy()

# human-readable group labels for the legends
df_plot["sex"] = df_plot["sex"].map({0: "Female", 1: "Male"})
df_plot["tin_status"] = df_plot["tin_status"].map({0: "Control", 1: "Tinnitus"})

# two-colour palette, reused for both grouping variables
pal = [
        sns.cubehelix_palette(3, rot=-.2, light=.7).as_hex()[1],
        sns.color_palette("ch:s=-.2,r=.3", as_cmap=False).as_hex()[2]
]

xlim = [5, 80]
bw_adjust = 1
hues = ["tin_status", "sex"]
pals = []
# one figure per grouping variable: age density split by tinnitus status, then by sex
for hue in hues:
        g = sns.FacetGrid(
                df_plot, hue=hue, aspect=3.5, height=1.6,
                palette=pal, xlim=xlim
        )

        # filled density first, then a white outline drawn on top of it
        g.map(sns.kdeplot, "age", bw_adjust=bw_adjust, clip_on=False, clip=xlim,
                fill=True, alpha=0.6, linewidth=1.5)
        g.map(sns.kdeplot, "age", clip_on=False, color="w", clip=xlim,
                lw=1.5, bw_adjust=bw_adjust)
        g.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False)

        g.figure.subplots_adjust(hspace=.15, top=0.72)
        g.set_titles("")
        g.add_legend(title="")
        g.set(yticks=[], ylabel="", xlabel=r"age")
        g.despine(bottom=True, left=True)
        # g.figure.savefig(saving_dir / f"{hue}_distribution.pdf", 
        #                 format="pdf",       
        #                 dpi=300,            
        #                 bbox_inches="tight"
        #                 )

Work on matched dataframe and get full MRI data from UKB

In [None]:
import pandas as pd

## load the data
df_matched = pd.read_csv("ukb_demographics_matched.csv")

# matched subject ids, and the same ids as a comma-separated list for the
# SQL "IN (...)" filters used below
subject_ids = df_matched["subject_id"].tolist()
ids_sql = ",".join(str(subject_id) for subject_id in subject_ids)

def get_field_ids(keyword):
    """Return the names of all participant fields whose title contains `keyword`.

    The names are taken directly from the matching fields. The previous
    version searched again by each field's title and parsed the name out of
    the field's repr string. That was quadratic, and it could return the wrong
    field whenever one title was a substring of another.
    """
    return field_names_by_title_keyword(keyword)

vol_field_ids = ['eid'] + get_field_ids('Volume of')
thickness_field_ids = ['eid'] + get_field_ids('Mean thickness of')
area_field_ids = ['eid'] + get_field_ids('Area of')

print(f"********* getting volume information ***********")
df = participant.retrieve_fields(
    names=vol_field_ids,
    engine=dxdata.connect()
)

df_subset = df.filter(f"eid IN ({ids_sql})")
df_subset.toPandas().to_csv('ukb_vol.csv', index=False)
!dx upload ukb_vol.csv --dest / 

print(f"********* getting thickness information ***********")
df = participant.retrieve_fields(
    names=thickness_field_ids,
    engine=dxdata.connect()
)

df_subset = df.filter(f"eid IN ({ids_sql})")
df_subset.toPandas().to_csv('ukb_thickness.csv', index=False)
!dx upload ukb_thickness.csv --dest / 

print(f"********* getting area information ***********")
df = participant.retrieve_fields(
    names=area_field_ids,
    engine=dxdata.connect()
)

df_subset = df.filter(f"eid IN ({ids_sql})")
df_subset.toPandas().to_csv('ukb_area.csv', index=False)
!dx upload ukb_area.csv --dest / 


Assuming you have the matched UKB file, now run the harmonization

In [None]:
import pandas as pd
from neuroHarmonize import harmonizationLearn, harmonizationApply

In [None]:
## read the dataframes
df_covar = pd.read_csv("../data/ukb_demographics_matched.csv")
df_features = pd.read_csv("../data/ukb_vol.csv")
df_map = pd.read_html("../data/fs_derivatives.html")[1]

## organize df_map
# drop the first row, then turn UDIs such as "26501-2.0" into the column
# naming used in the feature file ("p26501_i2")
df_map = df_map.iloc[1:].reset_index(drop=True)
df_map["UDI"] = (
                    "p"
                    + df_map["UDI"].astype(str)
                    .str.replace("-", "_i", regex=False)
                    .str.replace(r"\.0$", "", regex=True)
                )
df_map = df_map[["UDI", "Description"]]

## keep necessary columns from features
feature_cols = df_map["UDI"].tolist()
vol_features = ["eid"] + [f for f in feature_cols if f in df_features.columns]
# select + rename in one expression: this yields a new frame, so nothing is
# modified in place on a slice of the original
df_features = df_features[vol_features].rename(columns={"eid": "subject_id"})

## clean df_covars
srt_cols = [
            "speechreceptionthreshold_srt_estimate_left",
            "speechreceptionthreshold_srt_estimate_right"
            ]
# single hearing covariate: left + right SRT estimate
df_covar["srt"] = df_covar[srt_cols].sum(axis=1)

covar_cols = ["subject_id", "sex", "age", "center_v2", "center_v3", "tin_status", "tin_severity", "tin_duration", "srt"]
# selecting covar_cols also discards the two raw SRT columns; .copy() makes
# the SITE assignment below act on an independent frame
df_covar = df_covar[covar_cols].copy()
# site = assessment centre of visit 2, falling back to visit 3
df_covar["SITE"] = df_covar["center_v2"].combine_first(df_covar["center_v3"])
df_covar = df_covar.drop(columns=["center_v2", "center_v3"])

## merge covar with features
df_covar = df_covar.merge(df_features, on="subject_id", how="left")
i2_cols = [c for c in df_covar.columns if c.endswith("_i2")]

# collapse the two imaging instances into one column per feature:
# instance 2 where available, otherwise instance 3
for c in i2_cols:
    base = c[:-3]
    c_i3 = base + "_i3"

    if c_i3 in df_covar.columns:
        df_covar[base] = df_covar[c].combine_first(df_covar[c_i3])
    else:
        # no instance-3 counterpart was exported: keep instance 2 as-is
        # instead of failing with a KeyError
        df_covar[base] = df_covar[c]

df = df_covar.drop(columns=[c for c in df_covar.columns if c.endswith(("_i2", "_i3"))])

In [None]:
## get controls and create matrixes
covar_names = ["age", "sex", "srt", "SITE"]
df_subjects = df[["subject_id", "tin_status", "tin_severity", "tin_duration"]]
df_covars = df[covar_names]
# imaging features are the columns whose name starts with "p"
feature_cols = [c for c in df.columns if c.startswith("p")]
data_matrix = df[feature_cols].to_numpy()

# the harmonization model is learned on controls only
df_controls = df.query('tin_status == 0')
controls_matrix = df_controls[feature_cols].to_numpy()
controls_covars = df_controls[covar_names]

## run harmonizing and get model and apply on all
hm_model, _ = harmonizationLearn(controls_matrix, controls_covars)
my_data_adj = harmonizationApply(data_matrix, df_covars, hm_model)

## return back to dataframe
# give the adjusted matrix df's index so the column-wise concat lines rows up
# by construction rather than relying on df having a default RangeIndex
df_hm = pd.concat([
                        df_subjects,
                        df_covars,
                        pd.DataFrame(my_data_adj, columns=feature_cols, index=df.index)
                        ],
                        axis=1
                        )
# index=False: otherwise the index is written as an extra unnamed first
# column, which shifts every position-based column selection on the file
df_hm.to_csv("../data/ukb_vol_harmonized.csv", index=False)

Statistical comparison with different atlases

In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests

In [None]:
## read harmonized df
df_hm = pd.read_csv('../data/ukb_vol_harmonized.csv')
# A CSV written together with its index is read back with a leading
# "Unnamed: 0" column. Drop it so the frame starts at subject_id and the
# position-based column selections in the following cells are not shifted.
df_hm = df_hm.loc[:, ~df_hm.columns.str.match(r"Unnamed: \d+$")]

## analysis settings
atlas_name = "amygdalar_nuclei"      # key into FS_RANGES / ATLAS_RULES
covariates = ["age", "sex", "srt"]   # nuisance terms of the group model
correction = "bonferroni"            # multiple-comparison method
alpha = 0.05                         # significance level

Define the FreeSurfer (FS) UKB field-ID ranges and the per-atlas rules

In [None]:
## ranges from UKB documentation
# atlas name -> hemisphere -> (first field id, last field id), both inclusive
FS_RANGES = {
    "aparc_volume": {
        "lh": (27205, 27235),
        "rh": (27298, 27328),
    },
    "aparc_2009_volume": {
        "lh": (27477, 27550),
        "rh": (27699, 27772),
    },
    "amygdalar_nuclei": {
        "lh": (26600, 26609),
        "rh": (26610, 26619),
    },
    "brainstem": {
        "both": (26716, 26720),
    },
    "hippo_subfields": {
        "lh": (26620, 26641),
        "rh": (26642, 26663),
    },
    "aseg": {
        "lh": (26554, 26567),
        "rh": (26585, 26598),
    },
    "thalamic_nuclei": {
        "lh": (26664, 26687),
        "rh": (26688, 26715),
    },
    "aparc_thickness": {
        "lh": (26756, 26788),
        "rh": (26857, 26889),
    },
    "aparc_area": {
        "lh": (26722, 26754),
        "rh": (26823, 26855),
    },
    "aparc_2009s_thickness": {
        "lh": (27403, 27476),
        "rh": (27625, 27698),
    },
    "aparc_2009s_area": {
        "lh": (27329, 27402),
        # Was (27625, 27698), a copy of the right-hemisphere *thickness*
        # range. The a2009s blocks are contiguous runs of 74 fields
        # (area/thickness/volume lh, then area/thickness/volume rh), which
        # puts right-hemisphere area here — TODO confirm against the UKB
        # showcase.
        "rh": (27551, 27624),
    },
}

def _whole_structure_pair(structure):
    """Column names of the left/right whole-structure volume for `structure`."""
    return [
        f"Volume_of_Whole_{structure}_{side}_hemisphere"
        for side in ("left", "right")
    ]


# Per-atlas post-processing of the renamed columns:
#   "sum":  new column name -> source columns that are added up and then removed
#   "drop": columns removed without replacement
ATLAS_RULES = {
    "thalamic_nuclei": {
        "sum": {"Volume_of_Whole_thalamus": _whole_structure_pair("thalamus")},
    },
    "amygdalar_nuclei": {
        "sum": {"Volume_of_Whole_amygdala": _whole_structure_pair("amygdala")},
    },
    "hippo_subfields": {
        "sum": {"Volume_of_Whole_hippocampus": _whole_structure_pair("hippocampus")},
        "drop": [
            f"Volume_of_{part}_{side}_hemisphere"
            for side in ("left", "right")
            for part in (
                "Whole_hippocampal_body",
                "Whole_hippocampal_head",
                "Hippocampal_tail",
            )
        ],
    },
}

In [None]:
## define a function to select necessary columns
def pcols_from_ranges(ranges):
    """Expand inclusive (first, last) field-id ranges into "p<id>" column names.

    `ranges` maps a label to a (first, last) pair. The result lists every id
    of every range, in the mapping's iteration order.
    """
    return [
        f"p{field_id}"
        for bounds in ranges.values()
        for field_id in range(bounds[0], bounds[1] + 1)
    ]

# Subject / covariate columns shared by every atlas table, selected by name.
# Taking the first seven columns by position picks up a stray index column
# ("Unnamed: 0") when the harmonized CSV was written with its index, which
# pushes "srt" out of the selection and breaks the covariate model.
common_cols = [
    "subject_id", "tin_status", "tin_severity", "tin_duration",
    "age", "sex", "srt",
]

## get the atlas
sel_cols = pcols_from_ranges(FS_RANGES[atlas_name])
# .copy(): the table is renamed and extended below, so it must not be a view
# of df_hm
df_atlas = df_hm[common_cols + sel_cols].copy()

## map names in the UKB mapping to atlas columns
df_map = pd.read_html("../data/fs_derivatives.html")[1]
df_map = df_map.iloc[1:].reset_index(drop=True)
df_map["UDI"] = (
                    "p"
                    + df_map["UDI"].astype(str)
                    .str.replace("-", "_i", regex=False)
                    .str.replace(r"\.0$", "", regex=True)
                )
df_map = df_map[["UDI", "Description"]]
# descriptions are looked up through the instance-2 entries, with the "_i2"
# suffix removed so the keys match the merged feature columns
df_map = df_map[df_map["UDI"].str.endswith("_i2", na=False)].copy()
df_map["UDI"] = df_map["UDI"].str.replace("_i2$", "", regex=True)
rename_map = dict(zip(df_map["UDI"], df_map["Description"]))
df_atlas = df_atlas.rename(columns=rename_map)
# make the descriptions usable as identifiers: space, "-" and "+" become "_",
# parentheses are removed
df_atlas.columns = (
                    df_atlas.columns
                    .str.replace(r"[ \-\+]", "_", regex=True)
                    .str.replace(r"[()]", "", regex=True)
                    )

## some name adjustments for special atlases
rules = ATLAS_RULES.get(atlas_name, {})

# replace each listed group of columns by their row-wise sum
for new_col, cols in rules.get("sum", {}).items():
    df_atlas[new_col] = df_atlas[cols].sum(axis=1)
    df_atlas = df_atlas.drop(columns=cols)

if "drop" in rules:
    df_atlas = df_atlas.drop(columns=rules["drop"])

# NOTE(review): for every atlas except "aparc_2009_volume" the last column is
# left out of the tested labels. That matches the appended "sum" column of the
# atlases with rules; for atlases without rules it drops a regular column —
# confirm this is intended.
end = None if atlas_name == "aparc_2009_volume" else -1
bl_cols = list(df_atlas.columns[len(common_cols):end])

In [None]:
## function to run ANOVA + multiple comparison
def mass_ancova(
    df,
    outcome_cols,
    group_col,
    covariates,
    correction="fdr_bh",
    alpha=0.05,
):
    """Fit `outcome ~ group + covariates` by OLS for every outcome column.

    Parameters:
        df: DataFrame holding outcomes, group column and covariates
        outcome_cols: names of the outcome columns, one model each
        group_col: group column; the counts and means below assume it is
            coded numerically as 0 (control) / 1 (tinnitus)
        covariates: covariate column names (may be empty)
        correction: method passed to statsmodels' multipletests
        alpha: significance level for the corrected p-values

    Returns:
        DataFrame with one row per outcome: beta, t and p of the group term,
        an effect size derived from t, group sizes and raw group means,
        `pval_adj` and a boolean `significant`. Outcomes whose model could not
        be fitted get NaN statistics and the message in an `error` column.
    """
    results = []
    # right-hand side is identical for all outcomes; joining it as a list
    # also copes with an empty covariate list
    rhs = " + ".join([group_col, *covariates])

    for col in outcome_cols:
        # Q("...") quotes the outcome so column names that are not valid
        # Python identifiers still parse in the formula
        formula = f'Q("{col}") ~ {rhs}'

        try:
            model = smf.ols(formula, data=df).fit()

            beta = model.params[group_col]
            pval = model.pvalues[group_col]
            tval = model.tvalues[group_col]

            # NOTE(review): counts and means use all rows of df, including
            # rows the model drops because of missing values.
            n1 = (df[group_col] == 1).sum()
            n0 = (df[group_col] == 0).sum()

            mean_tinnitus = df.loc[df[group_col] == 1, col].mean()
            mean_control = df.loc[df[group_col] == 0, col].mean()

            # effect size approximated from the t statistic of the group term
            cohen_d = tval * np.sqrt(1 / n1 + 1 / n0)

            results.append({
                "brain_label": col,
                "beta": beta,
                "t": tval,
                "pval": pval,
                "cohen_d": cohen_d,
                "n_tinnitus": n1,
                "n_control": n0,
                "mean_tinnitus": mean_tinnitus,
                "mean_control": mean_control,
            })

        except Exception as e:
            # best effort: one failing outcome must not abort the whole scan;
            # the reason is kept in the "error" column
            results.append({
                "brain_label": col,
                "beta": np.nan,
                "t": np.nan,
                "pval": np.nan,
                "cohen_d": np.nan,
                "n_tinnitus": np.nan,
                "n_control": np.nan,
                "mean_tinnitus": np.nan,
                "mean_control": np.nan,
                "error": str(e),
            })

    res = pd.DataFrame(results)
    if res.empty:
        # no outcomes given: still return the documented columns
        res = pd.DataFrame(columns=[
            "brain_label", "beta", "t", "pval", "cohen_d",
            "n_tinnitus", "n_control", "mean_tinnitus", "mean_control",
        ])

    # multiple testing correction
    res["pval_adj"] = np.nan
    mask = res["pval"].notna()

    # multipletests cannot handle an empty p-value vector, so only correct
    # when at least one model was fitted
    if mask.any():
        _, pval_adj, _, _ = multipletests(
            res.loc[mask, "pval"],
            alpha=alpha,
            method=correction,
        )
        res.loc[mask, "pval_adj"] = pval_adj

    res["significant"] = res["pval_adj"] < alpha

    return res

df_results = mass_ancova(
                        df_atlas,
                        bl_cols,
                        group_col="tin_status",
                        covariates=covariates,
                        correction=correction,
                        alpha=alpha,   # use the configured level, not a second literal
                    )
df_results = df_results.sort_values(by="pval")
df_sig = df_results[df_results["significant"]]

# Label the groups for plotting. Mapping the numeric codes a second time would
# turn every value into NaN, so skip the step when the column is already
# labelled (e.g. when this cell is re-run). The model above needs the numeric
# coding: re-create df_atlas before running this cell again.
status_labels = {1: "Tinnitus", 0: "Control"}
if not df_atlas["tin_status"].isin(list(status_labels.values())).any():
    df_atlas["tin_status"] = df_atlas["tin_status"].map(status_labels)

Plotting significant results

In [None]:
%matplotlib qt
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import pearsonr

In [None]:
def plot_sig_rois(df, roi):
    """Draw a horizontal box plot with overlaid points of `roi` per group.

    Parameters:
        df: DataFrame with a "tin_status" column holding "Control" /
            "Tinnitus" and a column named `roi`
        roi: name of the column to plot (also used as the title)

    Returns:
        (fig, ax) of the created figure, so callers can save or close it.
    """
    pal = ['#1f77b4', '#d62728']
    order = ["Control", "Tinnitus"]

    fig, ax = plt.subplots(1, 1, figsize=(6, 2.3), layout="tight")

    # seaborn 0.13 deprecates `palette` without `hue`: assign the grouping
    # variable to hue as well and switch off seaborn's automatic legend
    # (a custom one is built below).
    sns.boxplot(
        data=df,
        y="tin_status",
        x=roi,
        hue="tin_status",
        hue_order=order,
        palette=pal,
        legend=False,
        width=0.5,
        linewidth=1.8,
        fill=False,
        order=order,
        showfliers=False,
        ax=ax
    )

    sns.stripplot(
        data=df,
        y="tin_status",
        x=roi,
        hue="tin_status",
        hue_order=order,
        palette=pal,
        legend=False,
        linewidth=0,
        size=3.5,
        edgecolor=None,
        jitter=0.25,
        alpha=0.1,
        order=order,
        ax=ax
    )

    # Clean axes
    ax.spines[['top', 'left', 'right']].set_visible(False)
    ax.set(ylabel="", xlabel="", yticks=[], title=roi)

    # legend built by hand: one unfilled patch per group, in plotting order
    handles = [
        Patch(facecolor='none', edgecolor=pal[i], linewidth=2, label=order[i])
        for i in range(len(pal))
    ]
    ax.legend(
        handles=handles,
        title="Group",
        loc='upper left',
        bbox_to_anchor=(1.001, 0.99),
        borderaxespad=0,
        frameon=False
    )

    return fig, ax

# one figure per label that survived the multiple-comparison correction
rois = df_sig['brain_label'].tolist()
for roi in rois:
    plot_sig_rois(df_atlas, roi)

Plotting significant correlations (severity + duration)

In [None]:
# settings for the partial-correlation analysis
mode = "tin_duration"                        # outcome column to correlate with
correction = "fdr_bh"                        # multiple-comparison method
covariates = tuple(["age", "sex", "srt"])    # variables regressed out first

In [None]:
def mass_partial_corr(
    df,
    mode,
    feature_cols=None,
    covariates=("age", "sex", "srt"),
    correction="fdr_bh",
    alpha=0.05,
):
    """Correlate `mode` with every feature after regressing out the covariates.

    Both the outcome (`mode`) and each feature are residualized on the
    covariates by OLS; the Pearson correlation of the two residual series is
    reported.

    Parameters:
        df: DataFrame with the outcome, covariates and feature columns
        mode: name of the outcome column (e.g. "tin_severity", "tin_duration")
        feature_cols: names of the feature columns to test (required)
        covariates: covariate column names
        correction: method passed to statsmodels' multipletests
        alpha: significance level for the corrected p-values

    Returns:
        DataFrame with feature, r, pval, n, pval_adj and significant, sorted
        by pval_adj. Features with fewer than 10 usable rows get NaN.

    Raises:
        ValueError: if feature_cols is not given.
    """
    if feature_cols is None:
        raise ValueError("feature_cols must be provided")

    covariates = list(covariates)
    results = []

    # residualize the outcome once; rows with a missing outcome or covariate
    # are dropped, so the residual index can be smaller than df.index
    X = sm.add_constant(df[covariates])
    y = df[mode]
    outcome_resid = sm.OLS(y, X, missing="drop").fit().resid

    for col in feature_cols:
        tmp = df[[col] + covariates].dropna()
        # keep only rows that also have an outcome residual; looking up a row
        # whose outcome is missing would raise a KeyError
        tmp = tmp.loc[tmp.index.intersection(outcome_resid.index)]

        if tmp.shape[0] < 10:
            results.append({
                "feature": col,
                "r": np.nan,
                "pval": np.nan,
                "n": tmp.shape[0],
            })
            continue

        X_tmp = sm.add_constant(tmp[covariates])
        feat_resid = sm.OLS(tmp[col], X_tmp).fit().resid

        # NOTE(review): pearsonr's p-value does not subtract the degrees of
        # freedom spent on the covariates.
        r, p = pearsonr(outcome_resid.loc[tmp.index], feat_resid)

        results.append({
            "feature": col,
            "r": r,
            "pval": p,
            "n": tmp.shape[0],
        })

    res = pd.DataFrame(results)
    if res.empty:
        # no features given: still return the documented columns
        res = pd.DataFrame(columns=["feature", "r", "pval", "n"])

    # multiple comparison correction
    res["pval_adj"] = np.nan
    mask = res["pval"].notna()
    # multipletests cannot handle an empty p-value vector
    if mask.any():
        _, p_adj, _, _ = multipletests(
            res.loc[mask, "pval"],
            method=correction,
            alpha=alpha
        )
        res.loc[mask, "pval_adj"] = p_adj

    res["significant"] = res["pval_adj"] < alpha

    return res.sort_values("pval_adj")


# correlations are computed within the tinnitus group only
df_corr = df_atlas.query('tin_status == "Tinnitus"')
df_tis = df_corr.dropna(subset=["tin_severity"], how="any")
df_tid = df_corr.dropna(subset=["tin_duration"], how="any")

# choose the frame whose outcome column is complete for the selected mode
mode = "tin_duration"
frames_by_mode = {"tin_severity": df_tis, "tin_duration": df_tid}
if mode not in frames_by_mode:
    raise ValueError("wrong mode.")
df = frames_by_mode[mode].copy()

# everything after the seven subject/covariate columns is a feature
feature_cols = list(df.columns[7:])
df_corr_results = mass_partial_corr(
    df,
    mode=mode,
    feature_cols=feature_cols,
    covariates=covariates,
    correction=correction
)
# keep only the features that remain significant after correction
df_corr_results = df_corr_results.query('significant == True')

In [None]:
# display the partial correlations that remained significant after correction
df_corr_results

In [None]:
# TODO: check subjects whose tinnitus state changed
# TODO: maybe try adding TIV
# TODO: run the analysis on thickness and area as well