In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Tabular data update
Comparing the tabular versions ukb46307, ukb49190, ukb50467, and ukb51581, **ukb46307** (year: 2020) still has the most subjects.

In [9]:
## Tabular Data
# Every input/output location below is derived from `data_dir` and `ukbb_tab_version`,
# so switching machine or tabular release only requires touching those two values.
project_dir = "../"

# Root of the UKBB data tree. Override it by setting the UKBB_DATA_DIR environment
# variable; otherwise the original location is used. Joining with "" guarantees a
# trailing separator, which the f-strings below rely on.
data_dir = os.path.join(
    os.environ.get("UKBB_DATA_DIR") or "/home/nikhil/projects/brain_changes/data/ukbb/",
    "",
)

# NOTE(review): the markdown note above says ukb46307 has the most subjects, yet
# ukb49190 is selected here - confirm which version is intended.
ukbb_tab_version = "ukb49190"  # older version: "ukb46307"
ukbb_tab_csv = f"{data_dir}tabular/{ukbb_tab_version}.csv"
ukbb_demograph_csv = f"{data_dir}tabular/{ukbb_tab_version}_demographics.csv"
ukbb_ses2and3_demograph_csv = f"{data_dir}tabular/{ukbb_tab_version}_ses2and3_demographics.csv"

# Neurohub list
neurohub_ses2_subject_ids = f"{data_dir}subject_ids/ukbb_brain-age_ses-2_subject_ids.txt"
neurohub_ses3_subject_ids = f"{data_dir}subject_ids/ukbb_brain-age_ses-3_subject_ids.txt"

# derivatives
ukbb_freesurfer_csv = f"{data_dir}derivatives/freesurfer_agg/{ukbb_tab_version}_DKT_aseg.csv"
ukbb_fmriprep_subject_ids = f"{data_dir}subject_ids/ukbb_fmriprep_subject_ids.txt"


## Select a few columns (the entire CSV cannot be read into memory)

In [None]:
# Build (or reload) the demographics table restricted to a handful of columns.
#
# Flags:
#   read_neurohub_manifest        - True: read the selected columns from the full tabular CSV;
#                                   False: reload the previously saved follow-up subset.
#   save_ukbb_demograph           - write the column-selected table (before follow-up filtering).
#   select_subjects_with_followup - keep only subjects with a T1 entry for both ses2 and ses3.
#   save_ukbb_followup_subset     - write the follow-up subset so later runs can skip the big CSV.
#
# Whichever branch runs, `ukbb_tab_followup_df` is defined afterwards with `eid` as a
# regular column, because later cells read that name.
read_neurohub_manifest = True
save_ukbb_demograph = False
select_subjects_with_followup = True
save_ukbb_followup_subset = True

if read_neurohub_manifest:
    # Tabular column code -> readable column name.
    column_codes = {
        "eid":"eid", 
        "31-0.0":"sex",
        "34-0.0":"birth_year",
        "21022-0.0": "age_at_recruitment",
        "21003-2.0": "age_at_ses2",
        "21003-3.0": "age_at_ses3",
        "21000-0.0":"ethnicity", 
        "20252-2.0":"T1-ses2",
        "20252-3.0":"T1-ses3",
        "53-2.0":"imaging date-ses2",
        "53-3.0":"imaging date-ses3"}

    # Only the listed columns are parsed; the full CSV does not fit in memory.
    ukbb_tab_df = pd.read_csv(ukbb_tab_csv, index_col=["eid"], usecols=list(column_codes))

    ukbb_tab_df = ukbb_tab_df.rename(columns=column_codes)

    if save_ukbb_demograph:
        ukbb_tab_df.to_csv(ukbb_demograph_csv)

    if select_subjects_with_followup:
        # Make sure subjects have imaging data from ses2 and ses3
        has_both_sessions = ukbb_tab_df["T1-ses2"].notna() & ukbb_tab_df["T1-ses3"].notna()
        ukbb_tab_df = ukbb_tab_df[has_both_sessions]

        print(f'Number of available follow up subjects: {len(ukbb_tab_df)}')

    ## Save ukbb follow-up data csv to save time reading entire tabular data csv
    # Only written when the follow-up filter was applied; otherwise the unfiltered
    # table would be stored under the ses2and3 file name and reloaded as "follow-up" data.
    if save_ukbb_followup_subset and select_subjects_with_followup:
        ukbb_tab_df.to_csv(ukbb_ses2and3_demograph_csv)

    # Same layout as the reload branch below: the saved CSV stores `eid` as its first
    # column and is read back without an index column.
    ukbb_tab_followup_df = ukbb_tab_df.reset_index()

else:
    ukbb_tab_followup_df = pd.read_csv(ukbb_ses2and3_demograph_csv)


In [16]:
# Subject IDs listed for each session (header-less text files, first column holds the ID).
neurohub_ses2_ids = pd.read_csv(neurohub_ses2_subject_ids, header=None).iloc[:, 0].to_numpy()
neurohub_ses3_ids = pd.read_csv(neurohub_ses3_subject_ids, header=None).iloc[:, 0].to_numpy()

# Subjects present in both session lists.
neurohub_ses2and3_ids = set(neurohub_ses2_ids).intersection(neurohub_ses3_ids)

print(f"Sample sizes, ses2: {len(set(neurohub_ses2_ids))}, ses3: {len(set(neurohub_ses3_ids))}, ses2and3: {len(neurohub_ses2and3_ids)}")

# Currently processed fmriprep subject ids:
# each entry is split on "-" and the second piece is taken as the integer ID.
fmriprep_ids = [
    int(label.split("-")[1])
    for label in pd.read_csv(ukbb_fmriprep_subject_ids, header=None).iloc[:, 0].to_numpy()
]

# In both session lists but not yet run through fmriprep, and the reverse.
ids_to_be_processed = neurohub_ses2and3_ids.difference(fmriprep_ids)
ids_extra = set(fmriprep_ids).difference(neurohub_ses2and3_ids)

print(f"N, fmriprep_ids: {len(fmriprep_ids)}, ids_to_be_processed: {len(ids_to_be_processed)}, ids_extra: {len(ids_extra)}")


Sample sizes, ses2: 3208, ses3: 3208, ses2and3: 3208
N, fmriprep_ids: 1427, ids_to_be_processed: 1781, ids_extra: 0


In [19]:
# Disabled export: uncomment to write `ids_to_be_processed` to a text file, one ID per line.
# ids_to_be_processed = list(ids_to_be_processed)
# ids_to_be_processed_file = f"{data_dir}subject_ids/ukbb_fmriprep_to_process_subject_ids.txt"

# with open(ids_to_be_processed_file, 'w') as f:
#     for line in ids_to_be_processed:
#         f.write(f"{line}\n")

## Plots

In [None]:
# Count plot of age at ses2, coloured by the whole-number age gap between ses2 and ses3.
# Works on a copy so the follow-up table itself is left untouched.
plot_df = ukbb_tab_followup_df.copy()
plot_df["age_at_ses2"] = plot_df["age_at_ses2"].astype(int)
plot_df["age_at_ses3"] = plot_df["age_at_ses3"].astype(int)
plot_df["age_diff"] = plot_df["age_at_ses3"] - plot_df["age_at_ses2"]
palette = 'husl'
sns.set(font_scale=1.5)
# The style name must be one of seaborn's presets; stray text inside the literal
# previously made this call raise ValueError.
with sns.axes_style("whitegrid"):
    g = sns.catplot(x="age_at_ses2", hue="age_diff", kind='count',
    aspect=4, height=5, palette = palette, data=plot_df)


In [None]:
# Number of subjects for each ses3 - ses2 age difference.
plot_df["age_diff"].value_counts()