## Notebook to track UKBB subjects with two visits

### Last basket update (Nov 2022)

### Tabular data versions
- Based on comparing ukb46307, ukb49190, ukb50467, ukb51581, **ukb46307** still has the most subjects (year: 2020)

### Derivative proc status 
- Based on `fmriprep_validator.py` and `fs_validator.py`

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Data Paths


In [18]:
project_dir = "../"
data_dir = "/home/nikhil/projects/brain_changes/data/ukbb/"

# Manifest (i.e. available subjects with ses-3 imaging data)
ukbb_manifest = f"{data_dir}/subject_ids/ukbb_participant_ids_Nov2022.txt"

# Tabular data (i.e. demographics)
ukbb_tab_version = "ukb49190" #older version: "ukb46307.csv"
ukbb_tab_csv = f"{data_dir}tabular/{ukbb_tab_version}.csv" 
ukbb_demograph_csv = f"{data_dir}tabular/{ukbb_tab_version}_demographics.csv" 
ukbb_ses2and3_demograph_csv = f"{data_dir}tabular/{ukbb_tab_version}_ses2and3_demographics.csv" 

# Processed derivatives (i.e. fmriprep based spatial normalization and FreeSurfer DKT values)
ukbb_proc_status_dir = f"{data_dir}/proc_status/"
ukbb_freesurfer_agg_dir = f"{data_dir}/derivatives/freesurfer-agg/"


## Total available subjects

In [31]:
manifest_participant_ids = list(np.squeeze(pd.read_csv(ukbb_manifest).values))
print(f"Number of available participants: {len(manifest_participant_ids)}")

Number of available participants: 4588


## Select few columns (cannot read entire CSV in memory)

In [None]:
read_neurohub_manifest = True
save_ukbb_demograph = False
select_subjects_with_followup = True
save_ukbb_followup_subset = True

if read_neurohub_manifest:
    column_codes = {
        "eid":"eid", 
        "31-0.0":"sex",
        "34-0.0":"birth_year",
        "21022-0.0": "age_at_recruitment",
        "21003-2.0": "age_at_ses2",
        "21003-3.0": "age_at_ses3",
        "21000-0.0":"ethnicity", 
        "20252-2.0":"T1-ses2",
        "20252-3.0":"T1-ses3",
        "53-2.0":"imaging date-ses2",
        "53-3.0":"imaging date-ses3"}

    ukbb_tab_df = pd.read_csv(ukbb_tab_csv, index_col=["eid"], usecols=column_codes.keys())

    ukbb_tab_df = ukbb_tab_df.rename(columns=column_codes)

    if save_ukbb_demograph:
        ukbb_tab_df.to_csv(ukbb_demograph_csv)


    if select_subjects_with_followup:
        # Make sure subjects have imaging data from ses2 and ses3
        ukbb_tab_df = ukbb_tab_df[(~ukbb_tab_df["T1-ses2"].isna()) & (~ukbb_tab_df["T1-ses3"].isna())]

        print(f'Number of available follow up subjects: {len(ukbb_tab_df)}')

    ## Save ukbb follow-up data csv to save time reading entire tabular data csv 
    if save_ukbb_followup_subset: 
        ukbb_tab_df.to_csv(ukbb_ses2and3_demograph_csv)

else:
    ukbb_tab_followup_df = pd.read_csv(ukbb_ses2and3_demograph_csv)


## plots

In [None]:
plot_df = ukbb_tab_followup_df.copy()
plot_df["age_at_ses2"] = plot_df["age_at_ses2"].astype(int)
plot_df["age_at_ses3"] = plot_df["age_at_ses3"].astype(int)
plot_df["age_diff"] = plot_df["age_at_ses3"] - plot_df["age_at_ses2"]
palette = 'husl'
sns.set(font_scale=1.5)
with sns.axes_style("(sorry my genetic knowledge is very limited) whitegrid"):
    g = sns.catplot(x="age_at_ses2", hue="age_diff", kind='count',
    aspect=4, height=5, palette = palette, data=plot_df)


## Check derivative proc status

In [21]:
proc_subset_list = ["pre_ohbm","post_ohbm"]
ses_list = ["ses-2","ses-3"]

fmristatus_df = pd.DataFrame()
for proc_subset in proc_subset_list:    
    for ses in ses_list:
        fmristatus_csv = f"{ukbb_proc_status_dir}/{proc_subset}/{ses}/fmriprep_status.csv"

        df = pd.read_csv(fmristatus_csv)
        df["session"] = ses
        df["proc_subset"] = proc_subset
        fmristatus_df = fmristatus_df.append(df)

        n_MNI152NLin6Sym = len(fmristatus_df[fmristatus_df["MNI152NLin6Sym_res-1"]=="Pass"])
        n_MNI152Lin = len(fmristatus_df[fmristatus_df["MNI152Lin_res-1"]=="Pass"])
        n_fsl_MNI152NLin6Sym = len(fmristatus_df[fmristatus_df["fsl-MNI152NLin6Sym_res-1"]=="Pass"])

        print(f"proc_subset: {proc_subset}, session: {ses}")
        print(f"n_MNI152NLin6Sym: {n_MNI152NLin6Sym}, n_MNI152Lin: {n_MNI152Lin}, n_fsl_MNI152NLin6Sym:{n_fsl_MNI152NLin6Sym}")
        print("")

n_participants_ses2 = len(fmristatus_df[fmristatus_df["session"]=="ses-2"])
n_participants_ses3 = len(fmristatus_df[fmristatus_df["session"]=="ses-3"])

print(f"n_ses-2: {n_participants_ses2}, n_ses-3: {n_participants_ses3}")

fmristatus_df.head()

proc_subset: pre_ohbm, session: ses-2
n_MNI152NLin6Sym: 1426, n_MNI152Lin: 0, n_fsl_MNI152NLin6Sym:1426

proc_subset: pre_ohbm, session: ses-3
n_MNI152NLin6Sym: 2852, n_MNI152Lin: 1427, n_fsl_MNI152NLin6Sym:2852

proc_subset: post_ohbm, session: ses-2
n_MNI152NLin6Sym: 4465, n_MNI152Lin: 1427, n_fsl_MNI152NLin6Sym:4464

proc_subset: post_ohbm, session: ses-3
n_MNI152NLin6Sym: 6097, n_MNI152Lin: 3059, n_fsl_MNI152NLin6Sym:4464

n_ses-2: 3176, n_ses-3: 3061


Unnamed: 0.1,Unnamed: 0,participant_id,fmriprep_complete,MNI152NLin6Sym_res-1,MNI152Lin_res-1,fsl-MNI152NLin6Sym_res-1,session,proc_subset
0,0,sub-4240641,False,Pass,space-MNI152Lin_res-1_desc-brain_mask.json not...,Pass,ses-2,pre_ohbm
1,1,sub-1925486,False,Pass,space-MNI152Lin_res-1_desc-brain_mask.json not...,Pass,ses-2,pre_ohbm
2,2,sub-3513866,False,Pass,space-MNI152Lin_res-1_desc-brain_mask.json not...,Pass,ses-2,pre_ohbm
3,3,sub-4856915,False,Pass,space-MNI152Lin_res-1_desc-brain_mask.json not...,Pass,ses-2,pre_ohbm
4,4,sub-4773375,False,Pass,space-MNI152Lin_res-1_desc-brain_mask.json not...,Pass,ses-2,pre_ohbm


## Check freesurfer DKT status

In [22]:
proc_subset_list = ["pre_ohbm","post_ohbm"]
ses_list = ["ses-2","ses-3"]

DKT_status_df = pd.DataFrame()
for proc_subset in proc_subset_list:    
    for ses in ses_list:
        if (proc_subset == "post_ohbm") & (ses == "ses-3"): 
            ses3_batch_list = ["ohbm_train_subjects_1-800","ohbm_train_subjects_801-1749"]
            for ses3_batch in ses3_batch_list:
                DKT_csv = f"{ukbb_freesurfer_agg_dir}/{proc_subset}/{ses}/{ses3_batch}/DKTatlas_average_thickness.csv"
                df = pd.read_csv(DKT_csv)
                df["session"] = ses
                df["proc_subset"] = proc_subset
                DKT_status_df = DKT_status_df.append(df)
        else:
            DKT_csv = f"{ukbb_freesurfer_agg_dir}/{proc_subset}/{ses}/DKTatlas_average_thickness.csv"
            df = pd.read_csv(DKT_csv)
            df["session"] = ses
            df["proc_subset"] = proc_subset
            DKT_status_df = DKT_status_df.append(df)


n_participants_ses2 = len(DKT_status_df[DKT_status_df["session"]=="ses-2"])
n_participants_ses3 = len(DKT_status_df[DKT_status_df["session"]=="ses-3"])

print(f"n_ses-2: {n_participants_ses2}, n_ses-3: {n_participants_ses3}")

DKT_status_df.head()

n_ses-2: 2924, n_ses-3: 3059


Unnamed: 0.1,Unnamed: 0,subject_id,27174,27175,27176,27177,27178,27179,27180,27181,...,27290,27291,27292,27293,27294,27295,27296,27297,session,proc_subset
0,0,sub-3239964,2.491,2.519,1.799,3.039,2.565,2.313,2.679,2.25,...,2.657,2.192,2.409,1.997,2.67,2.302,1.991,2.842,ses-2,pre_ohbm
1,1,sub-1247311,2.985,2.481,1.881,3.213,2.666,2.557,2.892,2.663,...,2.797,2.326,2.585,2.19,2.83,2.583,2.665,2.835,ses-2,pre_ohbm
2,2,sub-2970850,2.672,2.64,2.025,3.262,2.663,2.376,2.751,2.282,...,2.611,2.214,2.519,2.068,2.528,2.316,2.355,2.949,ses-2,pre_ohbm
3,3,sub-3102962,2.787,2.667,2.044,3.54,2.746,2.445,3.066,2.259,...,2.971,2.342,2.76,2.266,2.962,2.618,2.55,2.897,ses-2,pre_ohbm
4,4,sub-5092623,2.635,2.527,2.092,3.138,2.84,2.496,2.979,2.92,...,2.675,2.318,2.608,2.297,2.922,2.692,2.716,3.029,ses-2,pre_ohbm


## Subjects need to be (re-)processed

In [47]:
fmriprep_success_ses2 = fmristatus_df[(fmristatus_df["MNI152NLin6Sym_res-1"]=="Pass") & (fmristatus_df["session"]=="ses-2")]["participant_id"]
fmriprep_success_ses3 = fmristatus_df[(fmristatus_df["MNI152NLin6Sym_res-1"]=="Pass") & (fmristatus_df["session"]=="ses-3")]["participant_id"]

freesurfer_success_ses2 = DKT_status_df[DKT_status_df["session"]=="ses-2"]["subject_id"]
freesurfer_success_ses3 = DKT_status_df[DKT_status_df["session"]=="ses-3"]["subject_id"]

print(f"n_fmriprep success, ses-2: {len(fmriprep_success_ses2)}, ses-3: {len(fmriprep_success_ses3)}")
print(f"n_freesurfer success, ses-2: {len(freesurfer_success_ses2)}, ses-3: {len(freesurfer_success_ses3)}")

fmriprep_reprocesses_ses2 = set(manifest_participant_ids) - set(fmriprep_success_ses2)
fmriprep_reprocesses_ses3 = set(manifest_participant_ids) - set(fmriprep_success_ses3)
freesurfer_reprocess_ses2 = set(manifest_participant_ids) - set(freesurfer_success_ses2)
freesurfer_reprocess_ses3 = set(manifest_participant_ids) - set(freesurfer_success_ses3)

print(f"n_reprocess, fmriprep_ses2: {len(fmriprep_reprocesses_ses2)}, fmriprep_ses3: {len(fmriprep_reprocesses_ses3)}\n \
           freesufer_ses2: {len(freesurfer_reprocess_ses2)}, freesufer_ses3: {len(freesurfer_reprocess_ses3)}")

reprocess_ses2_ids = fmriprep_reprocesses_ses2.union(freesurfer_reprocess_ses2)
reprocess_ses3_ids = fmriprep_reprocesses_ses3.union(freesurfer_reprocess_ses3)

print(f"n_reprocess ids, ses-2: {len(reprocess_ses2_ids)}, ses-3: {len(reprocess_ses3_ids)}")

n_fmriprep success, ses-2: 3039, ses-3: 3058
n_freesurfer success, ses-2: 2924, ses-3: 3059
n_reprocess, fmriprep_ses2: 1660, fmriprep_ses3: 1531
            freesufer_ses2: 1770, freesufer_ses3: 1530
n_reprocess ids, ses-2: 1771, ses-3: 1531


### Save participant_ids lists

In [50]:
save_reproce_ids = False

if save_reproce_ids:
    save_df = pd.DataFrame()
    save_df["participant_id"] = list(reprocess_ses2_ids)
    save_df.to_csv(f"{data_dir}/subject_ids/ukbb_reprocess_ids_ses2.txt",index=None,header=None)

    save_df = pd.DataFrame()
    save_df["participant_id"] = list(reprocess_ses3_ids)
    save_df.to_csv(f"{data_dir}/subject_ids/ukbb_reprocess_ids_ses3.txt",index=None,header=None)