In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from datetime import timedelta
import itertools
from tqdm import tqdm
import sys

sys.path.append('../data/NACC/')
from data_cleaning_util import *

In [2]:
# Locations of the raw NACC exports. NOTE: these are absolute, machine-specific paths.
raw_data_dir = "C:/Users/liang/Documents/GitHub/fusion/data/NACC/raw_data/"
mri_export_dir = raw_data_dir + "investigator_scan_mri_nacc66/"

uds_path = raw_data_dir + "investigator_nacc64.csv"
mriqc_path = mri_export_dir + "investigator_scan_mriqc_nacc66.csv"
mrisbm_path = mri_export_dir + "investigator_scan_mrisbm_nacc66.csv"

In [3]:
# Read the three raw CSV exports. Each *_raw name is bound to the SAME DataFrame
# object as its *_data counterpart (an alias, not a copy), so an in-place change
# made through one name is visible through the other.
uds_data = pd.read_csv(uds_path, low_memory=False)
uds_raw = uds_data

mriqc_data = pd.read_csv(mriqc_path)
mriqc_raw = mriqc_data

mrisbm_data = pd.read_csv(mrisbm_path)
mrisbm_raw = mrisbm_data

In [4]:
def find_matches(uds_data, mrisbm_data, max_gap_days=540):
    """Pair each MRI scan with the same subject's closest UDS visit on or before the scan.

    A visit is eligible for a scan when both rows share the same ``NACCID`` and the
    visit date lies between ``max_gap_days`` days before the scan and the scan date
    itself (both ends inclusive). Among eligible visits the one closest to the scan
    is chosen; ties go to the visit that appears first in ``uds_data``. Scans with
    no eligible visit are left out.

    Neither input frame is modified.

    Parameters
    ----------
    uds_data : pandas.DataFrame
        Visit table with columns ``NACCID``, ``VISITYR``, ``VISITMO`` and ``VISITDAY``.
        The three date columns must be castable to ``int`` for every subject that
        also appears in ``mrisbm_data``.
    mrisbm_data : pandas.DataFrame
        Scan table with columns ``NACCID`` and ``SCANDT`` (anything
        ``pandas.to_datetime`` can parse).
    max_gap_days : int or float, default 540
        Largest allowed number of days between a visit and the scan that follows it
        (540 = 18 months of 30 days).

    Returns
    -------
    matched_visits : pandas.DataFrame
        One row per matched scan: the chosen visit row plus ``VISITDT`` (visit date)
        and ``TIME_DIFF`` (scan date minus visit date).
    matched_scans : pandas.DataFrame
        The matched scan rows, with ``SCANDT`` parsed to datetimes.

    Both frames have a fresh 0..n-1 index, are ordered by ``NACCID`` then ``SCANDT``,
    and row ``i`` of one corresponds to row ``i`` of the other. When nothing matches,
    two empty frames carrying the expected columns are returned.
    """
    # Parse scan dates on a copy so the caller's SCANDT column keeps its original type.
    scans = mrisbm_data.assign(SCANDT=pd.to_datetime(mrisbm_data['SCANDT']))

    # Only subjects that were scanned can ever match. Filtering first produces a new,
    # much smaller frame, so the columns added below never touch the caller's table.
    visits = uds_data[uds_data['NACCID'].isin(scans['NACCID'])]
    date_parts = visits[['VISITYR', 'VISITMO', 'VISITDAY']].astype(int)
    visits = visits.assign(
        VISITMO=date_parts['VISITMO'],
        VISITDAY=date_parts['VISITDAY'],
        VISITYR=date_parts['VISITYR'],
        VISITDT=pd.to_datetime(date_parts.rename(
            columns={'VISITYR': 'year', 'VISITMO': 'month', 'VISITDAY': 'day'})),
    )

    # Row positions of every subject's visits, computed once instead of re-filtering
    # the whole visit table for each scan.
    positions_by_subject = visits.groupby('NACCID').indices
    no_gap = pd.Timedelta(0)
    max_gap = pd.Timedelta(days=max_gap_days)

    matches = []
    for _, scan in scans.iterrows():
        positions = positions_by_subject.get(scan['NACCID'])
        if positions is None:
            continue  # this subject has no visits at all

        subject_visits = visits.iloc[positions]
        elapsed = scan['SCANDT'] - subject_visits['VISITDT']

        # Visit must fall on or before the scan and within the allowed window.
        # A missing scan date yields NaT differences, which fail both comparisons.
        within_window = (elapsed >= no_gap) & (elapsed <= max_gap)
        if not within_window.any():
            continue

        candidates = subject_visits[within_window].assign(TIME_DIFF=elapsed[within_window])
        # Select by position so duplicated index labels cannot return several rows.
        nearest = candidates['TIME_DIFF'].to_numpy().argmin()
        matches.append((candidates.iloc[nearest], scan))

    if not matches:
        # Building frames from an empty list would give column-less frames and make
        # the sort below fail; return correctly shaped empty frames instead.
        empty_visits = visits.iloc[:0].assign(TIME_DIFF=pd.Series(dtype='timedelta64[ns]'))
        return empty_visits, scans.iloc[:0].copy()

    matched_visits = pd.DataFrame([visit for visit, _ in matches]).reset_index(drop=True)
    matched_scans = pd.DataFrame([scan for _, scan in matches]).reset_index(drop=True)

    # Apply one shared ordering (subject, then scan date) to both frames so that each
    # visit stays on the same row as the scan it was matched to.
    order = matched_scans.sort_values(by=['NACCID', 'SCANDT'], kind='mergesort').index
    matched_visits = matched_visits.loc[order].reset_index(drop=True)
    matched_scans = matched_scans.loc[order].reset_index(drop=True)

    return matched_visits, matched_scans

In [5]:
# Pair every MRI scan with its UDS visit; both results are row-aligned.
uds_matched_raw, mrisbm_matched_raw = find_matches(
    uds_data=uds_data,
    mrisbm_data=mrisbm_data,
)

In [44]:
# UDS: keep only the variables whose share of missing values is at most 10%.
missing_proportion = uds_matched_raw.isna().mean()
uds_matched = uds_matched_raw.loc[:, missing_proportion.le(0.1)]

# MRI: remove columns that are irrelevant to prediction (identifiers, scan date,
# image IDs/descriptions, software version).
columns_to_drop = [
    'NACCID',
    'NACCADC',
    'SCANDT',
    'LONI_IMAGE_FLAIR',
    'DESCRIPTION_FLAIR',
    'LONI_IMAGE_T1',
    'DESCRIPTION_T1',
    'FREESURFER_VERSION',
]

# The remaining MRI columns may still contain missing values; imputation is
# deferred until after the train/test split.
mrisbm_matched = mrisbm_matched_raw.drop(columns_to_drop, axis='columns')

In [45]:
# Sanity check: (rows, columns) of the UDS frame after dropping sparse columns.
uds_matched.shape

(1426, 895)

## Preprocessing Version 1
Two modalities: UDS and MRI

Extract features based on Yueqi's preprocessing

In [46]:
# UDS preprocessing, step 1: group the available columns by feature type.
# cdr_feats is returned too but is not part of the selection below.
(cat_feats, ord_feats, num_feats,
 label_feat, cdr_feats) = get_feature_types(uds_matched)

# Keep categorical, ordinal, numeric and label columns, in that order.
# NOTE: this rebinds uds_matched to the reduced frame.
all_uds_feats = cat_feats + ord_feats + num_feats + label_feat
uds_matched = uds_matched.loc[:, all_uds_feats]

In [47]:
# Sanity check: (rows, columns) of the UDS frame after feature selection.
uds_matched.shape

(1426, 135)

In [None]:
# UDS preprocessing, step 2: encode the selected features.
# The encoded frame can still contain missing values; imputation is deferred
# until after the train/test split.
ordinal_preprocessor = encode_feature_types(uds_matched)

encoded_values = ordinal_preprocessor.transform(uds_matched)
encoded_columns = ordinal_preprocessor.get_feature_names_out()
encoded_uds = pd.DataFrame(
    encoded_values,
    columns=encoded_columns,
    index=uds_matched.index.values,
)

In [48]:
# Save data version 1 (2 modalities in total: UDS and MRI).
data_dir = "C:/Users/liang/Documents/GitHub/fusion/data/NACC/preprocessed_v1/"

outputs_v1 = [
    ('uds_matched.csv', encoded_uds),
    ('mrisbm_matched.csv', mrisbm_matched),
]
for file_name, frame in outputs_v1:
    # The index is not written; rows are identified by position only.
    frame.to_csv(data_dir + file_name, index=False)

## Preprocessing Version 2
Three modalities: a UDS-derived history modality, a UDS-derived survey modality, and MRI.

Extract features based on Yueqi's preprocessing.

In [None]:
# UDS preprocessing, step 1: group the available columns by feature type.
# cdr_feats is returned too but is not part of the selection below.
# NOTE: uds_matched is both input and output of this cell, so when an earlier
# section has already run, the helper sees the already reduced frame.
(cat_feats, ord_feats, num_feats,
 label_feat, cdr_feats) = get_feature_types(uds_matched)

# Keep categorical, ordinal, numeric and label columns, in that order.
all_uds_feats = cat_feats + ord_feats + num_feats + label_feat
uds_matched = uds_matched.loc[:, all_uds_feats]

In [None]:
# UDS preprocessing, step 2: encode the selected features.
# The encoded frame can still contain missing values; imputation is deferred
# until after the train/test split.
ordinal_preprocessor = encode_feature_types(uds_matched)

encoded_values = ordinal_preprocessor.transform(uds_matched)
encoded_columns = ordinal_preprocessor.get_feature_names_out()
encoded_uds = pd.DataFrame(
    encoded_values,
    columns=encoded_columns,
    index=uds_matched.index.values,
)

In [89]:
# Split the encoded UDS columns into a history set, a survey set and the label.
# get_modality_features is not defined in this notebook (presumably it comes from the
# data_cleaning_util star import); its return values are used below as column selectors.
# Note: this rebinds label_feat, which get_feature_types also assigned earlier.
mod_history, mod_survey, label_feat = get_modality_features(encoded_uds)

In [90]:
# Slice the encoded UDS frame into one frame per modality, plus the label.
# All three are taken from the same encoded_uds, so they share its row order and index.
uds_history = encoded_uds[mod_history]
uds_survey = encoded_uds[mod_survey]
label = encoded_uds[label_feat]

In [79]:
# Save data version 2 (3 modalities in total, plus the label file).
data_dir = "C:/Users/liang/Documents/GitHub/fusion/data/NACC/preprocessed_v2/"

outputs_v2 = [
    ('uds_history.csv', uds_history),
    ('uds_survey.csv', uds_survey),
    ('mrisbm.csv', mrisbm_matched),
    ('label.csv', label),
]
for file_name, frame in outputs_v2:
    # The index is not written; rows are identified by position only.
    frame.to_csv(data_dir + file_name, index=False)

## Preprocessing Version 3
Four modalities: UDS-derived history, UDS-derived survey, and UDS-derived clinical/testing modalities, plus MRI.

In [64]:
# UDS preprocessing, step 1: group the available columns by feature type.
# cdr_feats is returned too but is not part of the selection below.
# NOTE: uds_matched is both input and output of this cell, so when an earlier
# section has already run, the helper sees the already reduced frame.
(cat_feats, ord_feats, num_feats,
 label_feat, cdr_feats) = get_feature_types(uds_matched)

# Keep categorical, ordinal, numeric and label columns, in that order.
all_uds_feats = cat_feats + ord_feats + num_feats + label_feat
uds_matched = uds_matched.loc[:, all_uds_feats]

In [65]:
# UDS preprocessing, step 2: encode the selected features.
# The encoded frame can still contain missing values; imputation is deferred
# until after the train/test split.
ordinal_preprocessor = encode_feature_types(uds_matched)

encoded_values = ordinal_preprocessor.transform(uds_matched)
encoded_columns = ordinal_preprocessor.get_feature_names_out()
encoded_uds = pd.DataFrame(
    encoded_values,
    columns=encoded_columns,
    index=uds_matched.index.values,
)

In [67]:
# Split the encoded UDS columns into history, survey and testing sets plus the label.
# With fine_grained=True the helper returns four values instead of three.
# get_modality_features is not defined in this notebook (presumably it comes from the
# data_cleaning_util star import); its return values are used below as column selectors.
# Note: this rebinds label_feat, which get_feature_types also assigned earlier.
mod_history, mod_survey, mod_testing, label_feat = get_modality_features(encoded_uds, fine_grained=True)

In [72]:
# Slice the encoded UDS frame into one frame per modality, plus the label.
# All four are taken from the same encoded_uds, so they share its row order and index.
uds_history = encoded_uds[mod_history]
uds_survey = encoded_uds[mod_survey]
uds_testing = encoded_uds[mod_testing]
label = encoded_uds[label_feat]

In [74]:
# Save data version 3 (4 modalities in total, plus the label file).
data_dir = "C:/Users/liang/Documents/GitHub/fusion/data/NACC/preprocessed_v3/"

outputs_v3 = [
    ('uds_history.csv', uds_history),
    ('uds_survey.csv', uds_survey),
    ('uds_testing.csv', uds_testing),
    ('mrisbm.csv', mrisbm_matched),
    ('label.csv', label),
]
for file_name, frame in outputs_v3:
    # The index is not written; rows are identified by position only.
    frame.to_csv(data_dir + file_name, index=False)