### Notebook to check the dates of the MRI scans

#### Date Sources
1. Suivi sheet 
2. Testing sheet
3. DICOM file 
4. DICOM header

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Dataset locations used throughout the notebook.
# NOTE(review): this is an absolute, user-specific path; adjust it when
# running the notebook on another machine.
# The trailing "/" is kept on purpose so anything that concatenates onto
# dataset_dir / releases_dir keeps producing the same strings.
dataset_dir  = "/home/nikhil/projects/Parkinsons/qpn/"

current_release = "Jan_2024"
releases_dir = f"{dataset_dir}releases/{current_release}/"

# Both directory strings above already end in "/", so the file paths below
# are appended without a leading "/" to avoid "//" inside the paths.

# Current mr_proc manifest
current_manifest_csv = f"{releases_dir}tabular/manifest.csv"

# Current recruit manifest
current_recruit_manifest_xls = f"{releases_dir}tabular/demographics/Suivi_RPQ.xlsx"

# Doughnut path
doughnut_path = f"{dataset_dir}scratch/raw_dicom/doughnut-20230919_1433.csv"

# Date check CSV path
date_check_csv = f"{dataset_dir}scratch/mri_dates_sanity_check.csv"

### Read current manifest

In [None]:
# Load the current manifest and summarise its participants and sessions.
manifest_df = pd.read_csv(current_manifest_csv)

# Collect the summary values first, then report them in one go.
manifest_participants = manifest_df["participant_id"].unique()
sessions = manifest_df["session"].unique()
per_session_counts = manifest_df.value_counts("session")
n_manifest_participants = len(manifest_participants)

print(f"Number of participants in manifest: {n_manifest_participants}")
print(f"sessions: {sessions}")
print(f"per_session_counts: {per_session_counts}")

# Preview the first rows as the cell output.
manifest_df.head()

### Read doughnut data

In [None]:
# Load the doughnut CSV and summarise its participants and sessions.
doughnut_df = pd.read_csv(doughnut_path)

# Use doughnut-specific names so the manifest summary computed in the previous
# cell (manifest_participants / n_manifest_participants) is not overwritten,
# and label the printed count as coming from the doughnut, not the manifest.
doughnut_participants = doughnut_df["participant_id"].unique()
n_doughnut_participants = len(doughnut_participants)
print(f"Number of participants in doughnut: {n_doughnut_participants}")

# NOTE: `sessions` and `per_session_counts` are rebound here to the
# doughnut's values.
sessions = doughnut_df["session"].unique()
print(f"sessions: {sessions}")

per_session_counts = doughnut_df.value_counts("session")
print(f"per_session_counts: {per_session_counts}")

doughnut_df.head()

### Read Suivi_RPQ.xlsx dates for MRI

In [None]:
# Read the MRI bookkeeping columns from the "En cours" sheet of the Suivi
# workbook, keep only participants flagged as scanned, and normalise the
# participant IDs.

# Excel column letters passed to read_excel(usecols=...)
col_range = "A:N"

# Sheet header -> notebook column name
col_rename_dict = {"Unnamed: 0":"participant_id",
                "IRM01\n(J-M-A)":"IRM01_date", "#IRM 1\n PD":"IRM01_PD", "#IRM 1\n CTRL":"IRM01_CTRL", "# IRM 1\n RBD":"IRM01_RBD",
                "IRM 2 \n(J-M-A)":"IRM02_date", "#IRM 2\n PD":"IRM02_PD", "#IRM 2\n CTRL":"IRM02_CTRL", "# IRM 2\n RBD":"IRM02_RBD"}

# Materialise as a list: a dict view is not a documented DataFrame indexer.
useful_cols = list(col_rename_dict.values())

suivi_df = pd.read_excel(current_recruit_manifest_xls,sheet_name="En cours", engine='openpyxl', usecols=col_range)
suivi_df = suivi_df.rename(columns=col_rename_dict)[useful_cols].copy()

# remove the row with tally
suivi_df = suivi_df.drop([0])

# remove rows without participant_id
suivi_df = suivi_df.dropna(axis=0, subset=["participant_id"])
suivi_df = suivi_df[~suivi_df["participant_id"].astype(str).isin(["0"])]

# remove subjects without imaging data: keep a row when any of the visit-1 /
# visit-2 group flags equals 1
imaging_flag_cols = ["IRM01_PD", "IRM01_CTRL", "IRM01_RBD",
                     "IRM02_PD", "IRM02_CTRL", "IRM02_RBD"]
has_imaging = suivi_df[imaging_flag_cols].eq(1).any(axis=1)
# .copy() so the column assignments below (and in later cells) write to an
# independent frame rather than to a filtered selection of the previous one.
suivi_df = suivi_df[has_imaging].copy()


# fix participant_id formatting issues
# Some rows have Dx in participant_id and one participant with two IDs with "="
# Strip first: with leading whitespace the split on " " would return "" as the ID.
suivi_df["participant_id"] = suivi_df["participant_id"].str.strip()
possible_delimiters = [" ", "(", "="]
for delim in possible_delimiters:
    # keep the text before the first delimiter; .str[0] (rather than
    # expand=True followed by [0]) also works when the frame has no rows
    suivi_df["participant_id"] = suivi_df["participant_id"].str.split(pat=delim, n=1).str[0]

# nipoppy_participants_current
nipoppy_participants_current = suivi_df["participant_id"].dropna().unique()

suivi_df.head()

In [None]:
# Spot-check the sheet rows of a single participant.
suivi_df.loc[suivi_df["participant_id"] == "MNI0224"]

In [None]:
# Convert both visit-date columns to datetime.
# The columns hold mixed types: zeros are blanked first, then anything that
# cannot be parsed is coerced to NaT.
# NOTE(review): the sheet headers read "(J-M-A)" while pd.to_datetime is
# called without dayfirst - confirm string-typed cells parse as intended.
for date_col in ("IRM01_date", "IRM02_date"):
    zero_blanked = suivi_df[date_col].replace(0, np.nan)
    suivi_df[date_col] = pd.to_datetime(zero_blanked, errors="coerce")


# Check visit orders: a negative V2-V1 interval means the second visit is
# dated before the first one.
interval_col = "visit_interval (V2-V1) in days"
suivi_df[interval_col] = (suivi_df["IRM02_date"] - suivi_df["IRM01_date"]).dt.days
visits_wit_wrong_order_df = suivi_df[suivi_df[interval_col] < 0]
print(f"Participants with wrong visit order: ({len(visits_wit_wrong_order_df)}) :{visits_wit_wrong_order_df['participant_id'].values}")


suivi_df.head()

### Parse and merge dates and visits

In [None]:
# Reshape the per-visit date columns into long format: one row per
# (participant_id, visit_id) with the sheet date in `suivi_MRI_date`.
# The date columns are renamed to their visit tags before melting, so the
# melted `visit_id` values need no relabelling afterwards.
visit_tag_by_date_col = {"IRM01_date": "MRI_v1", "IRM02_date": "MRI_v2"}

suivi_MRI_date_df = (
    suivi_df[["participant_id", *visit_tag_by_date_col]]
    .rename(columns=visit_tag_by_date_col)
    .melt(
        id_vars=["participant_id"],
        value_vars=list(visit_tag_by_date_col.values()),
        var_name="visit_id",
        value_name="suivi_MRI_date",
    )
)
suivi_MRI_date_df.head()

In [None]:
# Derive a visit tag and a scan date from each DICOM directory name.
# The name is split on "_": the 3rd token is used as the visit tag and the
# 5th token as the date.
# NOTE(review): token positions are taken from the indices used here -
# confirm against the raw_dicom directory naming convention.
doughnut_date_df = doughnut_df[["participant_id", "session", "participant_dicom_dir"]].copy()

# Split once and pick tokens with .str.get(): unlike expand=True followed by
# a column lookup, this yields NaN instead of raising KeyError when no
# directory name has enough tokens.
dicom_dir_tokens = doughnut_date_df["participant_dicom_dir"].str.split("_")
doughnut_date_df["visit_id"] = dicom_dir_tokens.str.get(2)
doughnut_date_df["dicom_date"] = pd.to_datetime(dicom_dir_tokens.str.get(4), errors="coerce", yearfirst=True)

# Report participants whose directory name did not provide a visit tag.
participants_with_missing_dicom_visit_tag = doughnut_date_df[doughnut_date_df["visit_id"].isna()]["participant_id"].unique()
print(f"participants_with_missing_dicom_visit_tag: {participants_with_missing_dicom_visit_tag}")
doughnut_date_df.head()

### Assign missing doughnut visit tags based on the `session` column

In [None]:
# For rows whose DICOM directory name gave no visit tag, fall back to
# "MRI" + the token that follows the first "-" in `session`.
# .str.get(1) yields NaN (instead of a KeyError from expand=True + [1]) when
# no session label contains "-", so the check below can still report the
# rows that remain untagged.
session_suffix = doughnut_date_df["session"].str.split("-", n=2).str.get(1)
doughnut_date_df.loc[doughnut_date_df["visit_id"].isna(), "visit_id"] = "MRI" + session_suffix

# Report participants that still have no visit tag after the fallback.
participants_with_missing_dicom_visit_tag = doughnut_date_df[doughnut_date_df["visit_id"].isna()]["participant_id"].unique()
print(f"participants_with_missing_dicom_visit_tag: {participants_with_missing_dicom_visit_tag}")

# Rename the tags to the MRI_v* form used for the sheet-derived visit ids.
doughnut_date_df["visit_id"] = doughnut_date_df["visit_id"].replace({"MRI01": "MRI_v1", "MRI02":"MRI_v2", "MRI03": "MRI_v3"})
doughnut_date_df.head()

### Merge

In [None]:
# Attach the sheet date to every doughnut row (left join keeps all doughnut
# rows) and compare it with the date parsed from the DICOM directory name.
date_sanity_check_df = pd.merge(doughnut_date_df, suivi_MRI_date_df, on=["participant_id", "visit_id"], how="left")
reorder_cols = ["participant_id", "visit_id", "session", "participant_dicom_dir", "suivi_MRI_date", "dicom_date"]
# .copy() so the column assignments below write to an independent frame
# rather than to a column selection of the merged one.
date_sanity_check_df = date_sanity_check_df[reorder_cols].copy()

date_diff = date_sanity_check_df["suivi_MRI_date"] - date_sanity_check_df["dicom_date"]
dates_equal = date_sanity_check_df["suivi_MRI_date"] == date_sanity_check_df["dicom_date"]

# Don't count unavailable dates as mismatches: where the difference is
# missing, date_match becomes NaN instead of False. Using .where() builds the
# column in one step instead of writing NaN into a boolean column afterwards.
date_sanity_check_df["date_match"] = dates_equal.where(date_diff.notna())
date_sanity_check_df["date_diff"] = date_diff

date_sanity_check_df


In [None]:
# Count participant rows per (visit, session, match outcome). Rows whose
# date_match is NaN do not appear, since groupby drops NaN keys by default.
match_groups = date_sanity_check_df.groupby(["visit_id", "session", "date_match"])
match_groups["participant_id"].count()

In [None]:
# Show the rows whose date_match is False.
is_mismatch = date_sanity_check_df["date_match"].eq(False)
date_sanity_check_df.loc[is_mismatch]

### Plot mismatched date distributions

In [None]:
# Plot the distribution of date differences (in days) for mismatching rows,
# coloured by visit.

# Copy *after* filtering, so that adding the "difference in days" column
# writes to an independent frame rather than to a filtered selection.
plot_df = date_sanity_check_df[date_sanity_check_df["date_match"] == False].copy()
plot_df["difference in days"] = plot_df["date_diff"].dt.days

sns.set(font_scale=1)
with sns.axes_style("whitegrid"):
    g = sns.displot(hue="visit_id", x="difference in days", data=plot_df, aspect=1.5, height=5, kde=True, palette="Set1")

In [None]:
# Toggle for persisting the sanity-check table; set to False to skip the write.
save_date_check_csv = True
if save_date_check_csv:
    print(f"Saving save_date_check_csv here: {date_check_csv}")
    # Write the table without the DataFrame index column.
    date_sanity_check_df.to_csv(date_check_csv, index=False)