In [None]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import joblib
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler

In [None]:
path_rds_diagnoses = "/rds/general/user/meb22/projects/ukbiobank/live/ukbiobank/data_2025/UKBB_diagnoses_Jun2025/processed/diagnoses_extracted"

In [None]:
# Load the Olink measurements and keep only the rows with ins_index == 0.
df = pd.read_table("/rds/general/user/meb22/projects/ukbiobank/live/ukbiobank/omics/olink_data.txt")
df = df[df["ins_index"] == 0]

# Coding table: the `meaning` column is split on ';' into an abbreviation and a name.
codings = pd.read_csv('/rds/general/user/meb22/projects/ukbiobank/live/ukbiobank/codings/coding143.tsv', sep='\t')
# n=1 splits on the first ';' only, so a `meaning` value containing further ';'
# characters still produces exactly the two columns assigned below.
codings[['protein_abbr', 'protein_name']] = (
    codings['meaning']
    .str.split(';', n=1, expand=True)
    .rename(columns={0: 'protein_abbr', 1: 'protein_name'})
)
codings = codings.rename(columns={"coding": "protein_id"})

# Attach the protein abbreviation to every measurement and keep the long-format
# columns used downstream. `df` is already restricted to ins_index == 0, so the
# merged frame needs no second filter.
df_merged = pd.merge(codings, df, on=["protein_id"]).rename(columns={"result": "Expression levels"})
df_merged = df_merged[["eid", "protein_abbr", "Expression levels"]]


In [None]:
all_proteins = df_merged["protein_abbr"].unique().tolist()

In [None]:
# Persist the protein abbreviations, one per line.
with open("all_proteins.txt", "w") as out_file:
    out_file.writelines(f"{protein}\n" for protein in all_proteins)

In [None]:
# The previous code prepended the column name to the ID list, which implies the
# file has no header row (TODO confirm). Reading with header=None keeps the first
# ID as a data value with the same dtype as the rest; taken from the header it
# would be a string and would not match a numeric `eid` in `isin`.
withdrawn = pd.read_csv("../Feature_selection_ligh/withdrawnparticipants.csv", header=None)
ids_withdrawn_list = withdrawn.iloc[:, 0].tolist()

In [None]:
len(ids_withdrawn_list)

In [None]:
# Drop every measurement that belongs to a withdrawn participant.
is_withdrawn = df_merged["eid"].isin(ids_withdrawn_list)
df_merged = df_merged[~is_withdrawn]

# Reshape long -> wide: one row per eid, one column per protein abbreviation.
proteomics_df = (
    df_merged
    .pivot_table(index=['eid'], columns='protein_abbr', values='Expression levels')
    .reset_index()
)

In [None]:
# Covariate table; the first column of the CSV is discarded.
covariates = pd.read_csv("/rds/general/user/meb22/projects/ukbiobank/live/ukbiobank/data_2025/proteomics/Processed_all_covariates.csv").iloc[:,1:]

# Covariates that must be non-missing. Each name appears once: a repeated name
# would make `covariates[covariate_cols]` return a duplicated column.
covariate_cols = ["Age","Sex","Ethnicity","BMI","Season","fasting_time","sample_age","smoking","alcohol"]

In [None]:
covariates[covariate_cols].isna().sum()

In [None]:
covariates = covariates.dropna(subset=covariate_cols)

In [None]:
proteomics_df = proteomics_df[proteomics_df["eid"].isin(covariates["eid"])]

In [None]:
proteomics_ids = proteomics_df["eid"].unique().tolist()

In [None]:
len(proteomics_ids)

In [None]:
# Read the per-disease diagnosis extracts from `path_rds_diagnoses`.
def _load_diagnosis(filename, drop_first_col=False):
    """Read one diagnosis CSV, optionally discarding its first column."""
    table = pd.read_csv(f"{path_rds_diagnoses}/{filename}")
    if drop_first_col:
        return table.iloc[:, 1:]
    return table

# These extracts have their first column dropped on load.
pd_df = _load_diagnosis("Parkinson.csv", drop_first_col=True)
ad_df = _load_diagnosis("AlzheimersDisease.csv", drop_first_col=True)
vd_df = _load_diagnosis("VascularDementia.csv", drop_first_col=True)
msa_df = _load_diagnosis("MSA.csv", drop_first_col=True)
pnp_df = _load_diagnosis("PNP.csv", drop_first_col=True)
# These extracts are kept exactly as read.
ftd_df = _load_diagnosis("frontotemporaldementia.csv")
ht_df = _load_diagnosis("huntingtondisease.csv")
allcauseparkinsonism_df = _load_diagnosis("AllCauseParkinsonism.csv")
ms_df = _load_diagnosis("multiple_sclerosis.csv")
# Control group, first column dropped as well.
hc_df = _load_diagnosis("Not_neurological_noantiparkinsonismdrug.csv", drop_first_col=True)

# Assign disease name and unify the date column
def prepare_df(df, disease_label, date_col="date_diagnosis"):
    """Return a standardised three-column diagnosis frame for one disease.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``eid`` column and the column named by ``date_col``.
    disease_label : str
        Value written to the ``Disease`` column of every row.
    date_col : str, default "date_diagnosis"
        Name of the column in ``df`` holding the diagnosis date. It is parsed
        and always returned under the unified name ``date_diagnosis``.

    Returns
    -------
    pandas.DataFrame
        Columns ``eid``, ``date_diagnosis`` (datetime; unparseable values
        become NaT) and ``Disease``. The input frame is not modified.
    """
    prepared = df.copy()
    prepared["Disease"] = disease_label
    # Read from the caller-specified column; previously `date_col` was ignored
    # and "date_diagnosis" was hard-coded.
    prepared["date_diagnosis"] = pd.to_datetime(prepared[date_col], errors="coerce")
    return prepared[["eid", "date_diagnosis", "Disease"]]

# Build one standardised frame (eid, date_diagnosis, Disease) per disease
diagnosis_dfs = [
    prepare_df(ad_df, "AD"),
    prepare_df(vd_df, "VD"),
    prepare_df(ftd_df, "FTD"),
    prepare_df(ht_df, "HD"),
    prepare_df(ms_df, "MS"),
    prepare_df(pd_df, "PD"),
    prepare_df(msa_df, "MSA"),
    prepare_df(pnp_df, "PNP"),
]

# Concatenate all diseases
all_diseases_df = pd.concat(diagnosis_dfs, ignore_index=True)
# Rows with any missing value (e.g. an unparseable diagnosis date) are removed,
# so every remaining row has a valid date.
all_diseases_df = all_diseases_df.dropna()
# Order chronologically so that "first" later means "earliest diagnosis"
all_diseases_df = all_diseases_df.sort_values("date_diagnosis")

# Label the healthy-control frame (modifies hc_df in place)
hc_df["Disease"] = "HC"

In [None]:
pd_original = pd_df.copy(deep=True)

# Visualisation before handling duplicates

In [None]:
# Rows whose eid occurs on more than one row (keep=False flags every occurrence).
repeated_eid_rows = all_diseases_df["eid"].duplicated(keep=False)
duplicates = all_diseases_df[repeated_eid_rows]

# Number of rows per disease label across the whole table.
disease_counts_total = all_diseases_df["Disease"].value_counts()

# Rows per eid: once for the repeated eids only, once for every eid.
diagnosis_counts = (
    duplicates
    .groupby("eid")
    .size()
    .reset_index(name="num_diagnoses")
)
diagnosis_counts_all = (
    all_diseases_df
    .groupby("eid")
    .size()
    .reset_index(name="num_diagnoses")
)

In [None]:
all_diseases_df

In [None]:
# Horizontal bar chart of the number of rows per disease label
plt.figure(figsize=(12, 6))
ax = disease_counts_total.plot(kind="barh")
ax.invert_yaxis()  # most common disease on top
plt.xlabel("Number of Diagnoses")
plt.ylabel("Disease")
plt.title("Number of participants with each diagnosis (before removing duplicates)")

# Write each count just past the end of its bar
for bar_position, count in enumerate(disease_counts_total):
    label_x = count + max(disease_counts_total) * 0.01
    ax.text(label_x, bar_position, f"{count:,}", va='center')

plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(data=diagnosis_counts_all, x="num_diagnoses")
plt.title("Number of diagnoses per duplicated eid")
plt.xlabel("Number of Diagnoses")
plt.ylabel("Number of eids")

# Add numbers on top of bars
ax = plt.gca()
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2, height + 1, f'{int(height)}', 
            ha='center', va='bottom')

plt.show()


In [None]:
disease_repeats = duplicates.groupby(["eid", "Disease"]).size().reset_index(name="count")
disease_counts_all = all_diseases_df.groupby(["eid", "Disease"]).size().reset_index(name="count")
pivot = disease_counts_all.pivot(index="eid", columns="Disease", values="count").fillna(0)


In [None]:
# keep only eids with 3+ diagnoses
filtered_three = pivot[pivot.sum(axis=1) >= 3]
plt.figure(figsize=(12, 6))
sns.heatmap(filtered_three, cmap="Blues", linewidths=0.5, cbar=False)
plt.title("Eids with more than 2 diagnoses")
plt.xlabel("Disease")
plt.ylabel("eid")
plt.tight_layout()
plt.show()


In [None]:
# keep only eids with 4+ diagnoses (row sum of the per-disease counts)
filtered_four = pivot[pivot.sum(axis=1) >= 4]

plt.figure(figsize=(12, 6))
sns.heatmap(filtered_four, cmap="Blues", linewidths=0.5, cbar=False)
# The filter is ">= 4", i.e. more than 3 (the old title said "more than 4")
plt.title("Eids with more than 3 diagnoses")
plt.xlabel("Disease")
plt.ylabel("eid")
plt.tight_layout()
plt.show()


In [None]:
binary_bool = pivot.astype(bool)


In [None]:
from upsetplot import from_indicators

data_for_upset = from_indicators(binary_bool.columns, binary_bool)

In [None]:
import matplotlib.pyplot as plt
from upsetplot import UpSet

plt.figure(figsize=(10, 6))
upset = UpSet(data_for_upset, subset_size='count', show_counts='%d', facecolor='navy')
upset.plot()

# Add label below the disease names
plt.xlabel("")

plt.title("Diagnosis combinations per participant")
plt.show()


# Removing PD with other diagnoses 

In [None]:
# Every eid present in the PD extract
pd_people = pd_df["eid"].unique().tolist()

# Diagnosis rows of those participants, reduced to one row per (eid, Disease)
has_pd = all_diseases_df["eid"].isin(pd_people)
pd_only = all_diseases_df[has_pd].drop_duplicates(subset=["Disease", "eid"])

# After de-duplication, the per-eid row count is the number of distinct diseases
eid_counts = pd_only["eid"].value_counts()

# eids carrying more than one distinct disease label
more_than_one_label = eid_counts > 1
removing_eids = eid_counts[more_than_one_label].index

# Exclude them so PD rows are "clean PD" and the other disease groups contain no PD participants
keep_rows = ~all_diseases_df["eid"].isin(removing_eids)
filtered_df = all_diseases_df[keep_rows]

In [None]:
len(removing_eids)

In [None]:
pd_only

In [None]:
# Rows of eids that still occur more than once after removing comorbid PD participants
duplicates_filtered = filtered_df[filtered_df["eid"].duplicated(keep=False)]
# Rows per disease label after filtering
disease_counts_total_filtered = filtered_df["Disease"].value_counts()
# Rows per eid over all remaining eids. An earlier assignment computed this from
# `duplicates_filtered` and was overwritten on the next line, so it is dropped.
diagnosis_counts_filtered = filtered_df.groupby("eid").size().reset_index(name="num_diagnoses")

In [None]:
# Plot the per-disease counts after removing PD participants with other diagnoses
plt.figure(figsize=(12, 6))
ax= disease_counts_total_filtered.plot(kind="barh")
plt.gca().invert_yaxis()  # most common disease on top
plt.xlabel("Number of Diagnoses")
plt.ylabel("Disease")
plt.title("Number of participants with each diagnosis (after removing duplicates)")
# Add text labels at the end of bars. The labels must come from the same series
# that is plotted; the unfiltered counts would show wrong numbers and positions.
for i, count in enumerate(disease_counts_total_filtered):
    ax.text(count + max(disease_counts_total_filtered) * 0.01, i, f"{count:,}", va='center')

plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(data=diagnosis_counts_filtered, x="num_diagnoses")
plt.title("Number of diagnoses per duplicated eid after removing PD participants with duplicate diagnosis")
plt.xlabel("Number of Diagnoses")
plt.ylabel("Number of eids")

# Add numbers on top of bars
ax = plt.gca()
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2, height + 1, f'{int(height)}', 
            ha='center', va='bottom')

plt.show()


# Visualisation after handling PD patients with more than one neurodegenerative diagnosis

In [None]:
# Collapse to one row per eid: all distinct disease labels joined, plus the earliest date
def _join_unique_labels(labels):
    """Join the distinct labels of one eid, alphabetically, with ' / '."""
    return " / ".join(sorted(labels.unique()))

rows_by_date = filtered_df.sort_values("date_diagnosis")  # earliest dates first
combined_diagnoses = (
    rows_by_date
    .groupby("eid")
    .agg({
        "Disease": _join_unique_labels,
        "date_diagnosis": "first",  # earliest date, given the sort above
    })
    .reset_index()
)

# Flag eids whose joined label contains more than one disease
combined_diagnoses["multi_disease"] = combined_diagnoses["Disease"].str.contains("/")

# Put the multi-disease rows first
combined_diagnoses = combined_diagnoses.sort_values(by="multi_disease", ascending=False)

# Keep the first row per eid and drop the helper flag
final_combined = (
    combined_diagnoses
    .drop_duplicates(subset=["eid"], keep="first")
    .drop(columns="multi_disease")
)

print(final_combined)


In [None]:
final_combined

In [None]:
# Boolean mask of duplicated EIDs
duplicates_mask = final_combined["eid"].duplicated(keep=False)  # keep=False marks all duplicates as True

# Filter the dataframe to only show duplicated rows
duplicated_eids = final_combined[duplicates_mask]

# Unique EIDs that are actually duplicated. This must be taken from the
# duplicated rows; taking it from `final_combined` would list every eid.
duplicated_eid_list = duplicated_eids["eid"].unique()



In [None]:
combined_diagnoses[combined_diagnoses["eid"].isin(duplicated_eid_list)]["Disease"]

In [None]:
diagnosis_counts = final_combined.groupby("eid").size().reset_index(name="num_diagnoses")
disease_counts_final = final_combined["Disease"].value_counts()


In [None]:
disease_counts_final

In [None]:
# Plot
plt.figure(figsize=(12, 6))
ax= disease_counts_final.plot(kind="barh")
plt.gca().invert_yaxis()  # most common disease on top
plt.xlabel("Number of Diagnoses")
plt.ylabel("Disease")
plt.title("Number of participants with each diagnosis (after removing duplicates)")
# Add text labels at the end of bars
for i, (count, name) in enumerate(zip(disease_counts_final, disease_counts_final.index)):
    ax.text(count + max(disease_counts_final) * 0.01, i, f"{count:,}", va='center')

plt.tight_layout()
plt.show()

In [None]:
combined_final_proteomics = combined_diagnoses[combined_diagnoses["eid"].isin(proteomics_ids)]

In [None]:
combined_final_proteomics

In [None]:
diagnosis_counts_proteomics = combined_final_proteomics.groupby("eid").size().reset_index(name="num_diagnoses")
disease_counts_final_proteomics = combined_final_proteomics["Disease"].value_counts()
# Plot
plt.figure(figsize=(12, 6))
ax= disease_counts_final_proteomics.plot(kind="barh")
plt.gca().invert_yaxis()  # most common disease on top
plt.xlabel("Number of Diagnoses")
plt.ylabel("Disease")
plt.title("Number of participants with proteomics with each diagnosis (after removing duplicates)")
# Add text labels at the end of bars
for i, (count, name) in enumerate(zip(disease_counts_final_proteomics, disease_counts_final_proteomics.index)):
    ax.text(count + max(disease_counts_final_proteomics) * 0.01, i, f"{count:,}", va='center')

plt.tight_layout()
plt.show()

In [None]:
combined_final_proteomics

In [None]:
pd_df= filtered_df[filtered_df["Disease"] == "PD"]

In [None]:
pd_df = pd_df[pd_df["eid"].isin(proteomics_ids)]
pd_df = pd_df[pd_df["date_diagnosis"].notna()]


In [None]:
other_neurodegenerative_df = final_combined[~final_combined["eid"].isin(pd_df["eid"].unique().tolist())]

In [None]:
other_neurodegenerative_df

In [None]:
other_neurodegenerative_df = other_neurodegenerative_df[other_neurodegenerative_df["eid"].isin(proteomics_ids)]
hc_df = hc_df[hc_df["eid"].isin(proteomics_ids)]
pd_df = pd_df[pd_df["eid"].isin(proteomics_ids)]


In [None]:
other_neurodegenerative_df

In [None]:
pd_df.to_csv("PD_filtered.csv")

In [None]:
pd_df = pd_df.dropna(subset="date_diagnosis")

In [None]:
pd_df

In [None]:
visit_df = pd.read_csv("/rds/general/user/meb22/projects/ukbiobank/live/ukbiobank/data_2025/Visit_dates.csv").iloc[:,1:3]


In [None]:
pd_all = pd.merge(pd_df , visit_df , on="eid")

In [None]:
pd_all

In [None]:
pd_all["visit_date"] = pd.to_datetime(pd_all["p53_i0"], errors='coerce')
pd_all["diff_years"] = (pd_all["date_diagnosis"] - pd_all["visit_date"]).dt.days / 365.25

baseline= pd_all[pd_all["diff_years"] <= 2]
prodromals = pd_all[pd_all["diff_years"] > 2]

In [None]:
prodromals

In [None]:
baseline

In [None]:
visit_df.rename(columns={"p53_i0":"visit_date"},inplace=True)

In [None]:
def filter_and_sample(df, visit_df):
    """Split `df` into a training and a test frame.

    NOTE: a later cell in this notebook defines `filter_and_sample` again with
    extra parameters; that later definition replaces this one once it is run.

    `df` is left-merged with `visit_df` on "eid". If the merged frame has a
    column whose name contains "date_diagnosis", the first such column is used
    to compute `diff_years` = (diagnosis date - visit date) in years:

    * rows with diff_years <= 2 all go to the training frame;
    * rows with diff_years > 2 are split 70% train / 30% test;
    * rows whose diff_years is missing satisfy neither comparison and are
      therefore in neither output.

    Without such a column the whole frame is split 70/30 at random.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
    """
    df = df.merge(visit_df, on="eid", how="left")

    # Any column whose name contains "date_diagnosis" counts as a diagnosis-date column
    date_cols = [col for col in df.columns if "date_diagnosis" in col]

    if date_cols:
        diagnosis_col = date_cols[0]
        df[diagnosis_col] = pd.to_datetime(df[diagnosis_col], errors='coerce')
        df["visit_date"] = pd.to_datetime(df["visit_date"], errors='coerce')
        df["diff_years"] = (df[diagnosis_col] - df["visit_date"]).dt.days / 365.25
        # Keep everyone diagnosed before, or up to 2 years after, the visit date
        baseline= df[df["diff_years"] <= 2]
        prodromals = df[df["diff_years"] > 2]
        # Keep 70% of those diagnosed more than 2 years after the visit date for training
        df_prodromals_train = prodromals.sample(frac=0.7, random_state=0)
        df_test = prodromals.drop(df_prodromals_train.index)

        df_train = pd.concat([baseline, df_prodromals_train], ignore_index=True)
    else:
        # No diagnosis date available: plain random 70/30 split
        df_train = df.sample(frac=0.7, random_state=0)
        df_test = df.drop(df_train.index)

    return df_train, df_test


In [None]:
hc_df = hc_df.drop_duplicates()

In [None]:
def filter_and_sample(
    df,
    visit_df,
    baseline_strategy="train",   # "train", "split", or "none" (None is accepted too)
    prodromal_frac=0.7, 
    random_state=0,
    **kwargs
):
    """Split `df` into training and test frames relative to the visit date.

    `df` is left-merged with `visit_df` on "eid". If the merged frame has a
    column whose name contains "date_diagnosis", the first such column is used
    to compute `diff_years` = (diagnosis date - visit date) in years. Rows are
    then classed as *baseline* (diff_years <= 2) or *prodromal* (diff_years > 2).
    Rows with a missing `diff_years` match neither class and appear in neither
    output.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "eid".
    visit_df : pandas.DataFrame
        Must contain "eid" and "visit_date".
    baseline_strategy : {"train", "split", "none"} or None
        "train": all baseline rows go to training.
        "split": baseline rows are split using ``baseline_frac``.
        "none" / None: baseline rows are discarded.
    prodromal_frac : float
        Fraction of prodromal rows sampled into training.
    random_state : int
        Seed passed to every ``DataFrame.sample`` call.
    **kwargs
        ``baseline_frac`` (default 0.7), used only when
        ``baseline_strategy == "split"``.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame

    Raises
    ------
    ValueError
        If a diagnosis-date column exists and `baseline_strategy` is not one of
        the accepted values.
    """
    df = df.merge(visit_df, on="eid", how="left")
    date_cols = [col for col in df.columns if "date_diagnosis" in col]

    if date_cols:
        diagnosis_col = date_cols[0]
        df[diagnosis_col] = pd.to_datetime(df[diagnosis_col], errors="coerce")
        df["visit_date"] = pd.to_datetime(df["visit_date"], errors="coerce")
        df["diff_years"] = (df[diagnosis_col] - df["visit_date"]).dt.days / 365.25

        baseline = df[df["diff_years"] <= 2]
        prodromals = df[df["diff_years"] > 2]

        df_prodromals_train = prodromals.sample(frac=prodromal_frac, random_state=random_state)
        df_prodromals_test = prodromals.drop(df_prodromals_train.index)

        if baseline_strategy == "train":
            df_train = pd.concat([baseline, df_prodromals_train], ignore_index=True)
            df_test = df_prodromals_test

        elif baseline_strategy == "split":
            baseline_frac = kwargs.get("baseline_frac", 0.7)  # only required here
            df_baseline_train = baseline.sample(frac=baseline_frac, random_state=random_state)
            df_baseline_test = baseline.drop(df_baseline_train.index)
            df_train = pd.concat([df_baseline_train, df_prodromals_train], ignore_index=True)
            df_test = pd.concat([df_baseline_test, df_prodromals_test], ignore_index=True)

        # The signature comment and the error message advertise the string
        # 'none'; accept it as well as the None object that callers already pass.
        elif baseline_strategy is None or baseline_strategy == "none":
            df_train = df_prodromals_train
            df_test = df_prodromals_test

        else:
            raise ValueError("baseline_strategy must be 'train', 'split', or 'none'")
    else:
        # No diagnosis date available: plain random 70/30 split
        df_train = df.sample(frac=0.7, random_state=random_state)
        df_test = df.drop(df_train.index)
        print("⚠️ No diagnosis date column found. Using simple random split (70/30).")

    return df_train, df_test


In [None]:
# Controls: baseline rows are not kept (baseline_strategy=None)
hc_training, hc_test = filter_and_sample(hc_df, visit_df, baseline_strategy=None)

# Other neurodegenerative diseases: baseline and prodromal rows are both split 70/30
neurodegenerative_training, neurodegenerative_test = filter_and_sample(
    other_neurodegenerative_df,
    visit_df,
    baseline_strategy="split",
    prodromal_frac=0.7,
    random_state=0,
    baseline_frac=0.7,
)

# PD: every baseline row goes to training, prodromal rows are split 70/30
pd_training, pd_test = filter_and_sample(
    pd_df,
    visit_df,
    baseline_strategy="train",
    prodromal_frac=0.7,
    random_state=0,
)

In [None]:
neurodegenerative_training[neurodegenerative_training["Disease"].str.contains("MSA")]

In [None]:
neurodegenerative_training[neurodegenerative_training["Disease"].str.contains("PNP")]

In [None]:
neurodegenerative_test[neurodegenerative_test["Disease"].str.contains("MSA")]

In [None]:
neurodegenerative_test[neurodegenerative_test["Disease"].str.contains("PNP")]

In [None]:
neurodegenerative_training[neurodegenerative_training["Disease"].str.contains("PNP")]

In [None]:
neurodegenerative_training.to_csv("Neurodegenerative_training_date_diagnosis.csv")
neurodegenerative_test.to_csv("Neurodegenerative_testing_date_diagnosis.csv")

In [None]:
hc_training["eid"].nunique()

In [None]:
hc_test["eid"].nunique()

In [None]:
neurodegenerative_training["eid"].nunique()

In [None]:
neurodegenerative_test["eid"].nunique()

In [None]:
pd_training["eid"].nunique()

In [None]:
pd_test["eid"].nunique()

In [None]:
# I am combining those eids and will repeat the sampling to train and test for the rest of the dataset so that I do not accidentally get eids from the other datasets
# `removing_eids` (PD participants with an additional diagnosis) were taken out
# of `filtered_df`, so they are in none of the three groups below. They are
# excluded explicitly here; otherwise they would fall into the "rest" pool and
# later be labelled Diagnosis = 0 despite having PD.
combined_eids = (
    set(hc_df["eid"].unique())
    | set(other_neurodegenerative_df["eid"].unique())
    | set(pd_df["eid"].unique())
    | set(removing_eids)
)

non_hc_nonneurodegenerative_forfiltering = proteomics_df[~proteomics_df["eid"].isin(combined_eids)]

In [None]:
non_hc_nonneurodegenerative_training, non_hc_nonneurodegenerative_test = filter_and_sample(non_hc_nonneurodegenerative_forfiltering, visit_df)

In [None]:
non_hc_nonneurodegenerative_forfiltering

In [None]:
non_hc_nonneurodegenerative_training["eid"].nunique()

In [None]:
non_hc_nonneurodegenerative_test["eid"].nunique()

In [None]:
# We are creating the label column in the whole df: 1 if the eid is in pd_df, else 0.
# `assign` returns a new frame, which avoids writing a column into a frame that
# was itself produced by boolean filtering (pandas SettingWithCopyWarning).
proteomics_df = proteomics_df.assign(
    Diagnosis=proteomics_df["eid"].isin(pd_df["eid"]).astype(int)
)


In [None]:
training_neurodegenerative = set(neurodegenerative_training["eid"]) | set(pd_training["eid"])
training_non_hc_nonneurodegenerative = set(non_hc_nonneurodegenerative_training["eid"]) | set(pd_training["eid"])
training_hc = set(hc_training["eid"]) | set(pd_training["eid"])


In [None]:
# Union of eids across the four groups. Each term must select the "eid" column:
# set(DataFrame) iterates over column names, not over participant ids.
all_training = (
    set(hc_training["eid"])
    | set(non_hc_nonneurodegenerative_training["eid"])
    | set(neurodegenerative_training["eid"])
    | set(pd_training["eid"])
)
all_test = (
    set(hc_test["eid"])
    | set(non_hc_nonneurodegenerative_test["eid"])
    | set(neurodegenerative_test["eid"])
    | set(pd_test["eid"])
)

In [None]:
len(all_training)

In [None]:
len(all_test)

In [None]:
all_eids = pd.concat([
    hc_df["eid"],
    other_neurodegenerative_df["eid"],
    pd_df["eid"],
    non_hc_nonneurodegenerative_forfiltering["eid"]
])

len(all_eids.unique())


In [None]:
all_eids = pd.concat([
    non_hc_nonneurodegenerative_training["eid"],
    non_hc_nonneurodegenerative_test["eid"],
    pd_training["eid"],
    pd_test["eid"],
    neurodegenerative_training["eid"],
    neurodegenerative_test["eid"],
    hc_training["eid"],
    hc_test["eid"],
])

In [None]:
len(all_eids.unique())

In [None]:
proteomics_df

In [None]:
# Ensure you're always working with EIDs (Series or list-like)
training_eids = (
    set(hc_training["eid"]) |
    set(non_hc_nonneurodegenerative_training["eid"]) |
    set(neurodegenerative_training["eid"]) |
    set(pd_training["eid"])
)

test_eids = (
    set(hc_test["eid"]) |
    set(non_hc_nonneurodegenerative_test["eid"]) |
    set(neurodegenerative_test["eid"]) |
    set(pd_test["eid"])
)

# All unique EIDs across training and test
all_eids = training_eids | test_eids

print(f"Total unique EIDs: {len(all_eids)}")


In [None]:
len(test_eids)

In [None]:
len(training_eids)

In [None]:
all_eids = list(all_eids)  # Convert set to list


In [None]:
proteomics_df_training_neurodegenerative = proteomics_df[proteomics_df["eid"].isin(training_neurodegenerative)].set_index(["eid"])
proteomics_df_hc = proteomics_df[proteomics_df["eid"].isin(training_hc)].set_index(["eid"])
proteomics_non_hc_nonneurodegenerative  = proteomics_df[proteomics_df["eid"].isin(training_non_hc_nonneurodegenerative)].set_index(["eid"])

In [None]:
X_neurodegenerative = proteomics_df_training_neurodegenerative.drop(columns=["Diagnosis"])
y_neurodegenerative = proteomics_df_training_neurodegenerative["Diagnosis"]

X_hc = proteomics_df_hc.drop(columns=["Diagnosis"])
y_hc = proteomics_df_hc["Diagnosis"]

X_non_hc_nonneurodegenerative = proteomics_non_hc_nonneurodegenerative.drop(columns=["Diagnosis"])
y_non_hc_nonneurodegenerative = proteomics_non_hc_nonneurodegenerative["Diagnosis"]

In [None]:
proteomics_df_hc.to_csv("Training_healthycontrol.csv")

In [None]:
proteomics_df_training_neurodegenerative.to_csv("Training_neurodegenerative.csv")

In [None]:
proteomics_df_training_neurodegenerative

In [None]:
protein_cols = proteomics_df.columns[1:-1]

In [None]:
whole_training = proteomics_df[proteomics_df["eid"].isin(training_eids)]

In [None]:
whole_testing = proteomics_df[proteomics_df["eid"].isin(test_eids)]

In [None]:
whole_training.to_csv("Training_all.csv")

In [None]:
whole_testing.to_csv("Testing_all.csv")

In [None]:
combined_diagnoses[combined_diagnoses["eid"].isin(whole_training["eid"])]["Disease"].value_counts()

In [None]:
combined_diagnoses

In [None]:
combined_diagnoses[combined_diagnoses["eid"].isin(whole_testing["eid"])]["Disease"].value_counts()

In [None]:
intersection_eids = (test_eids.intersection(training_eids))


In [None]:
intersection_eids

In [None]:
rest_toadd = proteomics_df[proteomics_df["eid"].isin(removing_eids)]

In [None]:
rest_toadd

In [None]:
pd_rest = pd_original[pd_original["eid"].isin(rest_toadd["eid"].unique().tolist())]

In [None]:
pd_rest.to_csv("PD_comorbidOND.csv")

In [None]:
# Label these participants as PD. `rest_toadd` is a filtered slice of
# proteomics_df, so build a new frame with `assign` rather than setting a column
# on the slice (avoids pandas SettingWithCopyWarning).
rest_toadd = rest_toadd.assign(Diagnosis=1)

In [None]:
rest_toadd.to_csv("PD_comorbidONDProteomics.csv")

In [None]:
ond_prodromals = neurodegenerative_training[neurodegenerative_training["diff_years"] > 2]

In [None]:
pd_baseline= pd_training[pd_training["diff_years"] <= 2]

In [None]:
training_onlyprodromals = whole_training[~whole_training["eid"].isin(pd_baseline["eid"].unique())]

In [None]:
training_onlyprodromals.to_csv("Training_all_prodromals.csv")

In [None]:
training_onlybaseline = whole_training[whole_training["eid"].isin(pd_baseline["eid"].unique())]

In [None]:
training_onlybaseline.to_csv("Training_baselinePD.csv")

In [None]:
training_onlyprodromals

In [None]:
training_onlybaseline