In [None]:
# This is a test script for dimensionality reduction work

In [None]:
import csv
import glob

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Input locations used by the cells below. The "<...>" values look like
# redacted placeholders -- presumably they must be replaced with real paths
# before the notebook can run (TODO confirm).

# Pickle file holding the image-embedding DataFrame (read with pd.read_pickle).
IMG_EMB_LOC = "<...>"
# Directory that is globbed for "*.csv" sectioned-report files.
SECTIONED_REPORTS_LOC = "<...>"
# Directory that is globbed for "*.csv" CheXpert label files.
CHEXPERT_LABELS_LOC = "<...>"

In [None]:
# Load the image-embedding table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only point
# IMG_EMB_LOC at a file from a trusted source.
df = pd.read_pickle(IMG_EMB_LOC)

# Keep one row per distinct "target" value (the frame has a single column, so
# de-duplicating on all columns is the same as de-duplicating on "target"),
# then write that list to disk.
target = df.loc[:, ["target"]].drop_duplicates()
target.to_csv("./target.csv")

In [None]:
# Isolate the embedding column and report its dtype and shape.
embeds = df[["embs"]]
print(embeds.dtypes, embeds.shape, sep="\n")

# Expand the per-row embedding vectors into a 2-D table.
# Assumes every "embs" entry is a 1-D vector of the same length, which gives
# one row per sample and one column per embedding dimension -- TODO confirm.
stacked_embs = np.stack(embeds["embs"].to_list(), axis=1)
embed_tmp = pd.DataFrame(stacked_embs.T)

# Keep a CSV copy of the expanded embeddings.
embed_tmp.to_csv("./embs_temp.csv")

In [None]:
# Project the embeddings onto their first two principal components.
# random_state is pinned because, with the default svd_solver="auto",
# scikit-learn may choose a randomized SVD solver for large inputs; without a
# fixed seed the projection could differ slightly from run to run.
PCA_RANDOM_STATE = 0

pca = PCA(n_components=2, random_state=PCA_RANDOM_STATE)
principal_components = pca.fit_transform(embed_tmp)

# Wrap the projected coordinates in a DataFrame with named columns; the row
# order matches embed_tmp.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=["principal component 1", "principal component 2"],
)

principal_df

In [None]:
# Share of the total variance explained by each of the two fitted components
# (scikit-learn's explained_variance_ratio_ attribute); shown as the cell output.
explained_variance = pca.explained_variance_ratio_
explained_variance

In [None]:
# Attach each row's study_id to its principal components.
# pd.concat(axis=1) aligns on index labels, and principal_df carries a fresh
# 0..n-1 index. Resetting df's index keeps the pairing positional even when the
# pickled frame has a non-default (filtered, shuffled or duplicated) index,
# which would otherwise mispair rows or introduce NaNs.
study_ids = df[["study_id"]].reset_index(drop=True)
final_df = pd.concat([principal_df, study_ids], axis=1)

# Plotting copy in which every row's study_id is overwritten with the constant
# "tar", so all points fall into a single group.
final_df_plot = final_df.assign(study_id="tar")

final_df_plot

In [None]:
# Scatter plot of the two principal components, one colour per study_id group.
fig, ax = plt.subplots(figsize=(8, 8))

ax.set_xlabel("Principal Component 1", fontsize=15)
ax.set_ylabel("Principal Component 2", fontsize=15)
ax.set_title("2 component PCA", fontsize=20)

targets = ["tar", "xxx", "ttt"]
colors = ["r", "g", "b"]

# The loop variable is deliberately not named `target`, so it does not
# overwrite the `target` DataFrame created when the embeddings were loaded.
for group_name, color in zip(targets, colors):
    group_mask = final_df_plot["study_id"] == group_name
    ax.scatter(
        final_df_plot.loc[group_mask, "principal component 1"],
        final_df_plot.loc[group_mask, "principal component 2"],
        c=color,
        s=40,
        # Bind the legend text to this artist explicitly instead of relying on
        # the positional order of a label list passed to ax.legend().
        label=group_name,
    )

ax.legend()
ax.grid()

Sectioned Reports

In [None]:
# Collect every sectioned-report CSV. glob returns paths in a
# filesystem-dependent order, so they are sorted to make the row order of the
# combined frame reproducible.
section_files = sorted(glob.glob(SECTIONED_REPORTS_LOC + "/*.csv"))

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not section_files:
    raise FileNotFoundError(
        f"No sectioned-report CSV files found in {SECTIONED_REPORTS_LOC!r}"
    )

# Files are read with header=None, so column names are assigned after stacking.
sectioned_df = pd.concat(
    [pd.read_csv(path, index_col=None, header=None) for path in section_files],
    axis=0,
    ignore_index=True,
)
sectioned_df.columns = ["study_id", "outcome"]

sectioned_df

In [None]:
# Inner-join the PCA coordinates with the sectioned-report outcomes; rows are
# matched on the shared "study_id" column.
merged_inner_df = final_df.merge(sectioned_df, how="inner", on="study_id")
merged_inner_df

CheXpert Labels

In [None]:
# Collect every CheXpert label CSV. glob returns paths in a
# filesystem-dependent order, so they are sorted to make the row order of the
# combined frame reproducible.
chex_files = sorted(glob.glob(CHEXPERT_LABELS_LOC + "/*.csv"))

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not chex_files:
    raise FileNotFoundError(
        f"No CheXpert label CSV files found in {CHEXPERT_LABELS_LOC!r}"
    )

# Each file is read with its own header row; the frames are stacked row-wise
# under a fresh 0..n-1 index.
chex_df = pd.concat(
    [pd.read_csv(path, index_col=None) for path in chex_files],
    axis=0,
    ignore_index=True,
)
chex_df

In [None]:
# Inner-join the PCA coordinates with the CheXpert labels: final_df's
# "study_id" is matched against chex_df's "mimic_id". This replaces any
# earlier value of merged_inner_df.
merged_inner_df = final_df.merge(
    chex_df, how="inner", left_on="study_id", right_on="mimic_id"
)
merged_inner_df

In [None]:
# Scatter plot of the two principal components, one colour per "cat" value.
fig, ax = plt.subplots(figsize=(8, 8))

ax.set_xlabel("Principal Component 1", fontsize=15)
ax.set_ylabel("Principal Component 2", fontsize=15)
ax.set_title("2 component PCA", fontsize=20)

targets = ["Pleural Effusion", "Edema", "Pneumonia"]
colors = ["r", "g", "b"]

# The loop variable is deliberately not named `target`, so it does not
# overwrite the `target` DataFrame created when the embeddings were loaded.
for category, color in zip(targets, colors):
    category_mask = merged_inner_df["cat"] == category
    ax.scatter(
        merged_inner_df.loc[category_mask, "principal component 1"],
        merged_inner_df.loc[category_mask, "principal component 2"],
        c=color,
        s=40,
        # Bind the legend text to this artist explicitly instead of relying on
        # the positional order of a label list passed to ax.legend().
        label=category,
    )

ax.legend()
ax.grid()

In [None]:
# Export the study_id / category pairs from the merged frame to CSV.
# NOTE(review): the output is named "anon.csv" but still contains study_id --
# confirm this is the intended content.
id_cat_df = merged_inner_df.loc[:, ["study_id", "cat"]]
id_cat_df.to_csv("./anon.csv")

In [None]:
# end of script