In [1]:
# import statements
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Meds tensor analysis

In [5]:
# Diagnoses table: only the patient id, admission id and ICD-9 code columns are read.
diag = pd.read_csv(
    "data/raw/DIAGNOSES_ICD.csv",
    usecols=["SUBJECT_ID", "HADM_ID", "ICD9_CODE"],
)
diag

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,109,172335,40301
1,109,172335,486
2,109,172335,58281
3,109,172335,5855
4,109,172335,4254
...,...,...,...
651042,97503,188195,20280
651043,97503,188195,V5869
651044,97503,188195,V1279
651045,97503,188195,5275


In [7]:
# Prescriptions table: only the patient id, admission id and drug name columns are read.
meds = pd.read_csv(
    "data/raw/PRESCRIPTIONS.csv",
    usecols=["SUBJECT_ID", "HADM_ID", "DRUG"],
)
meds

Unnamed: 0,SUBJECT_ID,HADM_ID,DRUG
0,6,107064,Tacrolimus
1,6,107064,Warfarin
2,6,107064,Heparin Sodium
3,6,107064,D5W
4,6,107064,Furosemide
...,...,...,...
4156445,98887,121032,PredniSONE
4156446,98887,121032,Ipratropium Bromide Neb
4156447,98887,121032,HYDROmorphone (Dilaudid)
4156448,98887,121032,Docusate Sodium


In [11]:
# Summarise the prescriptions table: missing values per column, then how often
# each distinct DRUG string occurs (most frequent first).
missing_per_column = meds.isnull().sum()
drug_frequencies = meds["DRUG"].value_counts()

print("Missing values: ", missing_per_column)
print("\nValue counts for DRUGS: ", drug_frequencies)

Missing values:  SUBJECT_ID    0
HADM_ID       0
DRUG          0
dtype: int64

Value counts for DRUGS:  DRUG
Potassium Chloride      192993
Insulin                 143465
D5W                     142241
Furosemide              133122
0.9% Sodium Chloride    130147
                         ...  
Renaphro                     1
Morphine Sulfat              1
humulin R                    1
Meperidine PCA               1
rasagiline (Azilect)         1
Name: count, Length: 4525, dtype: int64


## Labs tensor analysis

In [18]:
# Lab events table: only the patient id, admission id, lab item id and
# abnormality flag columns are read.
labs = pd.read_csv(
    "data/raw/LABEVENTS.csv",
    usecols=["SUBJECT_ID", "HADM_ID", "ITEMID", "FLAG"],
)
labs

Unnamed: 0,SUBJECT_ID,HADM_ID,ITEMID,FLAG
0,3,,50820,
1,3,,50800,
2,3,,50802,
3,3,,50804,
4,3,,50808,abnormal
...,...,...,...,...
27854050,96443,103219.0,50882,
27854051,96443,103219.0,50885,abnormal
27854052,96443,103219.0,50902,
27854053,96443,103219.0,50911,


### Process labs dataframe

In [20]:
# Replace every missing FLAG with the label "normal", leaving existing labels untouched.
# NOTE(review): presumes a missing FLAG means the result was not abnormal; confirm
# against the dataset documentation.
flag_filled = labs["FLAG"].fillna("normal")
labs["FLAG"] = flag_filled
labs

Unnamed: 0,SUBJECT_ID,HADM_ID,ITEMID,FLAG
0,3,,50820,normal
1,3,,50800,normal
2,3,,50802,normal
3,3,,50804,normal
4,3,,50808,abnormal
...,...,...,...,...
27854050,96443,103219.0,50882,normal
27854051,96443,103219.0,50885,abnormal
27854052,96443,103219.0,50902,normal
27854053,96443,103219.0,50911,normal


In [None]:
# Keep only lab events that are tied to a hospital admission.
# HADM_ID is displayed as a float (e.g. 103219.0) while it still contains missing
# values; once those rows are gone, cast it to int64 so it matches the integer
# HADM_ID shown in the diagnoses and prescriptions tables.
labs = labs.dropna(subset=["HADM_ID"]).astype({"HADM_ID": "int64"})
labs

In [25]:
# Sanity check: count the missing values left in each column of the labs table.
remaining_missing = labs.isna().sum()
remaining_missing

SUBJECT_ID    0
HADM_ID       0
ITEMID        0
FLAG          0
dtype: int64

## Granite single-view visualisations

In [None]:
def save_scree_plot(weights, out_dir=None, filename="scree_lambdas.png"):
    """Save a scree plot of the component weights (λ), sorted in descending order.

    Parameters
    ----------
    weights : array-like
        Component weights. Flattened to 1-D before sorting.
    out_dir : str or path-like, optional
        Directory the image is written to; created if it does not exist.
        When omitted, the notebook-level ``OUT`` variable is used.
    filename : str, optional
        Name of the image file inside ``out_dir``.

    Returns
    -------
    None
        The figure is written to disk and then closed.
    """
    if out_dir is None:
        # NOTE(review): `OUT` is not defined in any cell visible in this notebook;
        # either define it before calling or pass `out_dir` explicitly.
        out_dir = OUT
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Largest weight first, so the x-axis is the component's rank.
    lam_sorted = np.sort(np.asarray(weights).ravel())[::-1]
    ranks = np.arange(1, lam_sorted.size + 1)

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        ax.plot(ranks, lam_sorted, marker='o')
        ax.set_xlabel('Component rank (r)')
        ax.set_ylabel('λ₍r₎')
        ax.set_title('Component weight distribution (λ) for single-tensor model')
        ax.grid(True)
        fig.tight_layout()
        fig.savefig(os.path.join(out_dir, filename), dpi=150)
    finally:
        # Release the figure even if saving fails, so repeated calls do not pile up figures.
        plt.close(fig)

## Granite multi-view visualisations

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# ---------- CONFIG ----------
# Directory the factor CSVs are read from; the metrics CSV is written here as well.
OUT_DIR = "granite3_out"  # set to "granite_out" or "granite3_out" as needed
# Magnitude cut-off: 0 < |value| < THRESH counts as near-zero, |value| >= THRESH as significant.
THRESH = 0.01

# ---------- HELPERS ----------
def load_if_exists(filename, out_dir=None):
    """Load a CSV from the output directory, or return None when it is absent.

    Parameters
    ----------
    filename : str
        File name relative to the output directory.
    out_dir : str or path-like, optional
        Directory to look in. Defaults to the notebook-level ``OUT_DIR``.

    Returns
    -------
    pandas.DataFrame or None
        The file's contents with its first column used as the index, or
        ``None`` if no such file exists.
    """
    base_dir = OUT_DIR if out_dir is None else out_dir
    path = os.path.join(base_dir, filename)

    if not os.path.isfile(path):
        # Say so explicitly: a silently skipped file is easy to miss in the report.
        print(f"Skipped: {filename} — not found in {base_dir}")
        return None

    df = pd.read_csv(path, index_col=0)
    print(f"Loaded: {filename} — shape {df.shape}")
    return df

def sparsity_report(name, df, thresh=None):
    """Print and return sparsity statistics for a factor matrix.

    An entry is "near-zero" when ``0 < |value| < thresh`` and "significant"
    when ``|value| >= thresh``. Per-topic figures are averages of the
    per-column counts.

    Parameters
    ----------
    name : str
        Label used in the printed report.
    df : pandas.DataFrame
        Numeric matrix; each column is treated as one topic.
    thresh : float, optional
        Magnitude cut-off. Defaults to the notebook-level ``THRESH``.

    Returns
    -------
    dict
        Keys: ``total_entries``, ``nonzero_entries``, ``near_zero_entries``,
        ``significant_entries``, ``pct_near_zero``, ``pct_significant``,
        ``avg_nonzero_per_topic``, ``avg_significant_per_topic``.
        Percentages and averages are NaN when ``df`` has no entries/columns.
    """
    if thresh is None:
        thresh = THRESH

    magnitudes = np.abs(df.to_numpy())
    nonzero_mask = magnitudes > 0
    significant_mask = magnitudes >= thresh

    total_entries = magnitudes.size
    nonzero_entries = nonzero_mask.sum()
    near_zero_entries = (nonzero_mask & (magnitudes < thresh)).sum()
    significant_entries = significant_mask.sum()

    def _mean_or_nan(counts):
        # Avoid numpy's "mean of empty slice" warning when there are no columns.
        return counts.mean() if counts.size else float("nan")

    def _percent(count):
        # Avoid a divide-by-zero warning on an empty matrix; the result stays NaN.
        return 100 * count / total_entries if total_entries else float("nan")

    avg_nonzero = _mean_or_nan(nonzero_mask.sum(axis=0))
    avg_significant = _mean_or_nan(significant_mask.sum(axis=0))
    pct_near_zero = _percent(near_zero_entries)
    pct_significant = _percent(significant_entries)

    print(f"\n[Sparsity: {name}]")
    print(f"  Total entries: {total_entries}")
    print(f"  Non-zero entries: {nonzero_entries}")
    print(f"  Near-zero (<{thresh}) entries: {near_zero_entries}")
    print(f"  Significant (≥{thresh}) entries: {significant_entries}")
    print(f"  % Near-zero: {pct_near_zero:.2f}%")
    print(f"  % Significant: {pct_significant:.2f}%")
    print(f"  Avg # non-zero entries per topic: {avg_nonzero:.2f}")
    print(f"  Avg # significant (≥{thresh}) per topic: {avg_significant:.2f}")

    return {
        "total_entries": total_entries,
        "nonzero_entries": nonzero_entries,
        "near_zero_entries": near_zero_entries,
        "significant_entries": significant_entries,
        "pct_near_zero": pct_near_zero,
        "pct_significant": pct_significant,
        "avg_nonzero_per_topic": avg_nonzero,
        "avg_significant_per_topic": avg_significant
    }

def diversity_report(name, df):
    """Print and return the mean pairwise cosine similarity between the columns of ``df``.

    Parameters
    ----------
    name : str
        Label used in the printed report.
    df : pandas.DataFrame
        Numeric matrix whose columns are compared with one another.

    Returns
    -------
    float
        Mean of the off-diagonal cosine similarities.
    """
    # Transpose so that each original column becomes one row vector; those rows
    # are what get compared pairwise.
    component_vectors = df.to_numpy().T
    pairwise = cosine_similarity(component_vectors)

    # Blank out the self-similarities on the diagonal so they are ignored by the mean.
    np.fill_diagonal(pairwise, np.nan)
    avg_similarity = np.nanmean(pairwise)

    print(f"\n[Diversity: {name}]")
    print(f"  Avg pairwise cosine similarity: {avg_similarity:.4f}")
    return avg_similarity

# ---------- MAIN ----------
print(f"--- ANALYSING: {OUT_DIR} ---")

# Display name -> factor file expected inside OUT_DIR. Missing files load as None.
mode_files = {
    "Patients": "patients.csv",
    "Diagnoses": "diagnoses.csv",
    "Drugs": "drugs.csv",
    "Labs": "labs.csv",
    "Notes": "notes.csv",
}
all_modes = {name: load_if_exists(fname) for name, fname in mode_files.items()}

results = {}

for name, df in all_modes.items():
    if df is None:
        continue  # file was not present; nothing to report for this mode
    s_metrics = sparsity_report(name, df)
    d_avg = diversity_report(name, df)
    s_metrics["avg_cosine_similarity"] = d_avg
    results[name] = s_metrics

# One row per mode, one column per metric.
out_df = pd.DataFrame(results).T
out_path = os.path.join(OUT_DIR, "sparsity_diversity_metrics.csv")

if results:
    out_df.to_csv(out_path)
    print(f"\nSaved: {out_path}")
else:
    # Nothing was loaded (e.g. OUT_DIR is missing); writing would fail or
    # produce an empty file, so skip it and say why.
    print(f"\nNo factor files found in {OUT_DIR}; nothing saved.")

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Directory where Granite3 outputs are stored
OUT_DIR = "granite3_out"


def _load_lambda(filename):
    """Read one λ CSV from OUT_DIR; a single-column file is collapsed to a Series."""
    # NOTE(review): if these files were written with an index column, read them
    # with index_col=0 instead; otherwise the frame keeps two columns and both
    # get plotted. Confirm against the code that writes the lambda_*.csv files.
    return pd.read_csv(os.path.join(OUT_DIR, filename)).squeeze("columns")


# Load the lambda values directly as Series
lambda_meds = _load_lambda("lambda_meds.csv")
lambda_labs = _load_lambda("lambda_labs.csv")
lambda_notes = _load_lambda("lambda_notes.csv")

# Plot
fig, ax = plt.subplots(figsize=(12, 6))
for lam, marker, label in (
    (lambda_meds, 'o', "Meds λ"),
    (lambda_labs, 's', "Labs λ"),
    (lambda_notes, '^', "Notes λ"),
):
    # Derive the component index from the data rather than assuming 30 components.
    x = np.arange(1, len(lam) + 1)
    ax.plot(x, lam, marker=marker, label=label)
ax.set_xlabel("Component index")
ax.set_ylabel("Component weight (λ)")
ax.set_title("Per-Modality Component Weights (Multi-view Tensor Model)")
ax.legend()
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, "lambda_weights_multiview.png"))
plt.show()